From e3686aadfcf268180c2961e3bd8409d882b9b601 Mon Sep 17 00:00:00 2001 From: Roman Fitzjalen Date: Sun, 2 Feb 2025 01:01:06 +0100 Subject: [PATCH 1/8] fix hostname -I for macOS #6497 BUGFIX for Apple Silicon hostname --- deepspeed/comm/comm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index df8e8022081d..90eb7e4a8ba5 100755 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -698,9 +698,13 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True) master_addr = None if rank == 0: import shlex - hostname_cmd = shlex.split("hostname -I") - result = subprocess.check_output(hostname_cmd) - master_addr = result.decode('utf-8').split()[0] + try: + hostname_cmd = shlex.split("hostname -I") + result = subprocess.check_output(hostname_cmd) + master_addr = result.decode('utf-8').split()[0] + except subprocess.CalledProcessError: # hostname -I not available (e.g. on macOS) + import socket + master_addr = socket.gethostbyname(socket.gethostname()) master_addr = comm.bcast(master_addr, root=0) # Determine local rank by assuming hostnames are unique From 4d11f93dc12a0a577a1ed697834c36325027b5ab Mon Sep 17 00:00:00 2001 From: Roman Fitzjalen Date: Wed, 5 Feb 2025 09:42:31 +0100 Subject: [PATCH 2/8] resolve macos hostname error; pre-commit-reqs --- deepspeed/comm/comm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 90eb7e4a8ba5..759ad7d7e089 100755 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -702,7 +702,7 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True) hostname_cmd = shlex.split("hostname -I") result = subprocess.check_output(hostname_cmd) master_addr = result.decode('utf-8').split()[0] - except subprocess.CalledProcessError: # hostname -I not available (e.g. on macOS) + except subprocess.CalledProcessError: # hostname -I not available (e.g. on macOS) import socket master_addr = socket.gethostbyname(socket.gethostname()) master_addr = comm.bcast(master_addr, root=0) From d31085658a37866802fed038886ca9bd9b6f9bac Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Tue, 4 Feb 2025 18:47:44 +0100 Subject: [PATCH 3/8] Allow NVIDIA Blackwell (#6991) NVIDIA Blackwell GPU generation has number 10. The SM code and architecture should be `100`, but the current code generates `1.`, because it expects a 2 characters string. This change modifies the logic to consider it as a string that contains a `.`, hence splits the string and uses the array of strings. Signed-off-by: Fabien Dupont --- op_builder/builder.py | 10 +++++----- op_builder/fp_quantizer.py | 2 +- op_builder/inference_core_ops.py | 2 +- op_builder/inference_cutlass_builder.py | 2 +- op_builder/ragged_ops.py | 2 +- op_builder/ragged_utils.py | 2 +- op_builder/transformer_inference.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index cdd11f00cabf..ac850775cb11 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -612,8 +612,8 @@ def compute_capability_args(self, cross_compile_archs=None): - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: - TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... - TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;9.0;10.0" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 9.0 10.0+PTX" pip install ... 
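For anyone who wants to exercise the macOS fallback from PATCH 1/8 outside of DeepSpeed, the snippet below is a minimal, self-contained sketch of the same pattern; the helper name `resolve_master_addr` is illustrative and not part of the DeepSpeed API.

```python
# Minimal sketch of the PATCH 1/8 fallback: prefer `hostname -I` where it is
# supported (Linux), and fall back to resolving the local hostname when the
# flag is rejected (e.g. on macOS / Apple Silicon).
import shlex
import socket
import subprocess


def resolve_master_addr() -> str:  # illustrative helper, not DeepSpeed API
    try:
        result = subprocess.check_output(shlex.split("hostname -I"))
        return result.decode("utf-8").split()[0]
    except subprocess.CalledProcessError:  # hostname -I not available (e.g. on macOS)
        return socket.gethostbyname(socket.gethostname())


if __name__ == "__main__":
    print(resolve_master_addr())
```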
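The reasoning in PATCH 3/8 is easiest to see on concrete compute-capability strings. The comparison below is an illustrative sketch (the function names are not DeepSpeed's): the pre-patch logic indexes individual characters and silently produces `1.` for Blackwell's `10.0`, while the post-patch logic splits on `.` and handles two-digit majors plus the optional `+PTX` suffix.

```python
# Illustrative comparison of the old and new compute-capability parsing.
def old_num(cc: str) -> str:
    # Pre-patch: assumes a single-digit major version, e.g. "8.6" -> "86".
    return cc[0] + cc[2]


def new_num(cc: str) -> str:
    # Post-patch: split on ".", strip an optional "+PTX" suffix, e.g. "10.0" -> "100".
    major, minor = cc.split(".")
    return major + minor.split("+")[0]


for cc in ["8.6", "9.0", "10.0", "10.0+PTX"]:
    print(f"{cc:>9}: old={old_num(cc)!r:6} new={new_num(cc)!r}")
# "10.0" yields old='1.' (broken) vs new='100' (the correct SM code for Blackwell).
```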
- `cross_compile_archs` uses ; separator. @@ -651,9 +651,9 @@ def compute_capability_args(self, cross_compile_archs=None): args = [] self.enable_bf16 = True for cc in ccs: - num = cc[0] + cc[2] + num = cc[0] + cc[1].split('+')[0] args.append(f'-gencode=arch=compute_{num},code=sm_{num}') - if cc.endswith('+PTX'): + if cc[1].endswith('+PTX'): args.append(f'-gencode=arch=compute_{num},code=compute_{num}') if int(cc[0]) <= 7: @@ -666,7 +666,7 @@ def filter_ccs(self, ccs: List[str]): Prune any compute capabilities that are not compatible with the builder. Should log which CCs have been pruned. """ - return ccs + return [cc.split('.') for cc in ccs] def version_dependent_macros(self): # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 diff --git a/op_builder/fp_quantizer.py b/op_builder/fp_quantizer.py index daa41a8148f5..e42927bd065d 100644 --- a/op_builder/fp_quantizer.py +++ b/op_builder/fp_quantizer.py @@ -78,7 +78,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 8: ccs_retained.append(cc) else: diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py index f7c0b47f92c6..b6665ebb7618 100755 --- a/op_builder/inference_core_ops.py +++ b/op_builder/inference_core_ops.py @@ -46,7 +46,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: diff --git a/op_builder/inference_cutlass_builder.py b/op_builder/inference_cutlass_builder.py index aa5294b1cbda..a4a607288ca8 100644 --- a/op_builder/inference_cutlass_builder.py +++ b/op_builder/inference_cutlass_builder.py @@ -45,7 +45,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 8: # Only support Ampere and newer ccs_retained.append(cc) diff --git a/op_builder/ragged_ops.py b/op_builder/ragged_ops.py index 10afb193c738..0df28cc2282a 100644 --- a/op_builder/ragged_ops.py +++ b/op_builder/ragged_ops.py @@ -46,7 +46,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 8: # Blocked flash has a dependency on Ampere + newer ccs_retained.append(cc) diff --git a/op_builder/ragged_utils.py b/op_builder/ragged_utils.py index 654ba07e0879..208c9f833ebe 100755 --- a/op_builder/ragged_utils.py +++ b/op_builder/ragged_utils.py @@ -46,7 +46,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py index 1b056ecef3ed..642aed56a192 100755 --- a/op_builder/transformer_inference.py +++ b/op_builder/transformer_inference.py @@ -44,7 +44,7 @@ def is_compatible(self, verbose=False): def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: From 3970bf1a869a4285eeda738b1eda63a46823a062 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 4 Feb 2025 19:56:50 -0500 Subject: [PATCH 4/8] Update 
GH org references (#6998) Signed-off-by: Olatunji Ruwase Signed-off-by: Logan Adams Signed-off-by: Fabien Dupont Co-authored-by: Fabien Dupont --- .../deepspeed_chat_bug_report.md | 2 +- .../ISSUE_TEMPLATE/inference_bug_report.md | 2 +- .github/workflows/nv-a6000.yml | 2 +- .github/workflows/nv-ds-chat.yml | 2 +- .github/workflows/nv-mii.yml | 2 +- CONTRIBUTING.md | 8 +- README.md | 68 +++++++-------- accelerator/real_accelerator.py | 2 +- benchmarks/README.md | 4 +- blogs/deepspeed-chat/README.md | 12 +-- blogs/deepspeed-chat/chinese/README.md | 14 +-- .../ds-chat-release-8-31/README.md | 86 +++++++++---------- blogs/deepspeed-chat/japanese/README.md | 12 +-- blogs/deepspeed-domino/README.md | 2 +- blogs/deepspeed-fastgen/2024-01-19/README.md | 36 ++++---- blogs/deepspeed-fastgen/README.md | 20 ++--- blogs/deepspeed-fastgen/chinese/README.md | 20 ++--- blogs/deepspeed-fastgen/japanese/README.md | 22 ++--- .../03-05-2024/README-Chinese.md | 6 +- blogs/deepspeed-fp6/03-05-2024/README.md | 6 +- blogs/deepspeed-gds/README.md | 2 +- blogs/deepspeed-gds/japanese/README.md | 2 +- blogs/deepspeed-offloadpp/README.md | 2 +- blogs/deepspeed-triton/README.md | 6 +- blogs/deepspeed-ucp/README.md | 4 +- blogs/deepspeed-ulysses/README.md | 2 +- blogs/deepspeed-ulysses/japanese/README.md | 2 +- .../10-03-2023/README-Chinese.md | 12 +-- .../10-03-2023/README-Japanese.md | 12 +-- .../deepspeed-visualchat/10-03-2023/README.md | 12 +-- blogs/deepspeed4science/chinese/README.md | 4 +- blogs/deepspeed4science/japanese/README.md | 4 +- blogs/intel-inference/README.md | 6 +- blogs/windows/08-2024/README.md | 2 +- blogs/windows/08-2024/japanese/README.md | 2 +- blogs/zeropp/japanese/README.md | 2 +- deepspeed/autotuning/README.md | 6 +- deepspeed/inference/engine.py | 2 +- deepspeed/inference/v2/engine_factory.py | 2 +- .../containers/features/meta_tensor.py | 2 +- deepspeed/module_inject/replace_module.py | 2 +- deepspeed/moe/sharded_moe.py | 2 +- .../bert_sparse_self_attention.py | 2 +- deepspeed/profiling/flops_profiler/README.md | 2 +- .../runtime/comm/coalesced_collectives.py | 4 +- deepspeed/runtime/zero/stage3.py | 2 +- docker/Dockerfile | 2 +- docs/_data/navigation.yml | 2 +- docs/_pages/deepspeed4science.md | 4 +- docs/_pages/inference.md | 4 +- docs/_posts/2020-05-19-bert-record.md | 2 +- .../2020-05-28-fastest-bert-training.md | 2 +- .../2020-09-08-sparse-attention-news.md | 2 +- docs/_posts/2020-09-09-ZeRO-Offload.md | 2 +- docs/_posts/2020-09-09-onebit-adam-news.md | 2 +- .../_posts/2020-09-09-pipeline-parallelism.md | 2 +- docs/_posts/2020-09-09-sparse-attention.md | 2 +- ...0-10-28-progressive-layer-dropping-news.md | 2 +- docs/_posts/2021-11-15-autotuning.md | 6 +- docs/_posts/2021-12-09-deepspeed-moe-nlg.md | 4 +- docs/_posts/2022-07-26-deepspeed-azure.md | 12 +-- docs/_posts/2022-09-10-zero-inference.md | 2 +- docs/_posts/2022-10-11-mii.md | 6 +- docs/_posts/2022-12-12-data-efficiency.md | 2 +- docs/_posts/2023-03-31-multi-modal.md | 2 +- .../2023-04-24-deepspeed-chat-chinese.md | 2 +- .../2023-04-24-deepspeed-chat-japanese.md | 2 +- docs/_posts/2023-04-24-deepspeed-chat.md | 2 +- docs/_posts/2023-06-22-zeropp-chinese.md | 2 +- docs/_posts/2023-06-22-zeropp-japanese.md | 2 +- docs/_posts/2023-08-24-ulysses-chinese.md | 2 +- docs/_posts/2023-08-24-ulysses-japanese.md | 2 +- docs/_posts/2023-08-24-ulysses.md | 2 +- docs/_posts/2023-09-12-ZeRO-Inference.md | 2 +- .../2023-09-19-deepspeed4science-chinese.md | 2 +- .../2023-09-19-deepspeed4science-japanese.md | 2 +- 
...2023-10-04-deepspeed-visualchat-chinese.md | 2 +- ...023-10-04-deepspeed-visualchat-japanese.md | 2 +- .../_posts/2023-10-04-deepspeed-visualchat.md | 2 +- .../2023-11-06-deepspeed-fastgen-chinese.md | 2 +- .../2023-11-06-deepspeed-fastgen-japanese.md | 2 +- docs/_posts/2023-11-06-deepspeed-fastgen.md | 2 +- .../accelerator-abstraction-interface.md | 4 +- docs/_tutorials/accelerator-setup-guide.md | 2 +- docs/_tutorials/advanced-install.md | 4 +- .../automatic-tensor-parallelism.md | 2 +- docs/_tutorials/autotuning.md | 10 +-- docs/_tutorials/azure.md | 4 +- docs/_tutorials/bert-finetuning.md | 2 +- docs/_tutorials/bert-pretraining.md | 6 +- docs/_tutorials/cifar-10.md | 2 +- docs/_tutorials/comms-logging.md | 2 +- docs/_tutorials/curriculum-learning.md | 10 +-- docs/_tutorials/data-efficiency.md | 12 +-- docs/_tutorials/deepnvme.md | 18 ++-- docs/_tutorials/domino.md | 2 +- docs/_tutorials/ds-sequence.md | 6 +- docs/_tutorials/flops-profiler.md | 2 +- docs/_tutorials/gan.md | 2 +- docs/_tutorials/inference-tutorial.md | 6 +- docs/_tutorials/large-models-w-deepspeed.md | 2 +- docs/_tutorials/megatron.md | 2 +- docs/_tutorials/mixed_precision_zeropp.md | 4 +- .../mixture-of-experts-inference.md | 2 +- docs/_tutorials/mixture-of-experts-nlg.md | 8 +- docs/_tutorials/mixture-of-experts.md | 8 +- docs/_tutorials/model-compression.md | 34 ++++---- docs/_tutorials/monitor.md | 2 +- docs/_tutorials/onebit-adam.md | 8 +- docs/_tutorials/onebit-lamb.md | 6 +- docs/_tutorials/sparse-attention.md | 16 ++-- docs/_tutorials/ulysses-offload.md | 6 +- docs/_tutorials/universal-checkpointing.md | 6 +- docs/_tutorials/zero-offload.md | 4 +- docs/_tutorials/zero-one-adam.md | 8 +- docs/_tutorials/zero.md | 4 +- docs/_tutorials/zeropp.md | 2 +- docs/code-docs/source/model-checkpointing.rst | 2 +- docs/contributing.md | 6 +- docs/index.md | 36 ++++---- examples/README.md | 4 +- op_builder/sparse_attn.py | 2 +- setup.py | 2 +- tests/unit/runtime/zero/test_zero.py | 10 +-- .../zero/test_zero_context_ancestry.py | 2 +- .../unit/sequence_parallelism/test_ulysses.py | 2 +- 126 files changed, 405 insertions(+), 405 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md index bf997775fe32..f27b1c6303eb 100644 --- a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md +++ b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md @@ -32,7 +32,7 @@ If applicable, add screenshots to help explain your problem. **System info (please complete the following information):** - OS: [e.g. Ubuntu 18.04] - GPU count and types [e.g. two machines with x8 A100s each] - - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using + - (if applicable) what [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) version are you using - (if applicable) Hugging Face Transformers/Accelerate/etc. versions - Python version - Any other relevant info about your setup diff --git a/.github/ISSUE_TEMPLATE/inference_bug_report.md b/.github/ISSUE_TEMPLATE/inference_bug_report.md index bc5df17258b0..8a4144ce049a 100644 --- a/.github/ISSUE_TEMPLATE/inference_bug_report.md +++ b/.github/ISSUE_TEMPLATE/inference_bug_report.md @@ -29,7 +29,7 @@ If applicable, add screenshots to help explain your problem. **System info (please complete the following information):** - OS: [e.g. Ubuntu 18.04] - GPU count and types [e.g. 
two machines with x8 A100s each] - - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using + - (if applicable) what [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) version are you using - (if applicable) Hugging Face Transformers/Accelerate/etc. versions - Python version - Any other relevant info about your setup diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index e48d9a7fffa2..5631d318d027 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -66,7 +66,7 @@ jobs: BRANCH="${{ github.event.inputs.mii_branch }}" fi echo "Cloning DeepSpeed-MII branch: $BRANCH" - git clone -b $BRANCH --depth=1 https://github.com/microsoft/DeepSpeed-MII.git + git clone -b $BRANCH --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[dev] cd tests diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 5d47519fe204..ec9b99b2c2db 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -54,7 +54,7 @@ jobs: BRANCH="${{ github.event.inputs.dse_branch }}" fi echo "DeepSpeedExamples Branch: $BRANCH" - git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git + git clone -b $BRANCH https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat pip install -r requirements.txt pip install -e . diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index a576e5933b08..a2397f347799 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -66,7 +66,7 @@ jobs: BRANCH="${{ github.event.inputs.mii_branch }}" fi echo "Cloning DeepSpeed-MII branch: $BRANCH" - git clone -b $BRANCH --depth=1 https://github.com/microsoft/DeepSpeed-MII.git + git clone -b $BRANCH --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[dev] unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f6e5f39869eb..e8e160269695 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ and then repeat the previous `git commit` command. ## Testing DeepSpeed tracks two types of tests: unit tests and more costly model convergence tests. The model convergence tests train -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) and measure +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) and measure end-to-end convergence and related metrics. Unit tests are found in `tests/unit/` and the model convergence tests are found in `tests/model/`. @@ -40,7 +40,7 @@ tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) an ### Model Tests To execute model tests, first [install DeepSpeed](#installation). The -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) repository is cloned +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) repository is cloned as part of this process. Next, execute the model test driver: ```bash cd tests/model/ @@ -85,8 +85,8 @@ Based on the issue we shall discuss the merit of the new feature and decide whet ### Step 2: implementation and verification Contributor will go ahead and implement the feature, and the DeepSpeed team will provide guidance/helps as needed. 
The required deliverables include: -* A PR to [microsoft/DeepSpeed](https://github.com/microsoft/DeepSpeed) including (1) the feature implementation (2) unit tests (3) documentation (4) tutorial -* A PR to [microsoft/DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) or [microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) including the examples of how to use the feature (this is related to the planned testing experiments in proposal) +* A PR to [deepspeedai/DeepSpeed](https://github.com/deepspeedai/DeepSpeed) including (1) the feature implementation (2) unit tests (3) documentation (4) tutorial +* A PR to [deepspeedai/DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) or [deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) including the examples of how to use the feature (this is related to the planned testing experiments in proposal) * In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance. After receiving the PRs, we will review them and merge them after necessary tests/fixes. diff --git a/README.md b/README.md index 03aba73cfa04..9f071c3f0a65 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/deepspeedai/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Downloads](https://static.pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) [![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) @@ -14,33 +14,33 @@ ## Latest News - DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). 
- -* [2024/12] [Ulysses-Offload: Democratizing Long Context LLM Training ](https://github.com/microsoft/DeepSpeed/blob/master/blogs/ulysses-offload/README.md) -* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md) -* [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] -* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)] -* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)] -* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) -* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) -* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) -* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] -* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] -* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White 
paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] + DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat). + +* [2024/12] [Ulysses-Offload: Democratizing Long Context LLM Training ](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/ulysses-offload/README.md) +* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md) +* [2024/08] [DeepSpeed on Windows](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] +* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)] +* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)] +* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] +* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) +* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) +* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] +* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience 
with Multi-Round Multi-Image Inputs](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] +* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
More news
@@ -48,7 +48,7 @@ # Extreme Speed and Scale for DL Training and Inference -***[DeepSpeed](https://www.deepspeed.ai/) enables world's most powerful language models like [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. With DeepSpeed you can: +***[DeepSpeed](https://www.deepspeed.ai/) enabled the world's most powerful language models (at the time of this writing) such as [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. With DeepSpeed you can: * Train/Inference dense or sparse models with billions or trillions of parameters * Achieve excellent system throughput and efficiently scale to thousands of GPUs @@ -86,21 +86,21 @@ In line with Microsoft's mission to solve humanity's most pressing challenges, t ## DeepSpeed Library - The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). + The [DeepSpeed](https://github.com/deepspeedai/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). ## Model Implementations for Inference (MII) - [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. + [Model Implementations for Inference (MII)](https://github.com/deepspeedai/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. 
Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. ## DeepSpeed on Azure - DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). + DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). --- # DeepSpeed Adoption -DeepSpeed is an important part of Microsoft’s new +DeepSpeed was an important part of Microsoft’s [AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/) initiative to enable next-generation AI capabilities at scale, where you can find more information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). 
@@ -135,14 +135,14 @@ DeepSpeed has been integrated with several different popular open-source DL fram | Description | Status | | ----------- | ------ | -| NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | -| AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | -| CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | -| Intel Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | -| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | -| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | -| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) 
[![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | -| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | +| NVIDIA | [![nv-torch110-p40](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-nightly.yml) | +| AMD | [![amd-mi200](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/amd-mi200.yml) | +| CPU | [![torch-latest-cpu](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/cpu-inference.yml) | +| Intel Gaudi | [![hpu-gaudi2](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/xpu-max1100.yml) | +| PyTorch Nightly | 
[![nv-torch-nightly-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | +| Integrations | [![nv-transformers-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-sd.yml) | +| Misc | [![Formatting](https://github.com/deepspeedai/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/deepspeedai/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/deepspeedai/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/python.yml) | | Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml/badge.svg?branch=main)](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml) | # Installation @@ -192,7 +192,7 @@ of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/). ## Windows -Many DeepSpeed features are supported on Windows for both training and inference. You can read more about this in the original blog post [here](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md). Among features that are currently not supported are async io (AIO) and GDS (which does not support Windows). +Many DeepSpeed features are supported on Windows for both training and inference. You can read more about this in the original blog post [here](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/windows/08-2024/README.md). Among features that are currently not supported are async io (AIO) and GDS (which does not support Windows). 1. Install PyTorch, such as pytorch 2.3+cu121. 2. Install Visual C++ build tools, such as VS2022 C++ x64/x86 build tools. 3. 
Launch Cmd console with Administrator permissions for creating required symlink folders and ensure MSVC tools are added to your PATH or launch the Developer Command Prompt for Visual Studio 2022 with administrator permissions. @@ -222,7 +222,7 @@ DeepSpeed welcomes your contributions! Please see our etc.
Thanks so much to all of our amazing contributors! - + diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index eb4e17850882..ac17fe2c67e5 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -169,7 +169,7 @@ def get_accelerator(): # ensuring that we are free from CUDA initialization errors. # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda # we won't try to do any CUDA calls when no device is available - # For reference: https://github.com/microsoft/DeepSpeed/pull/6810 + # For reference: https://github.com/deepspeedai/DeepSpeed/pull/6810 if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda accelerator_name = "cuda" except (RuntimeError, ImportError) as e: diff --git a/benchmarks/README.md b/benchmarks/README.md index 4c88b2dd091c..a2b332732042 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -2,5 +2,5 @@ If you are looking for DeepSpeed benchmarks, please see the following resources: -1. [Communication Benchmarking Suite](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/communication) -2. [Inference Benchmarks](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference) +1. [Communication Benchmarking Suite](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/communication) +2. [Inference Benchmarks](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference) diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md index 75b0841231a4..3ed8b284940a 100644 --- a/blogs/deepspeed-chat/README.md +++ b/blogs/deepspeed-chat/README.md @@ -65,7 +65,7 @@ DeepSpeed-RLHF system is capable of unparalleled efficiency at scale, making com *Table 2. Multi-Node 64x A100-80GB: Training Time and Corresponding Approximate Cost on Azure.* -> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details. +> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details. 
***Democratizing RLHF Training***: With just a single GPU, DeepSpeed-HE supports training models with over 13 billion parameters, enabling data scientists without access to multi-GPU systems to create not just toy RLHF models but large and powerful ones that can be used in real-world scenarios. @@ -95,7 +95,7 @@ We use an example of pretrained OPT-13B as the actor model and OPT-350M as the r ```python pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -285,7 +285,7 @@ This improvement in efficiency stems from DeepSpeed-HE’s ability to accelerate ## Effective Throughput and Scalability Analysis -***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of the RLHF training depends on the throughput that it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation while the RL training phase comprises of remaining 80% (see [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite having a small proportion, the former can take a large portion of the e2e time as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory bandwidth bound and difficult to achieve high throughput. In contrast, the RL training phase is compute bound running the reference actor model with just a couple of forward and backward passes with full 512 tokens from both prompt and generation per sample and can achieve good throughput. +***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of the RLHF training depends on the throughput that it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation while the RL training phase comprises of remaining 80% (see [benchmark settings](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite having a small proportion, the former can take a large portion of the e2e time as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory bandwidth bound and difficult to achieve high throughput. In contrast, the RL training phase is compute bound running the reference actor model with just a couple of forward and backward passes with full 512 tokens from both prompt and generation per sample and can achieve good throughput.
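To make the 20%/80% split above concrete, the sketch below combines the two phases with a simple compute-weighted model; the formula and the throughput numbers are assumptions for illustration, not the benchmark methodology used in this blog.

```python
# Back-of-the-envelope sketch (assumed model, not the official DeepSpeed-Chat
# benchmark methodology): combine per-phase throughputs weighted by their
# share of total compute. A phase that is slow per FLOP (generation) can
# dominate wall-clock time even with a small compute share.
def effective_throughput(gen_tflops: float, train_tflops: float, gen_frac: float = 0.2) -> float:
    train_frac = 1.0 - gen_frac
    # Time per unit of work = time spent in generation + time spent in RL training.
    return 1.0 / (gen_frac / gen_tflops + train_frac / train_tflops)


# Hypothetical numbers: bandwidth-bound generation vs compute-bound RL training.
print(effective_throughput(gen_tflops=30.0, train_tflops=120.0))  # ~75; generation takes ~half the e2e time
```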
@@ -320,13 +320,13 @@ As a result, for a given max global batch size, DeepSpeed-HE achieves the best t We are very excited to share that DeepSpeed-Chat is now open-sourced and available to the AI community. -* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future. +* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future. DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. * You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md index 7a45735b3f09..4ec29c6811e5 100644 --- a/blogs/deepspeed-chat/chinese/README.md +++ b/blogs/deepspeed-chat/chinese/README.md @@ -63,7 +63,7 @@ DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复 *表 2. 多节点 64x A100-80GB:训练时长及预估的 Azure 费用。*
-> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/microsoft/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。 +> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/deepspeedai/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。 ***实现 RLHF 训练的普及化***:仅凭单个 GPU,DeepSpeed-HE 就能支持训练超过 130 亿参数的模型。这使得那些无法使用多 GPU 系统的数据科学家和研究者不仅能够轻松创建轻量级的 RLHF 模型,还能创建大型且功能强大的模型,以应对不同的使用场景。 @@ -91,7 +91,7 @@ DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复 ``` pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -274,7 +274,7 @@ DeepSpeed-HE可以在训练和推理之间无缝更改模型分区,以支持 ## 有效吞吐量和可扩展性分析 -***(I) 有效吞吐量分析。*** 在 RLHF 训练的第 3 阶段,DeepSpeed-HE 的有效吞吐量取决于它在生成和 RL 训练阶段所实现的吞吐量。在我们的 RLHF (详见 [benchmarking setting](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md))中,生成阶段占总计算的约 20%,而 RL 训练阶段占剩余的 80%。然而,尽管比例较小,前者可能会占用大部分的端到端时间,因为它需要为每个生成的字符运行一次 actor 模型,使其受到内存带宽限制,难以实现高吞吐量。相比之下,RL 训练阶段是计算密集型的,仅需运行参考 actor 模型进行几次前向和后向传递,每个样本都有来自提示和生成的全部 512 个字符,可以实现良好的吞吐量。 +***(I) 有效吞吐量分析。*** 在 RLHF 训练的第 3 阶段,DeepSpeed-HE 的有效吞吐量取决于它在生成和 RL 训练阶段所实现的吞吐量。在我们的 RLHF (详见 [benchmarking setting](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md))中,生成阶段占总计算的约 20%,而 RL 训练阶段占剩余的 80%。然而,尽管比例较小,前者可能会占用大部分的端到端时间,因为它需要为每个生成的字符运行一次 actor 模型,使其受到内存带宽限制,难以实现高吞吐量。相比之下,RL 训练阶段是计算密集型的,仅需运行参考 actor 模型进行几次前向和后向传递,每个样本都有来自提示和生成的全部 512 个字符,可以实现良好的吞吐量。
@@ -308,13 +308,13 @@ DeepSpeed-HE 的核心技术基于 ZeRO,用于训练过程中将模型状态 我们非常高兴地宣布,DeepSpeed-Chat现已开源并向 AI 社区开放。 -* 如果你发现我们的成果对你有用或者喜欢我们的开源成果,请在 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)上点⭐。 +* 如果你发现我们的成果对你有用或者喜欢我们的开源成果,请在 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 和 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)上点⭐。 -* 请访问我们的DeepSpeed-Chat GitHub页面以开始使用:[GitHub 登陆页面](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* 请访问我们的DeepSpeed-Chat GitHub页面以开始使用:[GitHub 登陆页面](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* 我们将继续根据你的反馈和支持改进 DeepSpeed-Chat。我们的[计划图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)显示了当前支持的功能以及计划在未来支持的功能。 +* 我们将继续根据你的反馈和支持改进 DeepSpeed-Chat。我们的[计划图](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)显示了当前支持的功能以及计划在未来支持的功能。 DeepSpeed-Chat 是更大的DeepSpeed生态系统的一部分,包括众多深度学习系统和建模技术。要了解更多信息, @@ -322,4 +322,4 @@ DeepSpeed-Chat 是更大的DeepSpeed生态系统的一部分,包括众多深 * 我们会在[知乎](https://www.zhihu.com/people/deepspeed)上发布最新中文博客及动态。你还可以关注我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed) 和[日文 Twitter](https://twitter.com/MSFTDeepSpeedJP)。 -DeepSpeed 欢迎你的贡献!我们鼓励你在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面报告问题、贡献 PR 并参与讨论。请参阅我们的[贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)了解更多详情。我们愿意与大学、研究实验室、公司等进行合作,共同开展深度学习研究,将 DeepSpeed 应用于赋能现实世界的 AI 模型和应用等。对于此类需求(以及其他不适合在 GitHub 上提出的需求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +DeepSpeed 欢迎你的贡献!我们鼓励你在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面报告问题、贡献 PR 并参与讨论。请参阅我们的[贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)了解更多详情。我们愿意与大学、研究实验室、公司等进行合作,共同开展深度学习研究,将 DeepSpeed 应用于赋能现实世界的 AI 模型和应用等。对于此类需求(以及其他不适合在 GitHub 上提出的需求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 diff --git a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md index e1d4bf952bf1..828a77aa8431 100644 --- a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md +++ b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md @@ -24,7 +24,7 @@ # 1. Introduction -DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper. +DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper. 
We are happy to share that today we are improving DeepSpeed-Chat along three areas: i) system support for Llama/Llama-2 family of models, ii) system features for improved efficiency and accessibility, and iii) stability and software enhancements. @@ -33,15 +33,15 @@ We are happy to share that today we are improving DeepSpeed-Chat along three are We ***introduce system support for training Llama and Llama-2 models*** in DeepSpeed-Chat enabling and leveraging various optimizations and features including the Hybrid Engine, ZeRO family of optimizations, Low-Rank Adaptation (LoRA) support, as well as full integration into the three-stage DeepSpeed-Chat RLHF pipeline. By leveraging the Hybrid-Engine, we speed up the experience generation phase for Llama-2-7B and Llama-2-13B models by **up to 7.1X**. - **New System Features for Improved Efficiency and Accessibility** - - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs. + - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/deepspeedai/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs. - ***[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/)***. It is an optimization that offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources. After training stability fixes and testing, we have enabled this feature across all three stages of the DeepSpeed-Chat RLHF training pipeline. ZeRO-Offload reduces the minimum number of GPUs required to train large models by **up to 16x**. - **Stability and Software Enhancements** - - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. 
*To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/microsoft/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)*. + - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. *To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/deepspeedai/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)*. - - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml). + - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml). We now dive into the details of our new features, training stability, and software improvements. @@ -54,19 +54,19 @@ The DeepSpeed-Chat training framework now provides system support for the Llama The following key optimizations in DeepSpeed are now fully integrated for Llama and Llama-2 models: - **DeepSpeed-Chat Integration**: Fully integrated into the complete, end-to-end three-stage DeepSpeed-Chat RLHF training framework, based on the OpenAI InstructGPT training strategy. -- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models. -- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations including offload support leveraging full memory capacity of a system thus enabling training of even larger models. 
+- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models. +- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations including offload support leveraging full memory capacity of a system thus enabling training of even larger models. - **Mixed Precision ZeRO++ (MixZ++)**: Enhanced support for larger models like Llama-2-70B through the new MixZ++ feature, improving efficiency and reducing memory usage when there are frozen or non-trainable parameters. -- **LoRA**: Fully supported by the [LoRA](https://github.com/microsoft/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing original weights and learning pairs of rank-decomposition matrices. +- **LoRA**: Fully supported by the [LoRA](https://github.com/deepspeedai/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing original weights and learning pairs of rank-decomposition matrices. ## Getting Started Users looking to try the new Llama and Llama-2 model support can get started by using the newly added Llama scripts. | Step Number | Scripts | | --- | --- | -| 1 | [Llama-2 Step 1 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) | -| 2 | [Llama-2 Step 2 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) | -| 3 | [Llama-2 Step 3 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) | +| 1 | [Llama-2 Step 1 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) | +| 2 | [Llama-2 Step 2 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) | +| 3 | [Llama-2 Step 3 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) | *Note*: While all the system aspects of Llama and Llama-2 support have been extensively tested, there are no guarantees about training convergence and may require hyper-parameter tuning to achieve convergence. @@ -103,11 +103,11 @@ We now dive into the details of two new features we are introducing today: 1) Mi ## 3.3x Higher Throughput with MixZ++ for LoRA -Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. 
+Mixed Precision ZeRO++ ([MixZ++](https://github.com/deepspeedai/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. Similar to [ZeRO](https://www.deepspeed.ai/tutorials/zero/), MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them only when needed. In addition, similar to ZeRO++, MixZ++ allows for hierarchical partitioning and quantized communication. The hierarchical partitioning allows all the parameters to be stored within a node when possible so that the communication happens within a node, where communication bandwidth is significantly higher than communicating across nodes. The communication overhead is further reduced by quantizing the weights before gathering them. -Finally, unlike ZeRO++ where parameters are always stored in fp16/bf16, and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights in [Low-Rank Adaptation (LoRA)](https://github.com/microsoft/LoRA) in lower-precision, significantly reducing the communication overhead, eliminating quantization overhead, and supporting larger batch sizes that enable better efficiency. +Finally, unlike ZeRO++ where parameters are always stored in fp16/bf16, and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights in [Low-Rank Adaptation (LoRA)](https://github.com/deepspeedai/LoRA) in lower-precision, significantly reducing the communication overhead, eliminating quantization overhead, and supporting larger batch sizes that enable better efficiency. A comprehensive exploration of technical details can be accessed through our [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/), [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/), and [paper](https://arxiv.org/pdf/2306.10209.pdf). @@ -147,13 +147,13 @@ To try this feature, please refer to [MixZ++ tutorial](https://www.deepspeed.ai/
-ZeRO-Offload was [disabled](https://github.com/microsoft/DeepSpeedExamples/pull/553) +ZeRO-Offload was [disabled](https://github.com/deepspeedai/DeepSpeedExamples/pull/553) with the initial release of DeepSpeed-Chat due to training instability that was observed when it was used with Hybrid Engine and LoRA. After improvements to Hybrid Engine and LoRA as well as extensive testing of all feature configurations for ZeRO Stage 2 and ZeRO Stage 3, this feature can now be enabled across all three steps of the DeepSpeed-Chat training framework. Please note that configuring ZeRO-Offload with ZeRO Stage 2 and Hybrid Engine with LoRA disabled is currently unsupported due to observed training instability.
- *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).* + *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
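For readers who want to try the ZeRO-Offload path discussed above, a minimal configuration sketch follows. The `zero_optimization` fields are standard DeepSpeed config keys; the surrounding values (batch size, precision) are placeholders, not the exact settings used for these runs.

```python
# Minimal illustrative DeepSpeed config enabling ZeRO-Offload (optimizer state on the host CPU).
# Values are placeholders; the DeepSpeed-Chat scripts set these through their own flags.
ds_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,                    # ZeRO Stage 2 (Stage 3 is also supported)
        "offload_optimizer": {
            "device": "cpu",           # move optimizer states and computation to the CPU
            "pin_memory": True
        }
    }
}

# The engine would then be created as usual, e.g.:
# model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
#                                                      model_parameters=model.parameters(),
#                                                      config=ds_config)
```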
@@ -164,11 +164,11 @@ A wide range of issues have been addressed in the DeepSpeed runtime and the Deep
- *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).* + *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
-*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403). +*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403). We now dive into the details of all the fixes across different areas. @@ -178,13 +178,13 @@ In this section we discuss the functionality and training stability fixes in the - **Training Stability:** - - [PR #620 - Make training more stable](https://github.com/microsoft/DeepSpeedExamples/pull/620) + - [PR #620 - Make training more stable](https://github.com/deepspeedai/DeepSpeedExamples/pull/620) - To improve the training stability in Step 3, several different areas of training were tuned and changed. To start, the Kullback-Liebler (KL) divergence used in the Proximal Policy Optimization (PPO) trainer was slightly tuned to reduce divergence between the new and reference policies and improve the reward score. Next, the sequence generation function in the PPO trainer (`_generate_sequence()`) removed the specification of a `min_length` in the Actor model's `generate()` call, which means generated sequences won't be artificially enlarged, allowing for the possibility of sequence generation to collapse i.e. when training convergence is extremely poor. A minor off-by-one error was also fixed in the PPO trainer's reward computation function (`compute_rewards()`). Finally, the PPO trainer's RLHF training function was updated to zero out the reward and value after the end of a conversation to prevent incorrect `advantages` and `returns`. - - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/microsoft/DeepSpeedExamples/pull/633) + - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/deepspeedai/DeepSpeedExamples/pull/633) - - The [LoRA](https://github.com/microsoft/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate i.e. the LoRA learning rate was set to whatever the overall learning rate was. 
This led to instability in training convergence and can be seen in *Figure 7* below showing the reward score across training steps for various Step 3 configurations: + - The [LoRA](https://github.com/deepspeedai/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate i.e. the LoRA learning rate was set to whatever the overall learning rate was. This led to instability in training convergence and can be seen in *Figure 7* below showing the reward score across training steps for various Step 3 configurations:
@@ -204,25 +204,25 @@ In this section we discuss the functionality and training stability fixes in the The next fix details the addition of separate LoRA learning rate arguments. - - [PR ##685 Add LoRA LR for DS Chat steps 1-3](https://github.com/microsoft/DeepSpeedExamples/pull/685) + - [PR ##685 Add LoRA LR for DS Chat steps 1-3](https://github.com/deepspeedai/DeepSpeedExamples/pull/685) - A *separate* LoRA learning rate argument can now be provided in each of the three training steps, with Step 3 having individual LoRA learning rates for the Actor and Critic models. - **Bug Fixes:** - - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/microsoft/DeepSpeedExamples/pull/636) + - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/deepspeedai/DeepSpeedExamples/pull/636) - During DeepSpeed-Chat Step 3 training, we observed hangs when ZeRO Stage 3 was enabled for the actor model and when the `world_size > 1`. When observing the state of each rank, one rank would still be in the sequence generation phase `self._generate_sequence()`, while the other rank had already progressed to the `self.actor_model()` call. This ZeRO Stage 3 desynchronization, due to misaligned token generation between the GPUs, can normally be automatically detected and accounted for in the HuggingFace Transformers library via `synced_gpus`. However, due to the nature of the DeepSpeed-Chat pipeline and the lifetime of the corresponding model configuration objects, this automatic detection code was not triggered. To resolve this, when invoking the `generate()` function, the `synced_gpus` argument is explicitly passed and set to `True` when ZeRO Stage 3 is being used. - - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/microsoft/DeepSpeedExamples/pull/658) + - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/deepspeedai/DeepSpeedExamples/pull/658) - This fix allows Step 3 training to run with the combination of gradient checkpointing and *LoRA-only* parameter optimization, a previously unsupported training case. With the addition of the [enable_input_require_grads](https://github.com/huggingface/transformers/blob/f26099e7b5cf579f99a42bab6ddd371bf2c8d548/src/transformers/modeling_utils.py#L1225) model utility function in the HuggingFace Transformers library, which enables the gradients for the input embeddings, gradient checkpointing and optimization of *only* the LoRA parameters is made possible. - - [PR #576 - Fix argparse](https://github.com/microsoft/DeepSpeedExamples/pull/576) + - [PR #576 - Fix argparse](https://github.com/deepspeedai/DeepSpeedExamples/pull/576) - An external contributor helped in resolving an argument parsing issue. - - [PR #584 - Fix unused parameter bug](https://github.com/microsoft/DeepSpeedExamples/pull/584) + - [PR #584 - Fix unused parameter bug](https://github.com/deepspeedai/DeepSpeedExamples/pull/584) - An external contributor fixed the passing of an uninitialized parameter that was hardcoded earlier. @@ -230,11 +230,11 @@ In this section we discuss the functionality and training stability fixes in the ## Hybrid Engine Fixes In this section we discuss several fixes in the Hybrid Engine. 
-- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/microsoft/DeepSpeed/pull/3563) +- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/deepspeedai/DeepSpeed/pull/3563) - During Step 3 training for OPT with LoRA and Hybrid Engine enabled, an issue arose regarding a tensor size mismatch of the LoRA weights. Specifically, the LoRA QKV weights were not fused in the OPT container policy, yet they were expected to be fused by the Hybrid Engine. This challenge was effectively resolved by introducing both fused and unfused LoRA methods in the Hybrid Engine. We thank @sxjscience for providing this fix. -- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/microsoft/DeepSpeed/pull/3883) +- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/deepspeedai/DeepSpeed/pull/3883) - The Hybrid Engine was updated to properly check whether ZeRO Stage 3 was enabled when resetting the inference container parameters, along with expanding the corresponding unit tests. @@ -242,17 +242,17 @@ In this section we discuss several fixes in the Hybrid Engine. ## ZeRO Stage 3 Fixes In this section we discuss several fixes in support of the ZeRO Stage 3 feature. -- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/microsoft/DeepSpeed/pull/3819) +- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/deepspeedai/DeepSpeed/pull/3819) - A race condition in the the ZeRO `GatheredParameters` context, which resulted in various `'status': 'INFLIGHT'` issues, was fixed by removing duplicate input parameters that were being passed from the Hybrid Engine. -- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/microsoft/DeepSpeed/pull/3884) +- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/deepspeedai/DeepSpeed/pull/3884) - The ZeRO Stage 3 `InflightParamRegistry` was updated to use a separate `InflightParamRegistry` for training and evaluation, fixing an issue where leftover parameters in flight were causing inflight parameter errors. These fixes, along with related fixes in the Hybrid Engine, enabled the use of the ZeRO-Offload feature in the DeepSpeed-Chat training pipeline. -- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/microsoft/DeepSpeed/pull/3928) +- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/deepspeedai/DeepSpeed/pull/3928) - - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/microsoft/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point. + - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/deepspeedai/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point. # 5. Software Improvements @@ -263,9 +263,9 @@ To improve the characterization, ease of debug, and maintainability of the DeepS The DeepSpeed-Chat training framework provides a rich set of features (Hybrid Engine, ZeRO, LoRA, etc.) that can be composed in many different combinations, depending on the scenario. The interactions between the features are often complex and composing them in a systematic way for characterization is useful for understanding their behavior. 
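To make the feature-composition idea concrete, here is a hypothetical sweep sketch. The script name and flag spellings are illustrative only; the actual characterization scripts are the bash scripts linked just below.

```python
# Hypothetical sketch of sweeping feature combinations for Step-3 characterization.
# The real sweep scripts (linked below) are bash scripts with their own flag names;
# everything here (script name, flags) is illustrative only.
import itertools

zero_stages = [2, 3]
toggles = [False, True]  # reused for offload, LoRA, and Hybrid Engine

for stage, offload, lora, hybrid_engine in itertools.product(zero_stages, toggles, toggles, toggles):
    cmd = (
        f"bash run_step3_sweep.sh --zero-stage {stage} "
        f"--offload {str(offload).lower()} "
        f"--lora {str(lora).lower()} "
        f"--hybrid-engine {str(hybrid_engine).lower()}"
    )
    print(cmd)  # 2 x 2 x 2 x 2 = 16 feature combinations in total
```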
To support such use cases, characterization scripts have been added to run sweeps of Steps 1, 2, and 3 training for various combinations of features. The scripts default to OPT but can be modified to run with Llama. Please see the READMEs in the following folders for more details: -- [Step 1 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep) -- [Step 2 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep) -- [Step 3 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep) +- [Step 1 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep) +- [Step 2 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep) +- [Step 3 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep) For example, the Step 3 characterization script sweeps across various training features: | Feature | Values | @@ -286,13 +286,13 @@ The training log for each combination of features will be stored in a folder wit Related PRs: -- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/microsoft/DeepSpeedExamples/pull/638) -- [Add step 2 sweep script, clean up scripts](https://github.com/microsoft/DeepSpeedExamples/pull/664) -- [Update script location and docs for all 3 steps](https://github.com/microsoft/DeepSpeedExamples/pull/681) +- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/deepspeedai/DeepSpeedExamples/pull/638) +- [Add step 2 sweep script, clean up scripts](https://github.com/deepspeedai/DeepSpeedExamples/pull/664) +- [Update script location and docs for all 3 steps](https://github.com/deepspeedai/DeepSpeedExamples/pull/681) ## Instrumentation -To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/microsoft/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`. +To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/deepspeedai/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`. | Argument | Description | Step(s) | | --- | --- | --- | @@ -318,11 +318,11 @@ TensorBoard logging can be enabled in each of the three training steps, with som ## Testing -As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml). 
+As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml). | Description | Status | | ----------- | ------ | -| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) | +| Integrations | [![nv-ds-chat](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml) | The workflow is run on a **nightly** basis across a **16-case** test matrix (see table below), and uses the **facebook/opt-125m** model for both the actor and critic. @@ -338,15 +338,15 @@ Each configuration (16 total) runs through a limited number of Step 3 non-overfl # 6. Try Out DeepSpeed-Chat We are very excited to share this DeepSpeed-Chat feature and stability release. -* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future. +* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future. DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising of a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. * You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. 
Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. -* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md index 3ec570a9ea40..7bcbb61d630c 100644 --- a/blogs/deepspeed-chat/japanese/README.md +++ b/blogs/deepspeed-chat/japanese/README.md @@ -62,7 +62,7 @@ DeepSpeed-RLHFシステムは、大規模モデルの学習において類を見 *表2. 複数ノード(64x A100-80GB)を用いた場合の訓練時間とAzureでの概算実行コスト*
-> ***注意事項***: 上記の2つの表の数値は、訓練のステージ3のものです。DeepSpeed-RLHFが用いるデータセットと訓練の設定において、合計1.35億トークンを1エポックで訓練した際のスループットの実測値に基づいています。合計6750万のクエリートークン(配列長256の13万件のクエリー)と6750万の生成トークン(配列長256の13万件の回答)があり、ステップごとの最大グローバルバッチサイズは 50万 トークン(クエリーと回答それぞれ1024件)です。DeepSpeedRLHFを用いた場合のコストおよび実行時間の比較にあたっては、これらの詳細をよくご確認ください。さらに詳細な情報は[ベンチマーク設定](https://github.com/microsoft/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照ください。 +> ***注意事項***: 上記の2つの表の数値は、訓練のステージ3のものです。DeepSpeed-RLHFが用いるデータセットと訓練の設定において、合計1.35億トークンを1エポックで訓練した際のスループットの実測値に基づいています。合計6750万のクエリートークン(配列長256の13万件のクエリー)と6750万の生成トークン(配列長256の13万件の回答)があり、ステップごとの最大グローバルバッチサイズは 50万 トークン(クエリーと回答それぞれ1024件)です。DeepSpeedRLHFを用いた場合のコストおよび実行時間の比較にあたっては、これらの詳細をよくご確認ください。さらに詳細な情報は[ベンチマーク設定](https://github.com/deepspeedai/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照ください。 ***RLHFを誰もが利用できるように***: DeepSpeed-HEは、1台のGPUのみで130億以上のパラメーターを持つモデルの訓練を実行できます。複数のGPUを備えた高価な計算設備を持たないデータサイエンティストも、小規模なトイモデルではなく、実際のシナリオで使用できる大規模で強力なRLHFモデルを作成できます。 @@ -92,7 +92,7 @@ DeepSpeed-RLHFシステムは、大規模モデルの学習において類を見 ```python pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -279,7 +279,7 @@ DeepSpeed-RLHFは、Colossal-AIや、ネイティブのPyTorchを用いたHuggin ## 実効スループットとスケーラビリティ -***(I) 実効スループット分析*** RLHFのステージ3におけるDeepSpeed-HEの実効スループットは、生成フェーズと強化学習の訓練フェーズの両方のスループットで決まります。我々の作成したRLHFのパイプラインでは、生成フェーズが全計算量の約20%を占め、強化学習の訓練フェーズが残りの80%を占めています(詳細は[ベンチマークのページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照)。しかし、計算量で見た割合が少ないとはいえ、前者は生成された256個のトークンのそれぞれに対して、初期プロンプトの256個のトークンに対してアクターモデルによる推論をそれぞれ1回実行する必要があるため、end-to-endの時間で見ると、その大部分を占めることになり、メモリ帯域が制限されて高いスループットを得ることが難しくなります。一方、強化学習の訓練フェーズでは、1サンプルあたりプロンプトと生成の両方から512個のトークンをフルに使用して、参照アクターモデルについて、数回のフォワードパスとバックワードパスで実行できるため、高いスループットを達成できます。 +***(I) 実効スループット分析*** RLHFのステージ3におけるDeepSpeed-HEの実効スループットは、生成フェーズと強化学習の訓練フェーズの両方のスループットで決まります。我々の作成したRLHFのパイプラインでは、生成フェーズが全計算量の約20%を占め、強化学習の訓練フェーズが残りの80%を占めています(詳細は[ベンチマークのページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照)。しかし、計算量で見た割合が少ないとはいえ、前者は生成された256個のトークンのそれぞれに対して、初期プロンプトの256個のトークンに対してアクターモデルによる推論をそれぞれ1回実行する必要があるため、end-to-endの時間で見ると、その大部分を占めることになり、メモリ帯域が制限されて高いスループットを得ることが難しくなります。一方、強化学習の訓練フェーズでは、1サンプルあたりプロンプトと生成の両方から512個のトークンをフルに使用して、参照アクターモデルについて、数回のフォワードパスとバックワードパスで実行できるため、高いスループットを達成できます。
@@ -314,8 +314,8 @@ DeepSpeed-HEはトレーニングにZeROの技術を採用しているため、 DeepSpeed-ChatをオープンソースソフトウェアとしてAIコミュニティに公開できることを嬉しく思います。 -* DeepSpeed-Chatの[GitHubページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)を見て、早速使い始めましょう。 -* ユーザのみなさまからのフィードバックと協力で、これからも継続的に DeepSpeed-Chat を改善していく予定です。現在サポートされている機能や、将来的にサポートされている機能については、[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)をご覧ください。 +* DeepSpeed-Chatの[GitHubページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)を見て、早速使い始めましょう。 +* ユーザのみなさまからのフィードバックと協力で、これからも継続的に DeepSpeed-Chat を改善していく予定です。現在サポートされている機能や、将来的にサポートされている機能については、[ロードマップ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)をご覧ください。 # 7. DeepSpeedについて @@ -341,5 +341,5 @@ DeepSpeedについてのより詳しい情報は、以下をご覧ください DeepSpeedチームは、ユーザの方々からのフィードバックやご連絡を受け付けています。 -* ユーザのみなさまからのバグ報告、Pull request、さまざまな議論への参加は、[GitHub](https://github.com/microsoft/DeepSpeed/)で受け付けています。詳細については、[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)を確認してください。 +* ユーザのみなさまからのバグ報告、Pull request、さまざまな議論への参加は、[GitHub](https://github.com/deepspeedai/DeepSpeed/)で受け付けています。詳細については、[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)を確認してください。 * DeepSpeedチームでは、DeepSpeedを用いた深層学習の研究や実世界へのAIモデルやアプリケーションに関して、大学、研究所、企業との方々とのコラボレーションを行っています(日本語でコミュニケーション可能な研究員も在籍しています)。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については、deepspeed-info@microsoft.com まで直接メールをお送りください。 diff --git a/blogs/deepspeed-domino/README.md b/blogs/deepspeed-domino/README.md index 7dfdc7dac1c0..ce190ed1e459 100644 --- a/blogs/deepspeed-domino/README.md +++ b/blogs/deepspeed-domino/README.md @@ -181,7 +181,7 @@ Backward is a bit more challenging because backward computation graph is automat # Getting Started: Try out DeepSpeed-Domino -To try out DeepSpeed-Domino, please refer to [Domino tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in our DeepSpeedExample repo. +To try out DeepSpeed-Domino, please refer to [Domino tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in our DeepSpeedExample repo. ## Citation diff --git a/blogs/deepspeed-fastgen/2024-01-19/README.md b/blogs/deepspeed-fastgen/2024-01-19/README.md index a98c9856164c..108f3af23d45 100644 --- a/blogs/deepspeed-fastgen/2024-01-19/README.md +++ b/blogs/deepspeed-fastgen/2024-01-19/README.md @@ -20,7 +20,7 @@ # 1. Introduction -[DeepSpeed-FastGen](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) is an inference system framework that enables easy, fast, and affordable inference for large language models (LLMs). From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. DeepSpeed-FastGen utilizes the Dynamic SplitFuse technique to tackle the unique challenges of serving these applications and offer higher effective throughput than other state-of-the-art systems like vLLM. +[DeepSpeed-FastGen](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) is an inference system framework that enables easy, fast, and affordable inference for large language models (LLMs). 
From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. DeepSpeed-FastGen utilizes the Dynamic SplitFuse technique to tackle the unique challenges of serving these applications and offer higher effective throughput than other state-of-the-art systems like vLLM. Today, we are happy to share that we are improving DeepSpeed-FastGen along three areas: i) three new model families, ii) performance optimizations, and iii) feature enhancements: - **New Model Families** @@ -29,13 +29,13 @@ Today, we are happy to share that we are improving DeepSpeed-FastGen along three - **Performance Optimizations** - We drastically reduced the scheduling overhead of Dynamic SplitFuse and increased the efficiency of token sampling. As a result, we see higher throughput and lower latency, particularly when handling concurrent requests from many clients. We demonstrate the performance optimizations with benchmarks and evaluation of DeepSpeed-FastGen against vLLM for the newly added model families. The benchmark results can be seen in [Performance Evaluation](#performance-optimizations) and the benchmark code is available at [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference/mii). + We drastically reduced the scheduling overhead of Dynamic SplitFuse and increased the efficiency of token sampling. As a result, we see higher throughput and lower latency, particularly when handling concurrent requests from many clients. We demonstrate the performance optimizations with benchmarks and evaluation of DeepSpeed-FastGen against vLLM for the newly added model families. The benchmark results can be seen in [Performance Evaluation](#performance-optimizations) and the benchmark code is available at [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference/mii). - **Feature Enhancements** DeepSpeed-FastGen contains a rich set of features for running inference with many different model families and over 20,000 HuggingFace hosted models. We extend this feature set for all models to include a RESTful API, more generation options, and support for models using the safetensor checkpoint format. Additionally, we improve on overall stability and address bugs in our original DeepSpeed-FastGen release. -We now dive into the details of the new model families, performance optimizations, and software improvements. If you would like to get started right away please see [Try Out DeepSpeed-FastGen](#try-out-deepspeed-fastgen). This new release is available in [DeepSpeed versions >= 0.13.0](https://github.com/microsoft/DeepSpeed/tree/v0.13.0) and [DeepSpeed-MII versions >= 0.2.0](https://github.com/microsoft/DeepSpeed-MII/tree/v0.2.0). +We now dive into the details of the new model families, performance optimizations, and software improvements. If you would like to get started right away please see [Try Out DeepSpeed-FastGen](#try-out-deepspeed-fastgen). This new release is available in [DeepSpeed versions >= 0.13.0](https://github.com/deepspeedai/DeepSpeed/tree/v0.13.0) and [DeepSpeed-MII versions >= 0.2.0](https://github.com/deepspeedai/DeepSpeed-MII/tree/v0.2.0). # 2. New Model Families @@ -63,9 +63,9 @@ A closer examination of the architectural nuances within the Falcon series revea SplitFuse effectively enhances utilization by simultaneously computing prompts and decoding (generating tokens). 
However, we observed a significant overhead for scheduling ragged batching, especially when generating a large number of tokens from numerous concurrent requests. In this release, we've minimized this scheduling overhead for querying KV cache states. As a result, there's a notable improvement in the performance for scenarios with a large number of generation steps. -In general for long prompts and a smaller number of generated tokens, we can fully utilize the benefits of SplitFuse, which combines prompt processing and decoding (token generation) in a single forward pass. This provides a significant advantage over vLLM in these scenarios as shown in our [previous blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). For short prompts and a larger number of generated tokens, where most forward passes run purely for decoding, our highly optimized engine and the efficient scheduler for ragged batching demonstrate impressive performance. +In general for long prompts and a smaller number of generated tokens, we can fully utilize the benefits of SplitFuse, which combines prompt processing and decoding (token generation) in a single forward pass. This provides a significant advantage over vLLM in these scenarios as shown in our [previous blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen). For short prompts and a larger number of generated tokens, where most forward passes run purely for decoding, our highly optimized engine and the efficient scheduler for ragged batching demonstrate impressive performance. -We follow the benchmarking methodology we presented in our [previous blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#a-benchmarking-methodology). +We follow the benchmarking methodology we presented in our [previous blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen#a-benchmarking-methodology). *NOTE: All the benchmarks in this blog use the recommended DeepSpeed-FastGen persistent deployment mode.* @@ -124,54 +124,54 @@ In this section we introduce several feature enhancements that have been release ## Performance improvements We achieve a notable improvement in performance by minimizing the scheduling overhead for querying KV cache states as discussed in [Performance Optimizations](#performance-optimizations). -See [PR-4965](https://github.com/microsoft/DeepSpeed/pull/4965), [PR-377](https://github.com/microsoft/DeepSpeed-MII/pull/377) for more details. +See [PR-4965](https://github.com/deepspeedai/DeepSpeed/pull/4965), [PR-377](https://github.com/deepspeedai/DeepSpeed-MII/pull/377) for more details. ## Support for safetensor checkpoints Some HuggingFace-hosted model checkpoint weights are provided only in the safetensor format. We extend our HuggingFace checkpoint engine to work with the safetensor format to support even more models! -See [PR-4659](https://github.com/microsoft/DeepSpeed/pull/4659), [PR-296](https://github.com/microsoft/DeepSpeed-MII/pull/296) for more details. +See [PR-4659](https://github.com/deepspeedai/DeepSpeed/pull/4659), [PR-296](https://github.com/deepspeedai/DeepSpeed-MII/pull/296) for more details. ## Added RESTful API -We add the option to automatically stand up a RESTful API when creating DeepSpeed-FastGen persistent deployments in DeepSpeed-MII. This API provides a way for users to send prompts to their deployments and receive responses using HTTP POST methods and tools like `curl` or python's `request` package. 
The RESTful API provides the same high throughput and low latency performance as our python APIs. For more information, please see [MII RESTful API](https://github.com/microsoft/DeepSpeed-MII#restful-api). +We add the option to automatically stand up a RESTful API when creating DeepSpeed-FastGen persistent deployments in DeepSpeed-MII. This API provides a way for users to send prompts to their deployments and receive responses using HTTP POST methods and tools like `curl` or python's `request` package. The RESTful API provides the same high throughput and low latency performance as our python APIs. For more information, please see [MII RESTful API](https://github.com/deepspeedai/DeepSpeed-MII#restful-api). -See [PR-348](https://github.com/microsoft/DeepSpeed-MII/pull/348), [PR-328](https://github.com/microsoft/DeepSpeed-MII/pull/328), [PR-294](https://github.com/microsoft/DeepSpeed-MII/pull/294) for more details. +See [PR-348](https://github.com/deepspeedai/DeepSpeed-MII/pull/348), [PR-328](https://github.com/deepspeedai/DeepSpeed-MII/pull/328), [PR-294](https://github.com/deepspeedai/DeepSpeed-MII/pull/294) for more details. ## Added deployment and generate options -We extend the customizability of DeepSpeed-FastGen deployments and text-generation. Users can now specify a `device_map` when creating non-persistent pipelines and persistent deployments that controls which GPUs to use for hosting a model. Additionally, the interfaces between pipelines and deployments now match and include options for setting top-p, top-k, and temperature values. For additional information about the user-exposed options, please see [MII Pipeline](https://github.com/microsoft/DeepSpeed-MII#non-persistent-pipeline) and [MII Deployment](https://github.com/microsoft/DeepSpeed-MII#persistent-deployment). +We extend the customizability of DeepSpeed-FastGen deployments and text-generation. Users can now specify a `device_map` when creating non-persistent pipelines and persistent deployments that controls which GPUs to use for hosting a model. Additionally, the interfaces between pipelines and deployments now match and include options for setting top-p, top-k, and temperature values. For additional information about the user-exposed options, please see [MII Pipeline](https://github.com/deepspeedai/DeepSpeed-MII#non-persistent-pipeline) and [MII Deployment](https://github.com/deepspeedai/DeepSpeed-MII#persistent-deployment). -See [PR-331](https://github.com/microsoft/DeepSpeed-MII/pull/331), [PR-280](https://github.com/microsoft/DeepSpeed-MII/pull/280), [PR-275](https://github.com/microsoft/DeepSpeed-MII/pull/275), [PR-268](https://github.com/microsoft/DeepSpeed-MII/pull/268), [PR-295](https://github.com/microsoft/DeepSpeed-MII/pull/295), for more details. +See [PR-331](https://github.com/deepspeedai/DeepSpeed-MII/pull/331), [PR-280](https://github.com/deepspeedai/DeepSpeed-MII/pull/280), [PR-275](https://github.com/deepspeedai/DeepSpeed-MII/pull/275), [PR-268](https://github.com/deepspeedai/DeepSpeed-MII/pull/268), [PR-295](https://github.com/deepspeedai/DeepSpeed-MII/pull/295), for more details. ## Mitigate risk of deadlock In use cases where many prompts are sent to a deployment in a small time window, deadlock can occur in the DeepSpeed-FastGen inference engine, resulting in no text-generation progress is made on any prompts. To mitigate this, we ensure that there is a sufficient margin in the KV cache when scheduling requests. 
While not completely resolved, we continue to investigate a fix for these situations that arrive when the deployment is under heavy load. -See [PR-274](https://github.com/microsoft/DeepSpeed-MII/pull/274) for more details. +See [PR-274](https://github.com/deepspeedai/DeepSpeed-MII/pull/274) for more details. ## Inference Checkpoints We add the capability to create inference engine snapshots to DeepSpeed-FastGen. This reduces the loading time for large models in future deployments. -See [PR-4664](https://github.com/microsoft/DeepSpeed/pull/4664) for more details. +See [PR-4664](https://github.com/deepspeedai/DeepSpeed/pull/4664) for more details. ## General stability and bug fixes We include many bug fixes and stability improvements to DeepSpeed-FastGen. This includes fixing issues with some OPT model size variants, bugs with MII configuration options, and improved error messages. -See [PR-4938](https://github.com/microsoft/DeepSpeed/pull/4938), [PR-4920](https://github.com/microsoft/DeepSpeed/pull/4920), [PR-4739](https://github.com/microsoft/DeepSpeed/pull/4739), [PR-4694](https://github.com/microsoft/DeepSpeed/pull/4694), [PR-4634](https://github.com/microsoft/DeepSpeed/pull/4634), [PR-367](https://github.com/microsoft/DeepSpeed-MII/pull/367), [PR-350](https://github.com/microsoft/DeepSpeed-MII/pull/350), for more details. +See [PR-4938](https://github.com/deepspeedai/DeepSpeed/pull/4938), [PR-4920](https://github.com/deepspeedai/DeepSpeed/pull/4920), [PR-4739](https://github.com/deepspeedai/DeepSpeed/pull/4739), [PR-4694](https://github.com/deepspeedai/DeepSpeed/pull/4694), [PR-4634](https://github.com/deepspeedai/DeepSpeed/pull/4634), [PR-367](https://github.com/deepspeedai/DeepSpeed-MII/pull/367), [PR-350](https://github.com/deepspeedai/DeepSpeed-MII/pull/350), for more details. # 5. Community Engagement -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. 
-*We would like to recognize the contribution from our user community in adding support for the [Qwen](https://arxiv.org/abs/2309.16609) model family to DeepSpeed-FastGen in [PR-4913](https://github.com/microsoft/DeepSpeed/pull/4913).* +*We would like to recognize the contribution from our user community in adding support for the [Qwen](https://arxiv.org/abs/2309.16609) model family to DeepSpeed-FastGen in [PR-4913](https://github.com/deepspeedai/DeepSpeed/pull/4913).* # 6. Try Out DeepSpeed-FastGen We are very excited to share this DeepSpeed-FastGen release. -* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII) +* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, @@ -184,4 +184,4 @@ The following items are on our roadmap and we plan to engage with our community * Quantization support * New hardware backends through collaboration with partners -**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!** +**"Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) repositories if you like our work!** diff --git a/blogs/deepspeed-fastgen/README.md b/blogs/deepspeed-fastgen/README.md index e287af2540ed..2a5f547fb6bd 100644 --- a/blogs/deepspeed-fastgen/README.md +++ b/blogs/deepspeed-fastgen/README.md @@ -23,11 +23,11 @@ Large language models (LLMs) like GPT-4 and LLaMA have emerged as a dominant workload in serving a wide range of applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks like DeepSpeed, PyTorch, and several others can regularly achieve good hardware utilization during LLM training, the interactive nature of these applications and the poor arithmetic intensity of tasks like open-ended text generation have become the bottleneck for inference throughput in existing systems. -To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we provide detailed examples of how text generation works for LLMs in two distinct phases called prompt processing and generation. When systems treat them as distinct phases, generation will be preempted by prompt processing that risks breaking the service level agreements (SLAs). 
+To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we provide detailed examples of how text generation works for LLMs in two distinct phases called prompt processing and generation. When systems treat them as distinct phases, generation will be preempted by prompt processing that risks breaking the service level agreements (SLAs). Today, we are glad to present DeepSpeed-FastGen, a system that overcomes these limitations by leveraging the proposed Dynamic SplitFuse technique and offers up to 2.3x higher effective throughput compared to state-of-the-art systems like vLLM. DeepSpeed-FastGen leverages the combination of DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system. -**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release: +**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) release: ```bash pip install deepspeed-mii @@ -209,7 +209,7 @@ In addition to the deep analysis on A100, we provide additional benchmarking res ## 5. DeepSpeed-FastGen: Implementation and Usage -DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations. +DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
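The quick start and deployment description in the hunks around this point boil down to two usage patterns exposed through the DeepSpeed-MII API: a non-persistent pipeline for interactive use and a persistent server for long-running deployments. The sketch below is a minimal illustration of both patterns, assuming `deepspeed-mii` is installed and the GPU can hold the chosen checkpoint; the model name is only an illustrative placeholder and is not taken from the patch.

```python
import mii

# Non-persistent pipeline: the model lives only for the lifetime of this process.
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")  # placeholder model id
responses = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
print(responses)

# Persistent deployment: start a serving process, query it, then shut it down.
client = mii.serve("mistralai/Mistral-7B-v0.1")
responses = client.generate(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
print(responses)
client.terminate_server()  # stop the persistent deployment when done
```

A separate process can attach to an already running persistent deployment with `mii.client(...)` instead of `mii.serve(...)`, which is the design choice that lets the serving process outlive individual callers.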
@@ -219,7 +219,7 @@ DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://gith The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`. -Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/microsoft/DeepSpeed-MII). +Please follow our [Getting Started](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/deepspeedai/DeepSpeed-MII). ### A. Supported Models @@ -238,10 +238,10 @@ We currently support the following model architectures in this alpha release of All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer. -We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. +We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) and let us know. ### B. Deployment options -All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment: +All of the examples below are runnable in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii). Once installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment: #### Non-persistent pipeline @@ -280,20 +280,20 @@ client.terminate_server() ### C. Advanced Installation Information -For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source). +For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. 
However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/deepspeedai/DeepSpeed-Kernels#source). # 6. Try Out DeepSpeed-FastGen We are very excited to share this DeepSpeed-FastGen alpha release. -* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII) +* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. * You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs: @@ -302,7 +302,7 @@ The following items are on our roadmap and we plan to engage with our community - New hardware backends through collaboration with partners - Release performance benchmarks (used to generate plots in this blog) -**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!** +**"Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) repositories if you like our work!** # 7. 
Acknowledgements diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md index 1e92e4169450..357c70ca39eb 100644 --- a/blogs/deepspeed-fastgen/chinese/README.md +++ b/blogs/deepspeed-fastgen/chinese/README.md @@ -23,11 +23,11 @@ GPT-4 和 LLaMA 这样的大型语言模型(LLMs)已在各个层次上成为了集成 AI 的主流服务应用。从常规聊天模型到文档摘要,从自动驾驶到各个软件中的Copilot功能,这些模型的部署和服务需求正在迅速增加。像 DeepSpeed、PyTorch 和其他几个框架可以在 LLM 训练期间实现良好的硬件利用率。但它们在与用户互动及处理开放式文本生成等任务时,受限于这些操作的计算密集度相对较低,现有系统往往在推理吞吐量上遇到瓶颈。 -为了解决这一问题, [vLLM](https://arxiv.org/pdf/2309.06180.pdf) 这样由 PagedAttention 驱动的框架和 [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) 这样的系统显著提高了 LLM 推理的性能。然而,这些系统在面对长提示的工作负载时,依旧难以提供良好的服务质量。随着越来越多的模型(例如 [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b))和系统(例如[DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses))支持延伸到数万个令牌的上下文窗口,这些长提示工作负载变得越来越重要。为了更好地理解问题,我们在下文中提供了详细的示例来说明 LLM 的文本生成是如何在“提示处理”和“生成”的这两个阶段中工作的。当系统将它们视为不同的阶段时,生成阶段将被提示处理所抢占,这可能会破坏服务级别协议(SLAs)。 +为了解决这一问题, [vLLM](https://arxiv.org/pdf/2309.06180.pdf) 这样由 PagedAttention 驱动的框架和 [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) 这样的系统显著提高了 LLM 推理的性能。然而,这些系统在面对长提示的工作负载时,依旧难以提供良好的服务质量。随着越来越多的模型(例如 [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b))和系统(例如[DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses))支持延伸到数万个令牌的上下文窗口,这些长提示工作负载变得越来越重要。为了更好地理解问题,我们在下文中提供了详细的示例来说明 LLM 的文本生成是如何在“提示处理”和“生成”的这两个阶段中工作的。当系统将它们视为不同的阶段时,生成阶段将被提示处理所抢占,这可能会破坏服务级别协议(SLAs)。 今天,我们很高兴地介绍 DeepSpeed-FastGen 框架,它通过采用我们提出的动态 SplitFuse 技术,能够提供比vLLM 等先进系统高出多达 2.3 倍的有效吞吐量。DeepSpeed-FastGen 是 DeepSpeed-MII 和 DeepSpeed-Inference 的结合,提供了一个易于使用的服务系统。 -**快速开始:** 要使用 DeepSpeed-FastGen 只需安装最新的 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) 发行版: +**快速开始:** 要使用 DeepSpeed-FastGen 只需安装最新的 [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) 发行版: ```bash pip install deepspeed-mii @@ -207,7 +207,7 @@ DeepSpeed-FastGen 提供了副本级负载均衡,可以将请求均匀分布 ## 5. DeepSpeed-FastGen:软件实现与使用指南 -DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) 和 [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) 的协同组合,如下图所示。这两个软件包共同提供了系统的各个组成部分,包括前端 API、用于使用动态 SplitFuse 调度批次的主机和设备基础设施、优化的内核实现,以及构建新模型实现的工具。 +DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) 和 [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed) 的协同组合,如下图所示。这两个软件包共同提供了系统的各个组成部分,包括前端 API、用于使用动态 SplitFuse 调度批次的主机和设备基础设施、优化的内核实现,以及构建新模型实现的工具。
@@ -217,7 +217,7 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII 使用我们的 alpha 版 DeepSpeed-FastGen 最快的入门方式是:`pip install deepspeed-mii`。 -请按照我们的 [入门指南](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) 获取更多细节。如需使用和报告问题,请使用 [DeepSpeed-MII Github 仓库](https://github.com/microsoft/DeepSpeed-MII)。 +请按照我们的 [入门指南](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii) 获取更多细节。如需使用和报告问题,请使用 [DeepSpeed-MII Github 仓库](https://github.com/deepspeedai/DeepSpeed-MII)。 ### A. 支持的模型 @@ -233,10 +233,10 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII 所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。 -> 我们计划在最初发布后的几周和几个月内添加更多模型。如果您希望支持特定的模型架构,请[提交问题](https://github.com/microsoft/DeepSpeed-MII/issues)来让我们知道。 +> 我们计划在最初发布后的几周和几个月内添加更多模型。如果您希望支持特定的模型架构,请[提交问题](https://github.com/deepspeedai/DeepSpeed-MII/issues)来让我们知道。 ### B. 部署选项 -以下所有示例均可在 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) 中运行。安装后,您有两种部署方式:交互式非持久管道或持久化服务部署: +以下所有示例均可在 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) 中运行。安装后,您有两种部署方式:交互式非持久管道或持久化服务部署: #### 非持久管道 @@ -274,20 +274,20 @@ client.terminate_server() ### C. 高级安装方式 -为了使用方便并显著减少许多其他框架所需的冗长编译时间,我们通过名为 [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels) 的新库分发了覆盖我们大部分自定义内核的预编译 Python wheel。我们发现这个库在环境中非常便携,只要这些环境具有 NVIDIA GPU 计算能力 8.0+(Ampere+)、CUDA 11.6+ 和 Ubuntu 20+。在大多数情况下,您甚至不需要知道这个库的存在,因为它是 DeepSpeed-MII 的依赖项,并将自动与之一起安装。然而,如果您因任何原因需要手动编译我们的内核,请参阅我们的[高级安装文档](https://github.com/microsoft/DeepSpeed-Kernels#source)。 +为了使用方便并显著减少许多其他框架所需的冗长编译时间,我们通过名为 [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels) 的新库分发了覆盖我们大部分自定义内核的预编译 Python wheel。我们发现这个库在环境中非常便携,只要这些环境具有 NVIDIA GPU 计算能力 8.0+(Ampere+)、CUDA 11.6+ 和 Ubuntu 20+。在大多数情况下,您甚至不需要知道这个库的存在,因为它是 DeepSpeed-MII 的依赖项,并将自动与之一起安装。然而,如果您因任何原因需要手动编译我们的内核,请参阅我们的[高级安装文档](https://github.com/deepspeedai/DeepSpeed-Kernels#source)。 # 6. 
尝试 DeepSpeed-FastGen 我们非常高兴分享 DeepSpeed-FastGen 的首个 alpha 版本。 -* 要开始,请访问我们的 DeepSpeed-MII GitHub 页面: [GitHub 登陆页面](https://github.com/microsoft/DeepSpeed-MII) +* 要开始,请访问我们的 DeepSpeed-MII GitHub 页面: [GitHub 登陆页面](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGen 是更大的 DeepSpeed 生态系统的一部分,该生态系统包含了多种深度学习系统和建模技术。要了解更多, * 请访问我们的[网站](https://www.deepspeed.ai/),详细查看博客文章、教程和有用的文档。 * 您也可以通过我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed)、[日本 Twitter](https://twitter.com/MSFTDeepSpeedJP) 和[中文知乎](https://www.zhihu.com/people/deepspeed) 关注我们,以获取 DeepSpeed 的最新消息。 -DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上报告问题、贡献 PR,并参与讨论。有关更多详细信息,请参见我们的[贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们愿意与大学、研究实验室和公司合作,比如那些在深度学习研究上共同工作,应用 DeepSpeed 来赋能真实世界的 AI 模型和应用等。对于那些不适合在 GitHub 上提出的请求(以及其他请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上报告问题、贡献 PR,并参与讨论。有关更多详细信息,请参见我们的[贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们愿意与大学、研究实验室和公司合作,比如那些在深度学习研究上共同工作,应用 DeepSpeed 来赋能真实世界的 AI 模型和应用等。对于那些不适合在 GitHub 上提出的请求(以及其他请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 以下项目在我们的路线图上,我们计划通过我们的 GitHub 问题和 PR 与我们的社区在这些项目上进行交流: @@ -296,7 +296,7 @@ DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://git - 通过与合作伙伴的合作支持新硬件后端 - 发布性能测试套件(例如此博客中生成的图表) -如果您喜欢我们的工作,请为我们的 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) 仓库打上“星标”! +如果您喜欢我们的工作,请为我们的 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 和 [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) 仓库打上“星标”! # 7. 致谢 diff --git a/blogs/deepspeed-fastgen/japanese/README.md b/blogs/deepspeed-fastgen/japanese/README.md index 9729854afcf0..5e7c59893d84 100644 --- a/blogs/deepspeed-fastgen/japanese/README.md +++ b/blogs/deepspeed-fastgen/japanese/README.md @@ -24,14 +24,14 @@ AIを様々な目的に利用する幅広いアプリケーションで、GPT-4やLLaMAのような大規模言語モデル(LLM)が、主要なワークロードになってきています。一般的なチャットモデルから、文書の要約、自動運転、ソフトウェアスタックの各層におけるプログラミングの補助まで、これらのモデルを大規模に展開・提供する需要が急増しています。DeepSpeedやPyTorchをはじめとするフレームワークは、一般に、LLMの訓練では良好なハードウェアの利用効率を達成できるものの、オープンエンドのテキスト生成などの課題では、GPUなどのハードウェア上で一度に実行される計算量が少ないことが、既存システムにおいて推論スループットのボトルネックとなっています。 PagedAttentionを搭載した [vLLM](https://arxiv.org/pdf/2309.06180.pdf) や [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) のような既存システムは、こうした課題を解決するために設計され、LLMの推論性能を大幅に向上させました。しかしこれらのシステムは依然として、特に長いプロンプトを含むワークロードにおいて、一貫したサービス品質の提供という点で課題を残しています。 -数千トークンに及ぶコンテキストウィンドウをサポートするモデルやシステム、例えば [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) や [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) などが増えるにつれて、これらの長いプロンプトのワークロードはますます重要になってきています。 +数千トークンに及ぶコンテキストウィンドウをサポートするモデルやシステム、例えば [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) や [DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses) などが増えるにつれて、これらの長いプロンプトのワークロードはますます重要になってきています。 これらの問題をより深く理解するために、LLMによるテキスト生成がどのように機能するか説明します。LLMによるテキスト生成は、プロンプト処理と生成と呼ばれる2つの異なるフェーズから構成されます。システムがこれらを全く独立に扱うと、生成のフェーズは、プロンプト処理によって中断されることになります。その結果、システムのレイテンシなどを定めた SLA (Service Level Agreement) に違反する可能性が高くなります。 このブログで紹介するDeepSpeed-FastGenは、新たに提案するDynamic SplitFuse技術などを活用することでこうした課題を解決し、vLLMなどの最新の既存システムと比較して最大2.3倍の実効スループットを実現するシステムです。 DeepSpeed-FastGenは、DeepSpeed-MIIとDeepSpeed-Inferenceの組み合わせにより、使いやすいテキスト生成機能を実現します。 -**クイックスタート:** 
最新の[DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)をインストールするだけで、 DeepSpeed-FastGenを試すことができます。 +**クイックスタート:** 最新の[DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII)をインストールするだけで、 DeepSpeed-FastGenを試すことができます。 ```bash @@ -218,7 +218,7 @@ A100 GPUを用いた分析に加えて、H100とA6000を使用したベンチマ ## 5. DeepSpeed-FastGen: 実装と使い方 -DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)と[DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed)を融合的に組み合わせたものです。これらのソフトウェアパッケージは、フロントエンドAPI、Dynamic SplitFuseを使用してバッチをスケジュールするホストおよびデバイスインフラストラクチャ、最適化されたカーネル実装、新しいモデル実装を構築するためのツールなど、システムの様々なコンポーネントを提供します。 +DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII)と[DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed)を融合的に組み合わせたものです。これらのソフトウェアパッケージは、フロントエンドAPI、Dynamic SplitFuseを使用してバッチをスケジュールするホストおよびデバイスインフラストラクチャ、最適化されたカーネル実装、新しいモデル実装を構築するためのツールなど、システムの様々なコンポーネントを提供します。
@@ -228,7 +228,7 @@ DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-M DeepSpeed-FastGenのアルファリリースを使い始める最も簡単な方法は、 ``pip install deepspeed-mii`` を実行することです。 -詳細については、[Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii)ガイドを参照してください。使用法や問題の報告には、[DeepSpeed-MII Github リポジトリ](https://github.com/microsoft/DeepSpeed-MII)を使用してください。 +詳細については、[Getting Started](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii)ガイドを参照してください。使用法や問題の報告には、[DeepSpeed-MII Github リポジトリ](https://github.com/deepspeedai/DeepSpeed-MII)を使用してください。 ### A. 対応モデル @@ -240,11 +240,11 @@ DeepSpeed-FastGenのアルファリリースを使い始める最も簡単な方 現在のすべてのモデルは、モデルの重みとモデルに対応するトークナイザーの両方を提供するために、バックエンドで [HuggingFace](https://github.com/huggingface) を利用しています。 -初期リリース後の数週間と数ヶ月に追加のモデルを追加する予定です。サポートを希望する特定のモデルアーキテクチャがある場合は、[issue](https://github.com/microsoft/DeepSpeed-MII/issues) を登録してください。。 +初期リリース後の数週間と数ヶ月に追加のモデルを追加する予定です。サポートを希望する特定のモデルアーキテクチャがある場合は、[issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) を登録してください。。 ### B. デプロイメントのオプション -以下の例はすべて [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) で実行可能です。インストール後、デプロイメントのオプションとして、対話型の非永続パイプラインまたは永続的なサービス提供デプロイメントの2つのオプションがあります。 +以下の例はすべて [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) で実行可能です。インストール後、デプロイメントのオプションとして、対話型の非永続パイプラインまたは永続的なサービス提供デプロイメントの2つのオプションがあります。 #### 非永続パイプライン @@ -284,21 +284,21 @@ client.terminate_server() ### C. インストールの詳細情報 類似の他のプロジェクトでは、カスタムカーネルのコンパイルに非常に時間がかかることがよくあります。 -DeepSpeed-FastGenでは、このコンパイル時間を大幅に短縮し、利便性を向上するため、主要なカスタムカーネルの大部分を事前コンパイルしたPython wheelを、[DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels)という新しいライブラリを通じて配布しています。 +DeepSpeed-FastGenでは、このコンパイル時間を大幅に短縮し、利便性を向上するため、主要なカスタムカーネルの大部分を事前コンパイルしたPython wheelを、[DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels)という新しいライブラリを通じて配布しています。 このライブラリは、NVIDIA GPUのコンピュート能力が8.0以上(Ampere+)、CUDA 11.6以上、Ubuntu 20以上の環境で非常に移植性が高いことがわかっています。 -このライブラリは、DeepSpeed-MIIの依存関係としてインストールされるため、ほとんどの場合では、このライブラリの存在を知る必要はありません。しかし、何らかの理由でカーネルを手動でコンパイルする必要がある場合は、インストールに関する[詳細ドキュメント](https://github.com/microsoft/DeepSpeed-Kernels#source)をご覧ください。 +このライブラリは、DeepSpeed-MIIの依存関係としてインストールされるため、ほとんどの場合では、このライブラリの存在を知る必要はありません。しかし、何らかの理由でカーネルを手動でコンパイルする必要がある場合は、インストールに関する[詳細ドキュメント](https://github.com/deepspeedai/DeepSpeed-Kernels#source)をご覧ください。 # 6. 
DeepSpeed-FastGen を使ってみる このDeepSpeed-FastGenアルファリリースをユーザの皆さんと共有できることを非常に嬉しく思います。 -* 使用を始めるにあたっては、DeepSpeed-MIIのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeed-MII) +* 使用を始めるにあたっては、DeepSpeed-MIIのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGenは、Deep Learningシステムやモデリングテクノロジーを数多く含む、より大きなDeepSpeedエコシステムの一部です。さらに詳しい情報が必要な方は、 [詳細なブログ記事]、チュートリアル、役立つドキュメントがある私たちの [ウェブサイト](https://www.deepspeed.ai/) をご覧ください。 DeepSpeedの最新情報については、[英語のTwitter](https://twitter.com/MSFTDeepSpeed)、[日本語のTwitter](https://twitter.com/MSFTDeepSpeedJP)、[中国語の知乎](https://www.zhihu.com/people/deepspeed)をフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。[contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) にはより詳細な情報があります。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。[contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) にはより詳細な情報があります。 また、深層学習の研究や、実世界のAIモデルやアプリケーションへのDeepSpeedの適用に取り組む大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 以下の項目は、今後のロードマップです。GitHubの問題やPRを通じてコミュニティと協力して取り組む予定です: @@ -308,7 +308,7 @@ DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeed - パートナーとのコラボレーションによる新しいハードウェアバックエンド - ブログに掲載したプロットを生成するパフォーマンスベンチマークのリリース -このプロジェクトが気に入ったら、ぜひ [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) と [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) のリポジトリに "スター" をつけてください。 +このプロジェクトが気に入ったら、ぜひ [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) と [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) のリポジトリに "スター" をつけてください。 # 7. 謝辞 diff --git a/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md index 8273ff3a51a7..017e7c9ab4d8 100644 --- a/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md +++ b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md @@ -107,7 +107,7 @@ pip install qtorch 要使用我们的DeepSpeed-FP6进行基准测试,请访问以下脚本: ```bash -https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh +https://github.com/deepspeedai/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh ``` 也请访问[FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) 获取FP6的独立kernel。不要忘了给仓库加星标以表达您的支持! 
@@ -121,9 +121,9 @@ https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/ * 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。 * 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 -我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 -* 如果你喜欢我们的工作,请在[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/), [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)仓库“点赞”! +* 如果你喜欢我们的工作,请在[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/), [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) 和 [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/)仓库“点赞”! # 6. 致谢和贡献 diff --git a/blogs/deepspeed-fp6/03-05-2024/README.md b/blogs/deepspeed-fp6/03-05-2024/README.md index 0285dd79b87d..5f5b4700e403 100755 --- a/blogs/deepspeed-fp6/03-05-2024/README.md +++ b/blogs/deepspeed-fp6/03-05-2024/README.md @@ -108,7 +108,7 @@ pip install qtorch To benchmark with our DeepSpeed-FP6, please visit the following script: ```bash -https://github.com/microsoft/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh +https://github.com/deepspeedai/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh ``` Please also visit the [FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) for the standalone kernel of FP6. Don't forget to star the repo to show your support! @@ -122,9 +122,9 @@ Currently, DeepSpeed-FP6 supports only dense models with MoE models support upco * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. * Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. 
Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. -* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/microsoft/DeepSpeed-MII/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! # 6. Acknowledgments and Contributions diff --git a/blogs/deepspeed-gds/README.md b/blogs/deepspeed-gds/README.md index 84a97cb22eab..536b6f984af0 100644 --- a/blogs/deepspeed-gds/README.md +++ b/blogs/deepspeed-gds/README.md @@ -81,7 +81,7 @@ Figure 3: Using DeepNVMe to scale LLAMA3-70B token generation performance with N # Summary -In this blog post, we introduced DeepNVMe, an I/O optimization technology created to tackle the emergence of I/O operations as key bottlenecks of Deep Learning scalability. DeepNVMe enables fast and efficient data transfers between persistent storage and DL application memory through optimizations built on popular storage technologies such as NVMe SSDs and NVIDIA GDS. We showed benefits of using DeepNVMe for LLAMA3-70B token generation on single A100-80GB GPU with NVMe offloading, for which it achieves up to 7 tokens per second in generation throughput on an Azure NC96ads\_A100\_v4 VM. DeepNVMe will be open-sourced and generally available in DeepSpeed versions >= [0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0). In future blogs, we will report DeepNVMe improvements for other I/O bound DL applications such as model checkpointing and data loading. +In this blog post, we introduced DeepNVMe, an I/O optimization technology created to tackle the emergence of I/O operations as key bottlenecks of Deep Learning scalability. DeepNVMe enables fast and efficient data transfers between persistent storage and DL application memory through optimizations built on popular storage technologies such as NVMe SSDs and NVIDIA GDS. We showed benefits of using DeepNVMe for LLAMA3-70B token generation on single A100-80GB GPU with NVMe offloading, for which it achieves up to 7 tokens per second in generation throughput on an Azure NC96ads\_A100\_v4 VM. DeepNVMe will be open-sourced and generally available in DeepSpeed versions >= [0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0). In future blogs, we will report DeepNVMe improvements for other I/O bound DL applications such as model checkpointing and data loading. 
# Acknowlegements diff --git a/blogs/deepspeed-gds/japanese/README.md b/blogs/deepspeed-gds/japanese/README.md index 8d65d5225b16..26320d00ab94 100644 --- a/blogs/deepspeed-gds/japanese/README.md +++ b/blogs/deepspeed-gds/japanese/README.md @@ -70,7 +70,7 @@ LLAMA3-70Bモデルの推論を単一のNVIDIA A100-80GBで、プロンプト長 # まとめ -このブログ記事では、深層学習のスケーラビリティにおいて主要なボトルネックとなるI/O操作を最適化する、DeepNVMeを紹介しました。DeepNVMeは、NVMe SSDやNVIDIA GDSなどのストレージ技術に基づいた最適化を通じて、永続ストレージと深層学習アプリケーションのデータ転送を高速かつ効率的に実現します。Azure NC96ads_A100_v4 VMでの単一A100-80GB GPUを使用したLLAMA3-70Bトークン生成において、DeepNVMeを使用することで、NVMeオフロードで最大7トークン/秒の生成スループットを達成しました。DeepNVMeはオープンソース化され、DeepSpeedバージョン[0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0).以上で利用可能です。今後のブログでは、モデルチェックポイントやデータロードなどの他のI/Oがボトルネックとなる深層学習アプリケーションに対するDeepNVMeの改善について報告します。 +このブログ記事では、深層学習のスケーラビリティにおいて主要なボトルネックとなるI/O操作を最適化する、DeepNVMeを紹介しました。DeepNVMeは、NVMe SSDやNVIDIA GDSなどのストレージ技術に基づいた最適化を通じて、永続ストレージと深層学習アプリケーションのデータ転送を高速かつ効率的に実現します。Azure NC96ads_A100_v4 VMでの単一A100-80GB GPUを使用したLLAMA3-70Bトークン生成において、DeepNVMeを使用することで、NVMeオフロードで最大7トークン/秒の生成スループットを達成しました。DeepNVMeはオープンソース化され、DeepSpeedバージョン[0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0).以上で利用可能です。今後のブログでは、モデルチェックポイントやデータロードなどの他のI/Oがボトルネックとなる深層学習アプリケーションに対するDeepNVMeの改善について報告します。 # 謝辞 diff --git a/blogs/deepspeed-offloadpp/README.md b/blogs/deepspeed-offloadpp/README.md index 62587354309c..f58173b7bc8b 100644 --- a/blogs/deepspeed-offloadpp/README.md +++ b/blogs/deepspeed-offloadpp/README.md @@ -43,7 +43,7 @@ We conduct our performance evaluations over both A100 and H100 DGX machine and t ## Tutorials -Examples and Tutorials are [here](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md) +Examples and Tutorials are [here](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md) ## Contributors: diff --git a/blogs/deepspeed-triton/README.md b/blogs/deepspeed-triton/README.md index 91803f3faf2f..57922c5e1a23 100644 --- a/blogs/deepspeed-triton/README.md +++ b/blogs/deepspeed-triton/README.md @@ -65,7 +65,7 @@ We use an example of Bert-base here. ```python pip install deepspeed[triton] -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/inference/huggingface/fill-mask deepspeed --num_gpus 1 test-bert.py --triton @@ -76,7 +76,7 @@ To run a performance benchmark, you can use the following command: ```python pip install deepspeed[triton] -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/benchmarks/inference deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype fp16 --kernel-inject --deepspeed --graphs --triton @@ -84,7 +84,7 @@ deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype # NOTE -* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/microsoft/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation. +* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/deepspeedai/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation. * This feature is currently only supported for BERT, Roberta and other BERT-like models, and not for text-generation models yet. 
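The Bert examples in the deepspeed-triton hunks above toggle the Triton kernels from the command line via `--triton`; inside a script the same feature is reached through DeepSpeed's inference initialization. The sketch below is a non-authoritative illustration only: the `use_triton` keyword and the fill-mask model are assumptions for illustration and should be checked against the deepspeed-triton README referenced above.

```python
import torch
import deepspeed
from transformers import pipeline

# Build an ordinary HuggingFace fill-mask pipeline, then swap in DeepSpeed's
# optimized inference engine with the (assumed) Triton kernel flag enabled.
pipe = pipeline("fill-mask", model="bert-base-cased", device=0)
pipe.model = deepspeed.init_inference(
    pipe.model,
    dtype=torch.float16,
    replace_with_kernel_inject=True,  # inject DeepSpeed's fused inference kernels
    use_triton=True,                  # assumed flag name for the Triton kernel path
)
print(pipe("Paris is the [MASK] of France."))
```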
diff --git a/blogs/deepspeed-ucp/README.md b/blogs/deepspeed-ucp/README.md index abd5b4ca017d..f8b5c6fb87e7 100644 --- a/blogs/deepspeed-ucp/README.md +++ b/blogs/deepspeed-ucp/README.md @@ -245,8 +245,8 @@ iteration 501 with a new Target TP=PP=DP=2. We are excited to release DeepSpeed Universal Checkpoint. DeepSpeed Universal Checkpoint is available in DeepSpeed versions >= -[0.14.4](https://github.com/microsoft/DeepSpeed/releases/tag/v0.14.4), -has been fully integrated with [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) ([commit c3a13be](https://github.com/microsoft/Megatron-DeepSpeed/commit/c3a13be721da0d0de16c338d0d665b0f7d13d14f)). +[0.14.4](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.14.4), +has been fully integrated with [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) ([commit c3a13be](https://github.com/deepspeedai/Megatron-DeepSpeed/commit/c3a13be721da0d0de16c338d0d665b0f7d13d14f)). Detailed tutorial on usage is available on [DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/universal-checkpointing/). diff --git a/blogs/deepspeed-ulysses/README.md b/blogs/deepspeed-ulysses/README.md index 375eb1190325..8ed573916ff4 100644 --- a/blogs/deepspeed-ulysses/README.md +++ b/blogs/deepspeed-ulysses/README.md @@ -367,4 +367,4 @@ on X, formerly Twitter, ([English](https://twitter.com/MSFTDeepSpeed), [Japanese We are open to collaborations with universities, research labs, and companies. For such requests (and other requests unsuitable for GitHub), please directly email to . If you like -our work, please "Star" our [repo](https://github.com/microsoft/DeepSpeed). +our work, please "Star" our [repo](https://github.com/deepspeedai/DeepSpeed). diff --git a/blogs/deepspeed-ulysses/japanese/README.md b/blogs/deepspeed-ulysses/japanese/README.md index 88a0e375ce70..1d9ed1e3a1d2 100644 --- a/blogs/deepspeed-ulysses/japanese/README.md +++ b/blogs/deepspeed-ulysses/japanese/README.md @@ -155,4 +155,4 @@ DeepSpeed-Ulyssesは、DeepSpeedのGitHubを通じてアクセス可能です。 長いコンテキストを扱う際の制約を取り除くことによって何が可能になるのか、ユーザの皆様と共に様々な可能性を探求するため、幅広い協力やコラボレーションを歓迎します。DeepSpeed-Ulyssesは、大規模なAIの訓練と推論のためのより大きなDeepSpeedエコシステムの一部です。DeepSpeedの多くの技術や革新的な機能の詳細については、[ウェブサイト](https://www.deepspeed.ai/)をご覧いただくか、X(以前のTwitter。[英語版](https://twitter.com/MSFTDeepSpeed)、[日本語版](https://twitter.com/MSFTDeepSpeedJP))や、中国の[Zhihu](https://www.zhihu.com/people/deepspeed)でフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md index e91ff1ecd51e..d4cba8b46560 100644 --- a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md @@ -146,7 +146,7 @@ DeepSpeed-VisualChat 是一个易于使用的训练框架,具有很好的可 使用 DeepSpeed-VisualChat 训练模型是简单和方便的。这里我们给出了基于 CLIP 视觉编码器和 LLaMa-7B 模型的一个例子: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd 
DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -161,21 +161,21 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat 为了支持更大的模型推理,我们已经将 Hugging Face 大模型推理集成到我们的 DeepSpeed-VisualChat API 中。因此,用户可以根据 GPU 内存容量和模型大小选择不同数量的 GPU。 -请参考我们的 [GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) 了解更多细节。 +请参考我们的 [GitHub 主页](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) 了解更多细节。 # 7. 发布:今天尝试 DeepSpeed-VisualChat! 我们非常兴奋地分享 DeepSpeed-VisualChat 现已开源并供 AI 社区使用。 -* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。 +* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。 DeepSpeed-VisualChat 是更大的 DeepSpeed 生态系统的一部分,其中包括一系列深度学习系统和建模技术。要了解更多信息, * 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。 * 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 -我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 -* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) 上为我们的仓库点“星”。 +* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) 上为我们的仓库点“星”。 diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md index 9aec568b501b..b6c12787778c 100755 --- a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md @@ -152,7 +152,7 @@ DeepSpeed-VisualChatは使いやすく、かつ優れたスケーラビリティ DeepSpeed-VisualChatの訓練は簡単かつ便利に実行できます。ここではCLIPビジュアルエンコーダーとLLaMa-7Bモデルを使用する例を示します: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -168,21 +168,21 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat より大規模なモデル推論をサポートするために、我々はHugging Faceの大規模モデル推論をDeepSpeed-VisualChat 
APIに組み込みました。そのため、ユーザーはGPUメモリ容量とモデルサイズに基づいて、異なるGPU数を選択することができます。 -詳細は[ランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。 +詳細は[ランディングページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。 # 7. 早速使ってみましょう! DeepSpeed-VisualChatがオープンソース化され、AIコミュニティで利用できるようになったことを大変嬉しく思います。 -* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。 +* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。 DeepSpeed-VisualChatは、さまざまなDeep Learningシステムやモデリング技術を含む、より大きなDeepSpeedエコシステムの一部です。詳細については、以下をご覧ください。 * 私たちの[ウェブサイト](https://www.deepspeed.ai/)で、詳細なブログ記事、チュートリアル、役立つドキュメントを提供しています。 * DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 -* 私たちの[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください! +* 私たちの[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください! diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md index 98f9f298ea5a..b2bbb08de50e 100755 --- a/blogs/deepspeed-visualchat/10-03-2023/README.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README.md @@ -153,7 +153,7 @@ DeepSpeed-VisualChat is an easy-to-use training framework with great scalability The training experience of DeepSpeed-VisualChat is straightforward and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -167,15 +167,15 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat ``` To support larger model inference, we have incorporated Hugging Face large model inference into our DeepSpeed-VisualChat API. 
Therefore, users can choose a different number of GPUs based on the GPU memory capacity and the model size. -Please refer to our [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details. +Please refer to our [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details. # 7. Release: Try DeepSpeed-VisualChat today! We are very excited to share that DeepSpeed-VisualChat is now open-sourced and available to the AI community. -* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future. +* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future. DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more, @@ -183,6 +183,6 @@ DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which inc * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. * Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. 
-* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md index dabc4ab077f2..d3bebfec598f 100644 --- a/blogs/deepspeed4science/chinese/README.md +++ b/blogs/deepspeed4science/chinese/README.md @@ -123,11 +123,11 @@ DeepSpeed4Science的旅程始于两个开创性的基于LLM的结构生物学研 *图9:由不同框架在不同规模下支持的两个GenSLMs模型的最大序列长度。使用NVIDIA DGX,每个节点有八个40G A100 GPU。*
-具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/microsoft/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码异步处理和位置码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态异步处理等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于那些更喜欢相对位置编码技术这样的算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也进行了集成。 +具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/deepspeedai/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码异步处理和位置码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态异步处理等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于那些更喜欢相对位置编码技术这样的算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也进行了集成。 ## 总结和路线图 -我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请给我们发送电子邮件至。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)上报告问题、贡献PR、参与讨论。 +我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请给我们发送电子邮件至。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)上报告问题、贡献PR、参与讨论。 ## 致谢 diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md index 276528650ab5..21b56d2dd944 100644 --- a/blogs/deepspeed4science/japanese/README.md +++ b/blogs/deepspeed4science/japanese/README.md @@ -123,11 +123,11 @@ DeepSpeed4Scienceは、構造生物学研究(タンパク質構造予測や平 *図9: 異なるスケールで異なるフレームワークがサポートする2つのGenSLMsモデルの最大シーケンス長。1ノードあたり8個の40G A100 GPUを搭載したNVIDIA DGXノードを使用。*
-システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/microsoft/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術を相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。 +システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/deepspeedai/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術を相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。 ## まとめとロードマップ -DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせはまでお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)でお願いします。 +DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせはまでお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)でお願いします。 ## 謝辞 diff --git a/blogs/intel-inference/README.md b/blogs/intel-inference/README.md index 107e46808d2e..917117523abc 100644 --- a/blogs/intel-inference/README.md +++ b/blogs/intel-inference/README.md @@ -128,15 +128,15 @@ This work was made possible through deep collaboration between software engineer [8] Intel, "Intel® Neural Compressor," [Online]. Available: https://github.com/intel/neural-compressor. -[9] Microsoft, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/microsoft/DeepSpeed/blob/master/docs/_tutorials/accelerator-abstraction-interface.md. 
+[9] Microsoft, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/deepspeedai/DeepSpeed/blob/master/docs/_tutorials/accelerator-abstraction-interface.md. [10] Microsoft, "Automatic Tensor Parallelism for HuggingFace Models," [Online]. Available: https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism. [11] Intel, "Intel® oneCCL Bindings for PyTorch," [Online]. Available: https://github.com/intel/torch-ccl. -[12] Microsoft, "deepspeed," [Online]. Available: https://github.com/microsoft/deepspeed. +[12] Microsoft, "deepspeed," [Online]. Available: https://github.com/deepspeedai/deepspeed. -[13] Intel, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/microsoft/DeepSpeed/pull/3184. +[13] Intel, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/deepspeedai/DeepSpeed/pull/3184. [14] Intel, "Intel® Extension for PyTorch large language model example," [Online]. Available: https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch/examples/cpu/inference/python/llm. diff --git a/blogs/windows/08-2024/README.md b/blogs/windows/08-2024/README.md index 8a23372a1d64..335ef473504c 100644 --- a/blogs/windows/08-2024/README.md +++ b/blogs/windows/08-2024/README.md @@ -11,7 +11,7 @@ DeepSpeed is a popular open-source deep learning optimization library that makes In this blog, we are pleased to announce some early achievements on this journey: DeepSpeed can now be installed in Windows and run natively for single-GPU training, finetuning, and inferencing. Importantly, both the installation and usage experiences are identical to those on Linux. Furthermore, the finetuning and inferencing workloads demonstrate the functioning of three critical DeepSpeed features, HuggingFace Transformers integration, LoRA support, and CPU Offloading. DeepSpeed on Windows is available in DeepSpeed versions 0.14.5 and above. In the rest of this blog, we present examples to demonstrate these achievements. # Evaluation Environment -We conducted the experiments on a Surface Laptop Studio 2 running Windows 11 Version 23H2 and Build 22631.3880. The laptop is equipped with a single NVIDIA RTX A2000 GPU with 4GB VRAM. We used Pytorch version 2.3.0 and HuggingFace Transformers version 4.41.2. The example scripts used are from the [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples), therefore you need to clone the repo before running any of the following examples. +We conducted the experiments on a Surface Laptop Studio 2 running Windows 11 Version 23H2 and Build 22631.3880. The laptop is equipped with a single NVIDIA RTX A2000 GPU with 4GB VRAM. We used Pytorch version 2.3.0 and HuggingFace Transformers version 4.41.2. The example scripts used are from the [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples), therefore you need to clone the repo before running any of the following examples. # Installation DeepSpeed can be installed on Windows in one of two ways. The easier way is to use the pip package manager, while the other is to build from source. The prerequisites for in both cases are Python 3.x and Pytorch with CUDA support. 
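Since the Windows installation hunk above stops at the prerequisites (Python 3.x and PyTorch with CUDA support), a quick post-install check helps confirm the environment before running the example scripts. The snippet below is a generic sanity-check sketch, not content from the patched blog.

```python
# Verify that PyTorch sees the GPU and that DeepSpeed imports cleanly on Windows.
import torch
import deepspeed

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("deepspeed", deepspeed.__version__)
```

Running DeepSpeed's bundled `ds_report` utility afterwards gives a fuller view of which optional ops are compatible with the environment.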
diff --git a/blogs/windows/08-2024/japanese/README.md b/blogs/windows/08-2024/japanese/README.md index 7e437f737f58..c2f5b9ee2143 100644 --- a/blogs/windows/08-2024/japanese/README.md +++ b/blogs/windows/08-2024/japanese/README.md @@ -12,7 +12,7 @@ DeepSpeedは、分散学習と推論を簡単かつ効率的に行うための # テスト環境 -Windows 11 Version 23H2 および Build 22631.3880 を実行している Surface Laptop Studio 2 でテストを行いました。このハードウェアには、4GBのVRAMを搭載した NVIDIA RTX A2000 GPU が1つ搭載されています。また、PyTorchバージョン 2.3.0 および HuggingFace Transformersバージョン 4.41.2 を使用しました。使用したサンプルスクリプトは[DeepSpeedExamplesリポジトリ](https://github.com/microsoft/DeepSpeedExamples)から取得できます。以下の例を実行する前にリポジトリをクローンしてください。 +Windows 11 Version 23H2 および Build 22631.3880 を実行している Surface Laptop Studio 2 でテストを行いました。このハードウェアには、4GBのVRAMを搭載した NVIDIA RTX A2000 GPU が1つ搭載されています。また、PyTorchバージョン 2.3.0 および HuggingFace Transformersバージョン 4.41.2 を使用しました。使用したサンプルスクリプトは[DeepSpeedExamplesリポジトリ](https://github.com/deepspeedai/DeepSpeedExamples)から取得できます。以下の例を実行する前にリポジトリをクローンしてください。 # インストール diff --git a/blogs/zeropp/japanese/README.md b/blogs/zeropp/japanese/README.md index a4d4e68f6b02..1e5ef91a035f 100644 --- a/blogs/zeropp/japanese/README.md +++ b/blogs/zeropp/japanese/README.md @@ -176,7 +176,7 @@ DeepSpeed-ZeRO++は、DeepSpeedエコシステムの一部です。詳細につ また、[英語版Twitter](https://twitter.com/MSFTDeepSpeed)、[日本語版Twitter](https://twitter.com/MSFTDeepSpeedJP)、[中国語版Zhihuアカウント](https://www.zhihu.com/people/deepspeed)でも最新のDeepSpeedニュースを発信しています。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 **Contributors:** diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md index fc76ed1e9f8d..1a9adfede948 100755 --- a/deepspeed/autotuning/README.md +++ b/deepspeed/autotuning/README.md @@ -214,7 +214,7 @@ If `"stage"` is not defined or set as `"all"`, then the overwriting applies to a Currently, the DeepSpeed Autotuner does not tune offloading behaviors but instead uses the values defined in the offload section of the DeepSpeed configuration file. See [Parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) and [Optimizer offloading](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) for details. -If using NVME for offloading, users can run a benchmark offline to select the optimal `aio` setup in DeepSpeed. Refer to [profiling NVMe and configuring aio param section](https://github.com/microsoft/DeepSpeed/issues/998). +If using NVME for offloading, users can run a benchmark offline to select the optimal `aio` setup in DeepSpeed. Refer to [profiling NVMe and configuring aio param section](https://github.com/deepspeedai/DeepSpeed/issues/998). ## Autotuning Output @@ -342,7 +342,7 @@ In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has " ## GPT2-large Example -This section shows an example of using DeepSpeed autotuning. For more examples, refer to [autotuning](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. +This section shows an example of using DeepSpeed autotuning. 
For more examples, refer to [autotuning](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. Example training script: @@ -412,4 +412,4 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | ---------- | -------------------- | ------------------------ | ------------------------------ | | GPT2-large | 27.874 (mbs = 1) | 56.797 (z = 1, mbs = 2), | 69.061 (z = 1, mbs = 3) | -As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. +As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index be6336d02a19..3089d0c557a4 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -594,7 +594,7 @@ def _generate(self, *inputs, **kwargs): if num_beams > 1: raise NotImplementedError("DeepSpeed does not support `num_beams` > 1, if this is important to you please " - "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506") + "add your request to: https://github.com/deepspeedai/DeepSpeed/issues/2506") if ("input_ids" in kwargs) and (kwargs["input_ids"].dim() == 2): for input_tensor in kwargs["input_ids"]: diff --git a/deepspeed/inference/v2/engine_factory.py b/deepspeed/inference/v2/engine_factory.py index 314f7f2f0485..9c3188dfebb8 100644 --- a/deepspeed/inference/v2/engine_factory.py +++ b/deepspeed/inference/v2/engine_factory.py @@ -100,7 +100,7 @@ def build_hf_engine(path: str, if model_config.model_type == "opt": if not model_config.do_layer_norm_before: raise ValueError( - "Detected OPT-350m model. This model is not currently supported. If this is not the 350m model, please open an issue: https://github.com/microsoft/DeepSpeed-MII/issues" + "Detected OPT-350m model. This model is not currently supported. If this is not the 350m model, please open an issue: https://github.com/deepspeedai/DeepSpeed-MII/issues" ) policy = OPTPolicy(model_config, checkpoint_engine=checkpoint_engine) elif model_config.model_type == "llama": diff --git a/deepspeed/module_inject/containers/features/meta_tensor.py b/deepspeed/module_inject/containers/features/meta_tensor.py index 5fb55bc74339..57b136663be3 100644 --- a/deepspeed/module_inject/containers/features/meta_tensor.py +++ b/deepspeed/module_inject/containers/features/meta_tensor.py @@ -60,7 +60,7 @@ def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): layer of the model for searching the parameter's name in a checkpoint file. For more information of how this is used please see - https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py + https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py 2. `split_qkv` (Default: True): we use this flag when splitting the qkv parameter into heads. 
If it is False, it means the heads of q, k, and v are stored together and needs to split in the diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 32c88549c821..26d242d33e2f 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -644,7 +644,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No policy.update({plcy._orig_layer_class: (replace_fn, plcy)}) assert len(policy.items()) > 0,\ "No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\ - "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" + "You can find some samples here: https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" replaced_module, _ = _replace_module(model, policy, state_dict=sd) return replaced_module diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 340bc82de508..a00d694fbc14 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -43,7 +43,7 @@ try: # To enable Tutel MoE optimizations: - # python3 -m pip install --user --upgrade git+https://github.com/microsoft/tutel@v0.1.x + # python3 -m pip install --user --upgrade git+https://github.com/deepspeedai/tutel@v0.1.x from tutel import moe as tutel_moe TUTEL_INSTALLED = True except: diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py index e25621bd0977..37f065e48631 100755 --- a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -8,7 +8,7 @@ class BertSparseSelfAttention(nn.Module): - """Implements Sparse Self Attention layer of Bert model based on https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373 + """Implements Sparse Self Attention layer of Bert model based on https://github.com/deepspeedai/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373 For more information please see, TODO DeepSpeed Sparse Transformer. diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 82011eb68568..68ac3dc285c7 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -178,7 +178,7 @@ When using DeepSpeed for model training, the profiler can be configured in the d #### Example: Megatron-LM -For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). +For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM). An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attention_heads = 32, batch_size = 1024, seq_length = 1024`) is shown below. 
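The profiler output referred to above is elided by this diff. For orientation, here is a minimal sketch of the `flops_profiler` section of a DeepSpeed configuration that produces such a per-module report during training; the values shown are arbitrary examples rather than recommendations.

```python
# Illustrative config fragment: enable the DeepSpeed FLOPS profiler for one step.
ds_config = {
    "train_batch_size": 1024,        # placeholder, matching no particular setup
    "flops_profiler": {
        "enabled": True,
        "profile_step": 1,           # which training step to profile
        "module_depth": -1,          # -1 profiles modules at all depths
        "top_modules": 1,            # how many top modules to report
        "detailed": True,            # include the per-module breakdown
        "output_file": None          # None prints the report to stdout
    },
}
```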
diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index c2fa907d7dbb..2fadce52222c 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -45,7 +45,7 @@ def all_to_all_quant_reduce(tensors: List[Tensor], groups: {}) -> List[Tensor]: elif tensor.numel() % (2 * global_world_size) != 0: # Due to the constraint of 2-stage all-to-all, the input tensor must be divisible by 2 * global_world_size # Otherwise, all-to-all cannot be performed because of shape mismatch. - # See more at https://github.com/microsoft/DeepSpeed/pull/5056 + # See more at https://github.com/deepspeedai/DeepSpeed/pull/5056 logger.warning( f"qgZ falls back to reduce_scatter because tensor size = {tensor.numel()} is not divisible by (2 * global_world_size) = {2 * global_world_size}. Please consider allocating a new world to enable qgZ" ) @@ -101,7 +101,7 @@ def all_to_all_loco_quant_reduce( elif tensor.numel() % (2 * global_world_size) != 0: # Due to the constraint of 2-stage all-to-all, the input tensor must be divisible by 2 * global_world_size # Otherwise, all-to-all cannot be performed because of shape mismatch. - # See more at https://github.com/microsoft/DeepSpeed/pull/5056 + # See more at https://github.com/deepspeedai/DeepSpeed/pull/5056 logger.warning( f"qgZ falls back to reduce_scatter because tensor size = {tensor.numel()} is not divisible by (2 * global_world_size) = {2 * global_world_size}. Please consider allocating a new world to enable qgZ" ) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 697cb294d1f3..3627d4675a71 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1793,7 +1793,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): # Sum across all model parallel GPUs. if len(grad_norms) == 0: - # FIX https://github.com/microsoft/DeepSpeed/issues/3564 + # FIX https://github.com/deepspeedai/DeepSpeed/issues/3564 total_norm_cuda = torch.tensor(0, dtype=gradients[0].dtype).to(get_accelerator().device_name()).double() else: diff --git a/docker/Dockerfile b/docker/Dockerfile index 5a62a5a01aba..263a30be27c5 100755 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -174,7 +174,7 @@ USER deepspeed ############################################################################## # DeepSpeed ############################################################################## -RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN git clone https://github.com/deepspeedai/DeepSpeed.git ${STAGE_DIR}/DeepSpeed RUN cd ${STAGE_DIR}/DeepSpeed && \ git checkout . 
&& \ git checkout master && \ diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index c3ebad4f86af..46875d75059b 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -8,7 +8,7 @@ main: - title: 'Documentation' url: https://deepspeed.readthedocs.io/ - title: 'GitHub' - url: https://github.com/microsoft/DeepSpeed + url: https://github.com/deepspeedai/DeepSpeed lnav: - title: 'Training' diff --git a/docs/_pages/deepspeed4science.md b/docs/_pages/deepspeed4science.md index b35351838f22..b1aa706ad180 100755 --- a/docs/_pages/deepspeed4science.md +++ b/docs/_pages/deepspeed4science.md @@ -24,14 +24,14 @@ To cite DeepSpeed4Science, please cite our [white paper](https://arxiv.org/abs/2 ## New Megatron-DeepSpeed for Large-Scale AI4Science Model Training -We are proud to introduce [new Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed), which is an updated framework for large-scale model training. We rebased and enabled DeepSpeed with the newest Megatron-LM for long sequence support and many other capabilities. With the new Megatron-DeepSpeed, users can now train their large AI4Science models like GenSLMs with much longer sequences via a synergetic combination of ZeRO-style data parallelism, tensor parallelism, sequence parallelism, pipeline parallelism, model state offloading, and several newly added memory optimization techniques such as attention mask offloading and position embedding partitioning. +We are proud to introduce [new Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed), which is an updated framework for large-scale model training. We rebased and enabled DeepSpeed with the newest Megatron-LM for long sequence support and many other capabilities. With the new Megatron-DeepSpeed, users can now train their large AI4Science models like GenSLMs with much longer sequences via a synergetic combination of ZeRO-style data parallelism, tensor parallelism, sequence parallelism, pipeline parallelism, model state offloading, and several newly added memory optimization techniques such as attention mask offloading and position embedding partitioning. ![new Megatron-DeepSpeed](/assets/images/new-megatron-ds.png){: .align-center}

The figure depicts system capability in terms of enabling long sequence lengths for training a 33B parameter GPT-like model using our new Megatron-DeepSpeed framework. The results show that the new Megatron-DeepSpeed enables 9x longer sequence lengths than NVIDIA's Megatron-LM without triggering an out-of-memory error.

-To see how the new Megatron-DeepSpeed helps enabling new system capabilities, such as training models with massive sequences length, please read our [tutorial](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support). +To see how the new Megatron-DeepSpeed helps enabling new system capabilities, such as training models with massive sequences length, please read our [tutorial](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support). Meanwhile, our new Megatron-DeepSpeed has been applied to genome-scale foundation model [GenSLMs](https://github.com/ramanathanlab/genslm), which is a 2022 [ACM Gordon Bell award](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022) winning genome-scale language model from Argonne National Lab. To achieve their scientific goal, GenSLMs and similar models require very long sequence support for both training and inference that is beyond generic LLM's long-sequence strategies. By leveraging DeepSpeed4Science's new Megatron-DeepSpeed, GenSLMs team is able to train their 25B model with 512K sequence length, much longer than their original 42K sequence length. Detailed information about the methodology can be found at [our website](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/). GenSLMs team also hosts an [example](https://github.com/ramanathanlab/genslm/tree/main/examples/long-sequences) about how to use DeepSpeed4Science in the GenSLMs repo. diff --git a/docs/_pages/inference.md b/docs/_pages/inference.md index f44d0b94ceec..fb3534872439 100755 --- a/docs/_pages/inference.md +++ b/docs/_pages/inference.md @@ -6,10 +6,10 @@ toc: true toc_label: "Contents" --- ->**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** +>**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** DeepSpeed-Inference introduces several features to efficiently serve transformer-based PyTorch models. It supports model parallelism (MP) to fit large models that would otherwise not fit in GPU memory. Even for smaller models, MP can be used to reduce latency for inference. To further reduce latency and cost, we introduce inference-customized kernels. Finally, we propose a novel approach to quantize models, called MoQ, to both shrink the model and reduce the inference cost at production. For more details on the inference related optimizations in DeepSpeed, please refer to our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/). -DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. 
To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). +DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). To get started with DeepSpeed-Inference, please checkout our [tutorial](https://www.deepspeed.ai/tutorials/inference-tutorial/). diff --git a/docs/_posts/2020-05-19-bert-record.md b/docs/_posts/2020-05-19-bert-record.md index b47ad0b0beaf..67d00280348e 100644 --- a/docs/_posts/2020-05-19-bert-record.md +++ b/docs/_posts/2020-05-19-bert-record.md @@ -19,4 +19,4 @@ the same number and generation of GPUs. * Brief overview, see our [press release](https://www.microsoft.com/en-us/research/blog/ZeRO-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). * Tutorial on how to reproduce our results, see our [BERT pre-training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/). -* The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). +* The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed) and BERT pre-training code can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples). diff --git a/docs/_posts/2020-05-28-fastest-bert-training.md b/docs/_posts/2020-05-28-fastest-bert-training.md index 62be6c1bffce..2154c36fe279 100644 --- a/docs/_posts/2020-05-28-fastest-bert-training.md +++ b/docs/_posts/2020-05-28-fastest-bert-training.md @@ -284,7 +284,7 @@ and faster convergence. To try out these optimizations and training recipe, please check out our [BERT training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/) and source code at the [DeepSpeed GitHub -repo](https://github.com/microsoft/deepspeed). +repo](https://github.com/deepspeedai/deepspeed). 
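To make the multi-GPU inference flow described above concrete, here is a hedged sketch of wrapping an already-loaded Hugging Face model with `deepspeed.init_inference`, using kernel injection and a model-parallel degree of 2. The model name is a placeholder, and keyword names such as `mp_size` have shifted between DeepSpeed releases, so treat this as illustrative rather than canonical.

```python
# Illustrative sketch: model-parallel inference with injected kernels.
# Placeholder model; typically launched with:  deepspeed --num_gpus 2 infer_sketch.py
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

engine = deepspeed.init_inference(
    model,
    mp_size=2,                         # model parallelism degree
    dtype=torch.float16,
    replace_with_kernel_inject=True,   # swap in DeepSpeed inference kernels
)

inputs = tokenizer("DeepSpeed is", return_tensors="pt").to(engine.module.device)
outputs = engine.module.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```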
### References diff --git a/docs/_posts/2020-09-08-sparse-attention-news.md b/docs/_posts/2020-09-08-sparse-attention-news.md index 79de33a82e3a..b9a0aeb88d9b 100644 --- a/docs/_posts/2020-09-08-sparse-attention-news.md +++ b/docs/_posts/2020-09-08-sparse-attention-news.md @@ -11,4 +11,4 @@ DeepSpeed offers sparse attention kernels, an instrumental technology to support * Brief overview, see our [press release]({{ site.press_release_v3 }}). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/sparse-attention.html). * Tutorial on how to use sparse attention, see our [Sparse attention tutorial](https://www.deepspeed.ai/tutorials/sparse-attention/). -* The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). +* The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples). diff --git a/docs/_posts/2020-09-09-ZeRO-Offload.md b/docs/_posts/2020-09-09-ZeRO-Offload.md index 8e2e8423fd55..e0626f791a4e 100755 --- a/docs/_posts/2020-09-09-ZeRO-Offload.md +++ b/docs/_posts/2020-09-09-ZeRO-Offload.md @@ -10,4 +10,4 @@ We introduce a new technology called ZeRO-Offload to enable **10X bigger model t * For more information on ZeRO-Offload, see our [press release]( {{ site.press_release_v3 }} ). * For more information on how to use ZeRO-Offload, see our [ZeRO-Offload tutorial](https://www.deepspeed.ai/tutorials/ZeRO-offload/). -* The source code for ZeRO-Offload can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). +* The source code for ZeRO-Offload can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). diff --git a/docs/_posts/2020-09-09-onebit-adam-news.md b/docs/_posts/2020-09-09-onebit-adam-news.md index d0adcb09987f..1fd8ef89edce 100644 --- a/docs/_posts/2020-09-09-onebit-adam-news.md +++ b/docs/_posts/2020-09-09-onebit-adam-news.md @@ -17,4 +17,4 @@ its efficient implementation in DeepSpeed. 1-bit Adam offers the ***same converg * Brief overview, see our [press release]({{ site.press_release_v3 }}). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html). * Tutorial on how to reproduce our results, see our [1-bit Adam tutorial](/tutorials/onebit-adam/). -* The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). +* The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). 
The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). diff --git a/docs/_posts/2020-09-09-pipeline-parallelism.md b/docs/_posts/2020-09-09-pipeline-parallelism.md index 48343ebd8d1e..fe708bc4d50d 100644 --- a/docs/_posts/2020-09-09-pipeline-parallelism.md +++ b/docs/_posts/2020-09-09-pipeline-parallelism.md @@ -14,5 +14,5 @@ low-bandwidth network by up to 7x. * For a brief overview and results including trillion-parameter capabilities, see our [press release]({{ site.press_release_v3 }}). * To get started with pipeline parallel training in DeepSpeed, we recommend our [tutorial](/tutorials/pipeline/). -* See our AlexNet example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). +* See our AlexNet example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). * Read our API documentation on [readthedocs](https://deepspeed.readthedocs.io/en/latest/pipeline.html). diff --git a/docs/_posts/2020-09-09-sparse-attention.md b/docs/_posts/2020-09-09-sparse-attention.md index 9675ef1058dd..1ab827d6fc8e 100644 --- a/docs/_posts/2020-09-09-sparse-attention.md +++ b/docs/_posts/2020-09-09-sparse-attention.md @@ -28,7 +28,7 @@ In a pre-training experiment, we ran BERT model under three settings: dense, den ![Maximum sequence runnable on BERT](/assets/images/sa_maximum_sequence_runnable_on_bert.png){: .align-center} * **Up to 6.3x faster computation** -We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. +We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. 
![Training time for BERT base with varying sequence length](/assets/images/sa_bert_base_time_result.png){: .align-center} diff --git a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md index ee518f53f012..da07edd7b922 100755 --- a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md +++ b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md @@ -10,4 +10,4 @@ We introduce a new technology called progressive layer dropping (PLD) to speedup * For detailed technology deep dive, see our [technical report](https://arxiv.org/pdf/2010.13369.pdf). * For more information on how to use PLD, see our [Progressive layer dropping tutorial](https://www.deepspeed.ai/tutorials/progressive_layer_dropping/). - * The source code for PLD is now available at the [DeepSpeed repo](https://github.com/microsoft/deepspeed). + * The source code for PLD is now available at the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). diff --git a/docs/_posts/2021-11-15-autotuning.md b/docs/_posts/2021-11-15-autotuning.md index 71acf54438ea..410e32c878a3 100644 --- a/docs/_posts/2021-11-15-autotuning.md +++ b/docs/_posts/2021-11-15-autotuning.md @@ -8,8 +8,8 @@ toc: false We introduce a new feature called Autotuning to automatically discover the optimal DeepSpeed configuration that delivers good training speed. One pain point in model training is to figure out good performance-relevant configurations such as micro-batch size to fully utilize the hardware and achieve a high throughput number. This configuration exploring process is commonly done manually but is important since model training is repeated many times and benefits from using a good configuration. Not only is the hand-tuning process time-consuming, but the outcome is hardware-dependent. This means that a good configuration on one hardware might not be the best on another different hardware. The user thus has to hand tune the configuration again. With DeepSpeed, there are more configuration parameters that could potentially affect the training speed, thus making it more tedious to manually tune the configuration. -The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) would demonstrate the effectiveness of autotuning across different models. +The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) would demonstrate the effectiveness of autotuning across different models. * For a brief overview, see the [Autotuning tutorial](https://www.deepspeed.ai/tutorials/autotuning/). -* For more information on how to use Autotuning, see the [Autotuning README](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#deepspeed-autotuning). -* The source code can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). 
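As a sketch of how the Autotuner described in this post is typically switched on (assumed usage, not an excerpt from the linked examples), the user adds an `autotuning` section to the DeepSpeed configuration, marks tunable fields, and launches the unmodified training script through the `deepspeed` launcher in autotuning mode; exact flag spellings may vary across releases.

```python
# Illustrative autotuning setup: the Autotuner explores performance-relevant
# settings (e.g. micro-batch size) and records the best configuration it finds.
import json

ds_config = {
    "train_micro_batch_size_per_gpu": "auto",  # let the Autotuner explore this
    "fp16": {"enabled": True},
    "autotuning": {"enabled": True},
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)

# The tuning run is then started from the command line, for example:
#   deepspeed --autotuning run train.py --deepspeed ds_config.json
```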
+* For more information on how to use Autotuning, see the [Autotuning README](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#deepspeed-autotuning). +* The source code can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). diff --git a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md index 99a62fbe00ea..69fef131d3c0 100644 --- a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md +++ b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md @@ -170,9 +170,9 @@ high quality language models accessible to a broad audience, even with limited compute resources. To this end we are releasing our [end-to-end pipeline for training MoE based -NLG models](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training), +NLG models](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/moe-training), along with [specific example -scripts](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training/examples_deepspeed/MoE) +scripts](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/moe-training/examples_deepspeed/MoE) and [tutorial](/tutorials/mixture-of-experts-nlg) to help get started with our pipeline. We look forward to the application and the innovations that this may bring to the deep learning community. diff --git a/docs/_posts/2022-07-26-deepspeed-azure.md b/docs/_posts/2022-07-26-deepspeed-azure.md index 749be582d9a0..540f74d4be1b 100644 --- a/docs/_posts/2022-07-26-deepspeed-azure.md +++ b/docs/_posts/2022-07-26-deepspeed-azure.md @@ -19,7 +19,7 @@ In this extended post, we share the details of how DeepSpeed users can train tri ## Making distributed training faster and easier on Azure using DeepSpeed -We compare the existing manual and error-prone workflow with our proposed easy-to-use workflow for DeepSpeed on Azure in *Figure 2*. Customers can now use easy-to-use [training pipelines](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed) to launch training jobs at scale. The new workflow reduces the number of steps from 11 to just 1 if users rely on the recommended [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/) [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). +We compare the existing manual and error-prone workflow with our proposed easy-to-use workflow for DeepSpeed on Azure in *Figure 2*. Customers can now use easy-to-use [training pipelines](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed) to launch training jobs at scale. The new workflow reduces the number of steps from 11 to just 1 if users rely on the recommended [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/) [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). ![Workflow](/assets/images/old-vs-new-azure.png){: .align-center} @@ -29,7 +29,7 @@ We compare the existing manual and error-prone workflow with our proposed easy-t For users who have custom environments built using Azure VMs or [Azure VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview), only two steps are needed: - 1) Run the cluster setup script (to be released in the next few weeks) -- 2) Use the Azure VMSS [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) to launch training. +- 2) Use the Azure VMSS [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) to launch training. 
## Key Performance Benefits We already shared a summary of our key performance results in the Azure [announcement](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/). We enable the capability to train 2x larger model sizes (2 trillion vs. 1 trillion parameters), scale to 2x more GPUs (1024 vs. 512), and offer up to 1.8x higher compute throughput/GPU (150 TFLOPs vs. 81 TFLOPs) compared to other [cloud providers](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff). @@ -48,7 +48,7 @@ We share the details of our experimental setup and some of the best practices we We used [NDm A100 v4-series](https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) instances in our experiments. Each instance includes two socket AMD EPYC 7V12 64-Core CPUs, 1.7TB main memory and eight A100 80GB GPUs. The system has a balanced PCIe topology connecting 4 GPU devices to each CPU socket. Each GPU within the VM is provided with its own dedicated, topology-agnostic 200 Gb/s NVIDIA Mellanox HDR InfiniBand connection providing an accelerated 200 Gbps high speed fabric. The DeepSpeed library exploits offload capabilities where the activation and optimizer states are allocated in the main memory. Hence, 1.7TB memory capacity per node helps us to scale to large model sizes. ### Training setup using AzureML -Users can directly use the AzureML studio and use our published [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) to run experiments without any additional setup. This is the easiest and recommended way of running experiments on Azure. +Users can directly use the AzureML studio and use our published [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) to run experiments without any additional setup. This is the easiest and recommended way of running experiments on Azure. ### Training setup using Azure VMSS @@ -59,7 +59,7 @@ A cluster is created using Azure Virtual Machine Scale Sets (VMSS) to provision | ------------------------------: | :----------------: | | PyTorch | 1.10.2 (installed from source) | | DeepSpeed | 0.6.2 (installed from source) | -| Megatron-LM | [https://github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) | +| Megatron-LM | [https://github.com/deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) | | Apex | 0.1 | | NCCL | 2.12.10 | | CUDNN | 8.2.4.15 | @@ -122,9 +122,9 @@ The 2T parameter model consists of 160 layers, 32k hidden dimension, and 128 att We recognize that DeepSpeed users are diverse and have different environments. In this tutorial, our focus is on making things simpler for users who plan to run large model training experiments on Azure. -> The easiest way to do model training on Azure is via the Azure ML recipes. The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). Users simply need to setup their Azure ML workspace following the [guide](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up) and submit experiment using the aml_submit.py file. +> The easiest way to do model training on Azure is via the Azure ML recipes. 
The job submission and data preparation scripts have been made available [here](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). Users simply need to setup their Azure ML workspace following the [guide](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up) and submit experiment using the aml_submit.py file. -Some users have customized environments built on top of Azure VMs and VMSS based clusters. To simplify training on such setups, we are working on an easy-to-use cluster setup script that will be published in the next few weeks. If you already have a cluster setup running, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) for the 175B and the 1T model. The recipes can easily be modified to train other model configurations. +Some users have customized environments built on top of Azure VMs and VMSS based clusters. To simplify training on such setups, we are working on an easy-to-use cluster setup script that will be published in the next few weeks. If you already have a cluster setup running, you can use the [azure recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) for the 175B and the 1T model. The recipes can easily be modified to train other model configurations. ## Acknowledgement diff --git a/docs/_posts/2022-09-10-zero-inference.md b/docs/_posts/2022-09-10-zero-inference.md index 59a3e3bf15fa..3c588e39c1dc 100644 --- a/docs/_posts/2022-09-10-zero-inference.md +++ b/docs/_posts/2022-09-10-zero-inference.md @@ -83,7 +83,7 @@ Next, we measure the impact on generation throughput using four V100-32GB GPUs. We briefly discuss how users can determine when ZeRO-Inference is suitable for their application and how to enable ZeRO-Inference in DeepSpeed. ### When to use ZeRO-Inference -ZeRO-Inference is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. Also, ZeRO-Inference is optimized for inference applications that are **throughput-oriented** and allow **large batch sizes**. Alternative techniques, such as [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed-Inference](https://www.deepspeed.ai/inference/), and [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) that fit the entire model into GPU memory, possibly using multiple GPUs, are more suitable for inference applications that are latency sensitive or have small batch sizes. +ZeRO-Inference is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. Also, ZeRO-Inference is optimized for inference applications that are **throughput-oriented** and allow **large batch sizes**. Alternative techniques, such as [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed-Inference](https://www.deepspeed.ai/inference/), and [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) that fit the entire model into GPU memory, possibly using multiple GPUs, are more suitable for inference applications that are latency sensitive or have small batch sizes. ### How to use ZeRO-Inference ZeRO-Inference is available in the DeepSpeed library versions >= 0.6.6. 
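The following is a minimal sketch of the ZeRO-Inference recipe this section goes on to describe: ZeRO stage 3 with parameter offloading to CPU (or NVMe), wrapped around a generation model. The model name, dtype, and token counts are placeholders, and the config fragment shows only the keys relevant to offloading.

```python
# Illustrative ZeRO-Inference sketch: host model weights in CPU memory via
# ZeRO stage 3 parameter offloading and stream them to the GPU on demand.
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

ds_config = {
    "train_micro_batch_size_per_gpu": 1,   # required field; no training is done
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},  # or "nvme" with an nvme_path
    },
}

model_name = "facebook/opt-1.3b"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

engine, _, _, _ = deepspeed.initialize(model=model, config=ds_config)
engine.module.eval()

inputs = tokenizer("DeepSpeed ZeRO-Inference", return_tensors="pt").to(engine.device)
with torch.no_grad():
    outputs = engine.module.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```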
Integrating ZeRO-Inference into token generation pipelines, such as [Hugging Face generate](https://huggingface.co/docs/transformers/main_classes/text_generation), requires updating the DeepSpeed configuration to set [ZeRO optimization](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) to stage 3 and [parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) to CPU or NVMe. diff --git a/docs/_posts/2022-10-11-mii.md b/docs/_posts/2022-10-11-mii.md index e0b43f51b1e0..324b7ffbad33 100644 --- a/docs/_posts/2022-10-11-mii.md +++ b/docs/_posts/2022-10-11-mii.md @@ -11,7 +11,7 @@ The Deep Learning (DL) open-source community has seen tremendous growth in the l There has been significant progress in system optimizations for DL model inference that can drastically reduce both latency and cost, but those are not easily accessible. The main reason for this limited accessibility is that the DL model inference landscape is diverse with models varying in size, architecture, system performance characteristics, hardware requirements, etc. Identifying the appropriate set of system optimizations applicable to a given model and applying them correctly is often beyond the scope of most data scientists, making low latency and low-cost inference mostly inaccessible. -[DeepSpeed Model Implementations for Inference (MII)](https://github.com/microsoft/DeepSpeed-MII) is a new open-source python library from DeepSpeed, aimed towards making low-latency, low-cost inference of powerful models not only feasible but also easily accessible. +[DeepSpeed Model Implementations for Inference (MII)](https://github.com/deepspeedai/DeepSpeed-MII) is a new open-source python library from DeepSpeed, aimed towards making low-latency, low-cost inference of powerful models not only feasible but also easily accessible. * MII offers access to highly optimized implementations of **thousands of widely used DL models.** * MII supported models achieve significantly lower latency and cost compared to their original implementation. @@ -33,7 +33,7 @@ Under-the-hood MII is powered by [DeepSpeed-Inference](https://arxiv.org/abs/220 MII supports a growing list of tasks such as text generation, question-answering, text classification, etc, across thousands of transformer models available through multiple open-sourced model repositories such as Hugging Face, FairSeq, EluetherAI, etc. It supports dense models based on BERT, RoBERTa, GPT, OPT, and BLOOM architectures ranging from a few hundred million parameters in size to hundreds of billions of parameters in size. At the same time, it supports recent image generation models such as Stable Diffusion. -See the MII GitHub repo for an up-to-date list of [models and tasks supported by MII](https://github.com/microsoft/deepspeed-mii#supported-models-and-tasks). +See the MII GitHub repo for an up-to-date list of [models and tasks supported by MII](https://github.com/deepspeedai/deepspeed-mii#supported-models-and-tasks). # Inference Optimizations with MII @@ -133,7 +133,7 @@ mii.deploy(task="text-to-image", deployment_type=DeploymentType.AML) ``` -To learn more about these deployment options and get started with MII, please the [MII getting started guide](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). +To learn more about these deployment options and get started with MII, please the [MII getting started guide](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii). 
# Concluding Remarks diff --git a/docs/_posts/2022-12-12-data-efficiency.md b/docs/_posts/2022-12-12-data-efficiency.md index 52148707b767..82931a30e167 100644 --- a/docs/_posts/2022-12-12-data-efficiency.md +++ b/docs/_posts/2022-12-12-data-efficiency.md @@ -141,4 +141,4 @@ The composed DeepSpeed Data Efficiency solution leverages both data efficiency t # Concluding Remarks -We are very excited to share DeepSpeed Data Efficiency library with the community and improve it with your feedback. Please find the code, tutorial, and documents at the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed), and [website](/tutorials/data-efficiency/). And for more technical details please read our [Random-LTD paper](https://arxiv.org/abs/2211.11586) and [DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597). We believe that our composable library and novel data efficiency techniques will help users reduce training cost while maintaining model quality or achieve better quality under similar cost. And we hope DeepSpeed Data Efficiency could become a platform that motivates and accelerates future research on deep learning data efficiency. +We are very excited to share DeepSpeed Data Efficiency library with the community and improve it with your feedback. Please find the code, tutorial, and documents at the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed), and [website](/tutorials/data-efficiency/). And for more technical details please read our [Random-LTD paper](https://arxiv.org/abs/2211.11586) and [DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597). We believe that our composable library and novel data efficiency techniques will help users reduce training cost while maintaining model quality or achieve better quality under similar cost. And we hope DeepSpeed Data Efficiency could become a platform that motivates and accelerates future research on deep learning data efficiency. diff --git a/docs/_posts/2023-03-31-multi-modal.md b/docs/_posts/2023-03-31-multi-modal.md index aaef9cfbfd2a..63ea2f94f850 100644 --- a/docs/_posts/2023-03-31-multi-modal.md +++ b/docs/_posts/2023-03-31-multi-modal.md @@ -34,4 +34,4 @@ Specifically, we incorporate the MoE structure into the classical single-tower m A sophisticated MoE model design requires a highly efficient and scalable training system that can support multi-dimensional parallelism and efficient memory management. [DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) training system offers such advanced capabilities including easy-to-use APIs enabling flexible combinations of data, tensor, and expert parallelism. Furthermore, DeepSpeed MoE enables larger model scale than state-of-the-art systems by exploiting expert parallelism and [ZeRO optimizations](https://arxiv.org/abs/1910.02054) together. By leveraging the DeepSpeed MoE system, VL-MoE Base with 32 experts achieves similar model quality as VLMO-dense Large with about 2.5x training speedup. -[DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) system is already open-sourced and can be easily used as plug-and-play component to achieve high-performance low-cost training for any large-scale MoE models. The tutorial of how to use DeepSpeed MoE is available [here](https://www.deepspeed.ai/tutorials/mixture-of-experts/). 
VL-MoE is currently in the process of being integrated as a model example of [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples). Please stay tuned for our upcoming updates on this thread. +[DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) system is already open-sourced and can be easily used as plug-and-play component to achieve high-performance low-cost training for any large-scale MoE models. The tutorial of how to use DeepSpeed MoE is available [here](https://www.deepspeed.ai/tutorials/mixture-of-experts/). VL-MoE is currently in the process of being integrated as a model example of [DeepSpeed Examples](https://github.com/deepspeedai/DeepSpeedExamples). Please stay tuned for our upcoming updates on this thread. diff --git a/docs/_posts/2023-04-24-deepspeed-chat-chinese.md b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md index 2fd962327b54..57a77caab32d 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat-chinese.md +++ b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/chinese/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF Chinese --- diff --git a/docs/_posts/2023-04-24-deepspeed-chat-japanese.md b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md index 63200846ab65..ee3c8dca00fa 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat-japanese.md +++ b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: ChatGPTライクなモデルを簡単・高速・低コストに、あらゆるスケールで学習" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/japanese/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF Japanese --- diff --git a/docs/_posts/2023-04-24-deepspeed-chat.md b/docs/_posts/2023-04-24-deepspeed-chat.md index 70b627b951ee..f6cad798ca99 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat.md +++ b/docs/_posts/2023-04-24-deepspeed-chat.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF English --- diff --git a/docs/_posts/2023-06-22-zeropp-chinese.md b/docs/_posts/2023-06-22-zeropp-chinese.md index ca52dd5f59ab..71dc2d51cb70 100644 --- a/docs/_posts/2023-06-22-zeropp-chinese.md +++ b/docs/_posts/2023-06-22-zeropp-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed ZeRO++:降低4倍网络通信,显著提高大模型及类ChatGPT模型训练效率" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md date: 2023-06-22 00:00:00 tags: training ZeRO RLHF Chinese --- diff --git a/docs/_posts/2023-06-22-zeropp-japanese.md b/docs/_posts/2023-06-22-zeropp-japanese.md index 745fcac41d97..e81013d11aba 100644 --- a/docs/_posts/2023-06-22-zeropp-japanese.md +++ b/docs/_posts/2023-06-22-zeropp-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed ZeRO++: LLMやチャットモデルの訓練を劇的に高速化 – 
通信オーバヘッドを1/4に大幅削減 -" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md date: 2023-06-22 00:00:00 tags: training ZeRO RLHF Japanese --- diff --git a/docs/_posts/2023-08-24-ulysses-chinese.md b/docs/_posts/2023-08-24-ulysses-chinese.md index 613af2fe7583..f8d269217b7a 100644 --- a/docs/_posts/2023-08-24-ulysses-chinese.md +++ b/docs/_posts/2023-08-24-ulysses-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: 训练极长序列Transformer模型的系统优化" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md date: 2023-08-24 00:00:00 tags: training ZeRO Chinese --- diff --git a/docs/_posts/2023-08-24-ulysses-japanese.md b/docs/_posts/2023-08-24-ulysses-japanese.md index 921c7c28739d..291407a5523e 100644 --- a/docs/_posts/2023-08-24-ulysses-japanese.md +++ b/docs/_posts/2023-08-24-ulysses-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: Transformerモデルを非常に長いシーケンスで訓練するための最適化" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md date: 2023-08-24 00:00:00 tags: training ZeRO Japanese --- diff --git a/docs/_posts/2023-08-24-ulysses.md b/docs/_posts/2023-08-24-ulysses.md index a88a0d66080a..c10b2d599f02 100644 --- a/docs/_posts/2023-08-24-ulysses.md +++ b/docs/_posts/2023-08-24-ulysses.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/README.md date: 2023-08-24 00:00:00 tags: training ZeRO English --- diff --git a/docs/_posts/2023-09-12-ZeRO-Inference.md b/docs/_posts/2023-09-12-ZeRO-Inference.md index 7b9852dc160b..04a6347bec59 100644 --- a/docs/_posts/2023-09-12-ZeRO-Inference.md +++ b/docs/_posts/2023-09-12-ZeRO-Inference.md @@ -1,6 +1,6 @@ title: "ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading" excerpt: "" -link: https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md +link: https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md date: 2023-09-12 00:09:00 tags: inference ZeRO quantization English --- diff --git a/docs/_posts/2023-09-19-deepspeed4science-chinese.md b/docs/_posts/2023-09-19-deepspeed4science-chinese.md index 7b0ccf00aa61..651d61a3b79c 100644 --- a/docs/_posts/2023-09-19-deepspeed4science-chinese.md +++ b/docs/_posts/2023-09-19-deepspeed4science-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md date: 2023-09-19 00:00:00 tags: training inference science Chinese --- diff --git a/docs/_posts/2023-09-19-deepspeed4science-japanese.md b/docs/_posts/2023-09-19-deepspeed4science-japanese.md index 8c0a1b6d0082..20d83c8e0b5a 100644 --- 
a/docs/_posts/2023-09-19-deepspeed4science-japanese.md +++ b/docs/_posts/2023-09-19-deepspeed4science-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md date: 2023-09-19 00:00:00 tags: training inference science Japanese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md index 290b8b4b8ba4..1e0ef0bed34b 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat:多轮图像+文字,为你展现不一样的AI聊天魅力" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md date: 2023-10-04 00:00:00 tags: training Chinese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md index f8b7e20cc2cf..745e9052358e 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md date: 2023-10-04 00:00:00 tags: training Japanese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat.md b/docs/_posts/2023-10-04-deepspeed-visualchat.md index 74a1eb66fd5c..8226597290b2 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md date: 2023-10-04 00:00:00 tags: training English --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md b/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md index cc259c20361a..ec936bb6d79e 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen:通过 MII 和 DeepSpeed-Inference 实现 LLM 高吞吐量文本生成" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md date: 2023-11-06 00:00:00 tags: inference Chinese --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md b/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md index a4bb8237783a..a64b29c88163 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen: MIIとDeepSpeed-InferenceによるLLMのための高速なテキスト生成" excerpt: "" -link: 
https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md date: 2023-11-06 00:00:00 tags: inference Japanese --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen.md b/docs/_posts/2023-11-06-deepspeed-fastgen.md index 2af103e81f15..d9062ce56da3 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen date: 2023-11-06 00:00:00 tags: inference English --- diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index a1aa38725970..30a362b82d25 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -90,8 +90,8 @@ This class implements `class DeepSpeedAccelerator` and will be returned by `get_ 2. Op builders following https://github.com/intel/intel-extension-for-deepspeed/tree/main/intel_extension_for_deepspeed/op_builder. All op builders needs to inherit `deepspeed.ops.op_builder.builder.OpBuilder` directly or indirectly. A common practice is to implement a base op builder (SYCLOpBuilder in the case of Intel Extension for DeepSpeed) and inherit this base op builder instead. 3. Op kernels as in the following [link](https://github.com/intel/intel-extension-for-deepspeed/tree/main/intel_extension_for_deepspeed/op_builder/csrc). -Note that an extension does not have to implement all op builders under https://github.com/microsoft/DeepSpeed/tree/master/op_builder all at a time. A missing op builder usually means certain DeepSpeed functionality cannot be used for that Accelerator, but models that does not use that functionality can still run. +Note that an extension does not have to implement all op builders under https://github.com/deepspeedai/DeepSpeed/tree/master/op_builder all at a time. A missing op builder usually means certain DeepSpeed functionality cannot be used for that Accelerator, but models that does not use that functionality can still run. When implementing op builder for an accelerator extension, one thing needs to be noted is that the op builder native code is being built by DeepSpeed jit load mechanism. This mean the native source file being built needs to be in DeepSpeed installation directory. However these files are defined in accelerator extension installation directory, which cannot be built by DeepSpeed directly. To solve this, follow the example in https://github.com/intel/intel-extension-for-deepspeed/blob/main/intel_extension_for_deepspeed/op_builder/cpu_adam.py to use 'sycl_kernel_path' and 'sycl_kernel_include' (User can change 'sycl' to other prefix in their own accelerator extension) to allow native code be built during DeepSpeed jit load. -When accelerator extension is installed in the environment, it can be used by either explicit call deepspeed.accelerator.set_accelerator(XYZ_Accelerator()) following the example in https://github.com/microsoft/DeepSpeed/blob/master/accelerator/real_accelerator.py, or add an implicit detection code in get_accelerator in the same file above. 
+When accelerator extension is installed in the environment, it can be used by either explicit call deepspeed.accelerator.set_accelerator(XYZ_Accelerator()) following the example in https://github.com/deepspeedai/DeepSpeed/blob/master/accelerator/real_accelerator.py, or add an implicit detection code in get_accelerator in the same file above. diff --git a/docs/_tutorials/accelerator-setup-guide.md b/docs/_tutorials/accelerator-setup-guide.md index 75c20134b5b7..ecb77f475375 100644 --- a/docs/_tutorials/accelerator-setup-guide.md +++ b/docs/_tutorials/accelerator-setup-guide.md @@ -246,7 +246,7 @@ accelerator: npu ## Multi-card parallel training using Huawei Ascend NPU -To perform model training across multiple Huawei Ascend NPU cards using DeepSpeed, see the examples provided in [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/cifar/cifar10_deepspeed.py). +To perform model training across multiple Huawei Ascend NPU cards using DeepSpeed, see the examples provided in [DeepSpeed Examples](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/cifar/cifar10_deepspeed.py). # Intel Gaudi PyTorch models can be run on Intel® Gaudi® AI accelerator using DeepSpeed. Refer to the following user guides to start using DeepSpeed with Intel Gaudi: diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index d27ecf021421..d01378484172 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -106,7 +106,7 @@ pip install . For installs spanning multiple nodes we find it useful to install DeepSpeed using the -[install.sh](https://github.com/microsoft/DeepSpeed/blob/master/install.sh) +[install.sh](https://github.com/deepspeedai/DeepSpeed/blob/master/install.sh) script in the repo. This will build a Python wheel locally and copy it to all the nodes listed in your hostfile (either given via `--hostfile`, or defaults to `/job/hostfile`). @@ -195,7 +195,7 @@ DS_SKIP_CUDA_CHECK=1 Some DeepSpeed features require specific dependencies outside the general dependencies of DeepSpeed. * Python package dependencies per feature/op please -see our [requirements directory](https://github.com/microsoft/DeepSpeed/tree/master/requirements). +see our [requirements directory](https://github.com/deepspeedai/DeepSpeed/tree/master/requirements). * We attempt to keep the system level dependencies to a minimum, however some features do require special system-level packages. Please see our `ds_report` tool output to see if you are missing any system-level packages for a given feature. diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md index 6488f9b718fe..a7de4721a5ce 100755 --- a/docs/_tutorials/automatic-tensor-parallelism.md +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -66,7 +66,7 @@ With automatic tensor parallelism, we do not need to provide the injection polic # Example Script -We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). This script is for testing text-generation models and includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. 
+We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). This script is for testing text-generation models and includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. ## Launching diff --git a/docs/_tutorials/autotuning.md b/docs/_tutorials/autotuning.md index 38648daa89f2..2935f38946ac 100644 --- a/docs/_tutorials/autotuning.md +++ b/docs/_tutorials/autotuning.md @@ -8,23 +8,23 @@ Make sure you've read the DeepSpeed tutorials on [Getting Started](https://www.d One pain point in model training is to figure out good performance-relevant configurations such as micro-batch size to fully utilize the hardware and achieve a high throughput number. This configuration exploring process is commonly done manually but is important since model training is repeated many times and benefits from using a good configuration. Not only is the hand-tuning process time-consuming, but the outcome is hardware-dependent. This means that a good configuration on one hardware might not be the best on another different hardware. The user thus has to hand tune the configuration again. With DeepSpeed, there are more configuration parameters that could potentially affect the training speed, thus making it more tedious to manually tune the configuration. -The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. In this tutorial, we showcase the usage and benefits of the autotuning feature in DeepSpeed. For more details, please see the [README.md](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning). +The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. In this tutorial, we showcase the usage and benefits of the autotuning feature in DeepSpeed. For more details, please see the [README.md](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning). ## Tuning scope and strategy The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. Currently, the DeepSpeed Autotuner tunes ZeRO stages, micro-batch size per GPU, and ZeRO configurations (offloading is not yet supported) on top of other configurations such as optimizer, scheduler, fp16 defined by the user in the DeepSpeed configuration file. -Note that ZeRO stages, micro-batch sizes, and other ZeRO configurations to tune are also configurable and can be overwritten by the user through the DeepSpeed configuration file. See [Configuring Tuning Scope](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#configuring-tuning-scope) for details. 
+Note that ZeRO stages, micro-batch sizes, and other ZeRO configurations to tune are also configurable and can be overwritten by the user through the DeepSpeed configuration file. See [Configuring Tuning Scope](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#configuring-tuning-scope) for details. ## Ease of use DeepSpeed Autotuning is easy to use, requiring no code change from DeepSpeed users. -Compared to the original training script (`deepspeed your_program.py --deepspeed ds_config.json`), invoking the autotuning feature in DeepSpeed only requires setting an `autotuning` flag after the DeepSpeed launcher (see [Usage](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#usage) for details), and adding `" autotuning": {"enabled": true}` to the DeepSpeed configuration file. Users can further tailor the autotuning process by changing the autotuning configuration in the DeepSpeed configuration JSON file (See [Autotuning Configuration](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#autotuning-configuration) for details). +Compared to the original training script (`deepspeed your_program.py --deepspeed ds_config.json`), invoking the autotuning feature in DeepSpeed only requires setting an `autotuning` flag after the DeepSpeed launcher (see [Usage](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#usage) for details), and adding `" autotuning": {"enabled": true}` to the DeepSpeed configuration file. Users can further tailor the autotuning process by changing the autotuning configuration in the DeepSpeed configuration JSON file (See [Autotuning Configuration](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#autotuning-configuration) for details). ## Example -We demonstrate the usage and benefit of autotuning using the training of a 0.77 billion parameter [GPT2-large model](https://huggingface.co/gpt2-large) from Hugging Face on 16 Nvidia V100 GPUs. For more examples, refer to [autotuning](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. Note that autotuning works with any DeepSpeed-accelerated model training, not limited to Hugging Face models. +We demonstrate the usage and benefit of autotuning using the training of a 0.77 billion parameter [GPT2-large model](https://huggingface.co/gpt2-large) from Hugging Face on 16 Nvidia V100 GPUs. For more examples, refer to [autotuning](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. Note that autotuning works with any DeepSpeed-accelerated model training, not limited to Hugging Face models. The model has: @@ -119,7 +119,7 @@ Note that the performance metric used in autotuning is calculated using the timi Tuning completed in 0:27:33.988447. Total number of experiments: 13. -As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. +As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. 
Examples in [Autotuning Hugging Face Examples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. ### DeepSpeed Autotuning with AzureML diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index 38af70b3f4b0..1bbfb687d812 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -13,10 +13,10 @@ The recommended and simplest method to try DeepSpeed on Azure is through [AzureM For AzureML v1 examples, please take a look at easy-to-use examples for Megatron-DeepSpeed, Transformers and CIFAR training [here](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). -> Our [Megatron-DeepSpeed](https://github.com/microsoft/megatron-deepspeed) contains the most up to date [recipe](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) for end-to-end training on AzureML. +> Our [Megatron-DeepSpeed](https://github.com/deepspeedai/megatron-deepspeed) contains the most up to date [recipe](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) for end-to-end training on AzureML. # DeepSpeed on Azure VMs If you don't have access to AzureML or if want to build a custom environments using [Azure virtual machines](https://azure.microsoft.com/en-us/services/virtual-machines/) or Azure VM Scale-Sets ([VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview)), we are working on easy-to-use cluster setup scripts that will be published in the next few weeks. -If you already have a cluster setup, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) that can easily be modified to train various model configurations. +If you already have a cluster setup, you can use the [azure recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) that can easily be modified to train various model configurations. diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md index f833acebde9a..efb8fa268e29 100755 --- a/docs/_tutorials/bert-finetuning.md +++ b/docs/_tutorials/bert-finetuning.md @@ -14,7 +14,7 @@ example (DeepSpeedExamples/training/BingBertSquad) we will be going over in the this tutorial. ```shell -git clone https://github.com/microsoft/DeepSpeed +git clone https://github.com/deepspeedai/DeepSpeed cd DeepSpeed git submodule update --init --recursive cd DeepSpeedExamples/training/BingBertSquad diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index 14789d3fda96..342918de958d 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -5,7 +5,7 @@ tags: training pre-training --- **Note:** -On 08/15/2022 we have added another BERT pre-training/fine-tuning example at [github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile), which includes a README.md that describes how to use it. 
Compared to the example described below, the new example in Megatron-DeepSpeed adds supports of ZeRO and tensor-slicing model parallelism (thus support larger model scale), uses a public and richer [Pile dataset](https://github.com/EleutherAI/the-pile) (user can also use their own data), together with some changes to the model architecture and training hyperparameters as described in [this paper](https://arxiv.org/abs/1909.08053). As a result, the BERT models trained by the new example is able to provide better MNLI results than original BERT, but with a slightly different model architecture and larger computation requirements. If you want to train a larger-scale or better quality BERT-style model, we recommend to follow the new example in Megatron-DeepSpeed. If your goal is to strictly reproduce the original BERT model, we recommend to follow the example under DeepSpeedExamples/bing_bert as described below. On the other hand, the tutorial below helps explaining how to integrate DeepSpeed into a pre-training codebase, regardless of which BERT example you use. +On 08/15/2022 we have added another BERT pre-training/fine-tuning example at [github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile), which includes a README.md that describes how to use it. Compared to the example described below, the new example in Megatron-DeepSpeed adds supports of ZeRO and tensor-slicing model parallelism (thus support larger model scale), uses a public and richer [Pile dataset](https://github.com/EleutherAI/the-pile) (user can also use their own data), together with some changes to the model architecture and training hyperparameters as described in [this paper](https://arxiv.org/abs/1909.08053). As a result, the BERT models trained by the new example is able to provide better MNLI results than original BERT, but with a slightly different model architecture and larger computation requirements. If you want to train a larger-scale or better quality BERT-style model, we recommend to follow the new example in Megatron-DeepSpeed. If your goal is to strictly reproduce the original BERT model, we recommend to follow the example under DeepSpeedExamples/bing_bert as described below. On the other hand, the tutorial below helps explaining how to integrate DeepSpeed into a pre-training codebase, regardless of which BERT example you use. {: .notice--info} In this tutorial we will apply DeepSpeed to pre-train the BERT @@ -26,7 +26,7 @@ We work from adaptations of [huggingface/transformers](https://github.com/huggingface/transformers) and [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples). We have forked this repo under -[DeepSpeedExamples/bing_bert](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) +[DeepSpeedExamples/bing_bert](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert) and made several modifications in their script: * We adopted the modeling code from NVIDIA's BERT under `bing_bert/nvidia/`. @@ -360,7 +360,7 @@ the scripts/json configs in our DeepSpeedExamples repo. Below is a table contain summary of the configurations. Specifically see the `ds_train_bert_bsz64k_seq128.sh` and `ds_train_bert_bsz32k_seq512.sh` scripts for more details in -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert). +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert). 
| Parameters | 128 Sequence | 512 Sequence | diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 8b4990d0431e..2bd06abf0e89 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -16,7 +16,7 @@ First we will go over how to run the original CIFAR-10 model. Then we will proce ## Running Original CIFAR-10 -Original model code from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/main/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/training/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/cifar) and made it available as a submodule. To download, execute: +Original model code from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/main/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/training/cifar/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/cifar) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive diff --git a/docs/_tutorials/comms-logging.md b/docs/_tutorials/comms-logging.md index 2719f08ad200..c4f6141a5b6c 100644 --- a/docs/_tutorials/comms-logging.md +++ b/docs/_tutorials/comms-logging.md @@ -64,7 +64,7 @@ The steps to add DeepSpeed communication log summaries are as follows: 2. (Optional) If your application contains `torch.distributed` calls that you wish to log, import `deepspeed.comm` package and modify `torch.distributed` calls to use `deepspeed.comm` (Note: The `deepspeed.comm` collective and pt2pt APIs exactly match `torch.distributed`) 3. Call `deepspeed.comm.log_summary` -For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) example: ```python # Step 2: (Optional) Import deepspeed.comm diff --git a/docs/_tutorials/curriculum-learning.md b/docs/_tutorials/curriculum-learning.md index 29f9417363f0..0b74945d3715 100644 --- a/docs/_tutorials/curriculum-learning.md +++ b/docs/_tutorials/curriculum-learning.md @@ -8,7 +8,7 @@ On 12/12/2022, we released DeepSpeed Data Efficiency Library which provides a mo {: .notice--warning} **Note:** -This tutorial was updated on 10/29/2021. Changes include: 1) A more detailed tuning strategy. 2) Pipeline parallelism support. 3) Token-based learning rate decay. 4) A new GPT-2 example at [github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). See details below. +This tutorial was updated on 10/29/2021. Changes include: 1) A more detailed tuning strategy. 2) Pipeline parallelism support. 3) Token-based learning rate decay. 4) A new GPT-2 example at [github.com/deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). See details below. {: .notice--info} In this tutorial, we introduce DeepSpeed's curriculum learning-based data pipeline, which presents easier or simpler examples earlier during training. By enabling stable training with 8x/4x larger batch size/learning rate (whereas the baseline approach struggles with training divergence), we observe that curriculum learning (based on sequence length) provides stable and 3.3x faster GPT-2 pre-training (tested on 117M and 1.5B parameters), together with better token-wise convergence speed and zero-shot WikiText-103/LAMBADA evaluation results. 
In addition, since curriculum learning only affects the data pipeline, its benefit is complementary to many DeepSpeed features and other system optimization techniques. For example, curriculum learning is compatible with DeepSpeed's [ZeRO Redundancy Optimizer](/tutorials/zero/), [ZeRO-Offload](/tutorials/zero-offload/), and [3D Parallelism](/tutorials/pipeline/). @@ -114,17 +114,17 @@ After the update on 10/29/2021, now there are two curriculum learning examples f We provide two curriculum learning examples for Megatron-LM GPT-2 pre-training: -The first one is at [Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning). This integration is based on a newer Megatron-LM fork, and only this curriculum learning example supports pipeline parallelism. However, as of 10/29/2021, we haven't verified ZeRO-2 and ZeRO-3 on this fork. Overall, we highly recommend you to use this example if your model does not require ZeRO-2/3. +The first one is at [Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning). This integration is based on a newer Megatron-LM fork, and only this curriculum learning example supports pipeline parallelism. However, as of 10/29/2021, we haven't verified ZeRO-2 and ZeRO-3 on this fork. Overall, we highly recommend you to use this example if your model does not require ZeRO-2/3. -The second one is at [DeepSpeedExamples/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning). This integration is based on an older Megatron-LM hard copy that we will eventually deprecate and this curriculum learning example does not support pipeline parallelism. We recommend you to ONLY use this example if your model requires ZeRO-2/3. +The second one is at [DeepSpeedExamples/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning). This integration is based on an older Megatron-LM hard copy that we will eventually deprecate and this curriculum learning example does not support pipeline parallelism. We recommend you to ONLY use this example if your model requires ZeRO-2/3. Besides the DeepSpeed curriculum learning json configurations described above, there are some other necessary changes on the user side to integrate curriculum learning: ### 2.1 Training data truncation -To enable `seqlen`-based curriculum learning, we need to add the functionality of training data truncation based on the given curriculum sequence length. For the case without pipeline parallelism, it is necessary to add a `curriculum_seqlen` argument in the model's forward pass and use it to perform training data sequence length truncation. For Megatron-LM GPT-2 pre-training, we implement this in `forward()` in [megatron/model/gpt2_model.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/megatron/model/gpt2_model.py) and in `forward_step()` in [pretrain_gpt2.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/pretrain_gpt2.py). +To enable `seqlen`-based curriculum learning, we need to add the functionality of training data truncation based on the given curriculum sequence length. 
For the case without pipeline parallelism, it is necessary to add a `curriculum_seqlen` argument in the model's forward pass and use it to perform training data sequence length truncation. For Megatron-LM GPT-2 pre-training, we implement this in `forward()` in [megatron/model/gpt2_model.py](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/megatron/model/gpt2_model.py) and in `forward_step()` in [pretrain_gpt2.py](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/pretrain_gpt2.py). -For the case with pipeline parallelism, due to DeepSpeed engine limitations we cannot inject the `curriculum_seqlen` argument in the forward pass. Instead, we create a duplicate of `deepspeed.runtime.data_pipeline.curriculum_scheduler` on the user side, and use it to retrieve the `curriculum_seqlen`. This implementation can be found in [megatron/training.py](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/training.py). +For the case with pipeline parallelism, due to DeepSpeed engine limitations we cannot inject the `curriculum_seqlen` argument in the forward pass. Instead, we create a duplicate of `deepspeed.runtime.data_pipeline.curriculum_scheduler` on the user side, and use it to retrieve the `curriculum_seqlen`. This implementation can be found in [megatron/training.py](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/megatron/training.py). ### 2.2 Disable batch size warmup (`--rampup-batch-size`) In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that curriculum learning (`seqlen`-based) provides much better training stability than the batch size warmup technique introduced by Open AI GPT-3. So when using curriculum learning you need to remove the `--rampup-batch-size` config in your training script. It's not recommended using both curriculum learning and batch size warmup, because both of them reduce the number of tokens in a batch. Another related change you might want is to increase your micro batch size, since without batch size warmup your batch size will be fixed now. diff --git a/docs/_tutorials/data-efficiency.md b/docs/_tutorials/data-efficiency.md index 9ea3a33dab92..b49974f1fa78 100644 --- a/docs/_tutorials/data-efficiency.md +++ b/docs/_tutorials/data-efficiency.md @@ -20,7 +20,7 @@ Curriculum learning has been successfully applied to various training tasks (see ### 1.3 How to use Curriculum Learning #### 1.3.1 GPT-3 and BERT pretraining -The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply curriculum learning to GPT-3 and BERT pretraining. There are 3 steps: data analysis, pretraining, and eval/finetuning. +The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to apply curriculum learning to GPT-3 and BERT pretraining. There are 3 steps: data analysis, pretraining, and eval/finetuning. **Data analysis:** Curriculum learning requires a data analysis before pretraining that calculate the difficulty of each data sample (based on the metric provided by user), and build an index that map difficulty value to corresponding data samples. (There are exceptions: for example the truncation-based sequence length metric can be achieved by data postprocessing without data analysis.) We provide a data analyzer to perform the offline CPU-only data analysis. 
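To make the offline analysis step concrete, here is a minimal, illustrative sketch (not DeepSpeed's actual data analyzer): it scores every sample with a user-supplied difficulty metric and builds the difficulty-to-samples index that the curriculum scheduler later consumes, walking from easy to hard. The token-count metric, toy dataset, and output file name below are placeholders.

```python
from collections import defaultdict
import json

def build_difficulty_index(dataset, difficulty_fn):
    """Toy offline analysis: map each difficulty value to the indices of the
    samples that have that difficulty, using a user-supplied metric."""
    index = defaultdict(list)
    for sample_idx, sample in enumerate(dataset):
        index[difficulty_fn(sample)].append(sample_idx)
    return dict(index)

if __name__ == "__main__":
    # Placeholder dataset and metric (token count); metrics such as vocabulary
    # rarity are computed the same way, just with a different scoring function.
    toy_dataset = [{"tokens": list(range(n))} for n in (8, 8, 16, 32)]
    index = build_difficulty_index(toy_dataset, lambda s: len(s["tokens"]))
    with open("difficulty_index.json", "w") as f:
        json.dump(index, f)  # consumed later during pretraining
```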
@@ -31,7 +31,7 @@ The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed re **Eval/finetuning** `examples_deepspeed/data_efficiency/gpt/eval/` and `examples_deepspeed/data_efficiency/bert/finetune` include the example scripts for GPT-3 model's zero-/few-shot evaluation and BERT model's finetuning. Our [paper](https://arxiv.org/abs/2212.03597) includes the reference eval/finetune results if you follow our example scripts to perform the pretraining/eval/finetuning. #### 1.3.2 GPT-2 finetuning -The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply curriculum learning to GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. For CL metrics that require data analysis (e.g., the vocabulary rarity metric), you need to first use ```data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_*``` to analyze and index the dataset, similar to the GPT-3 pre-training case described above in 1.3.1. +The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to apply curriculum learning to GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. For CL metrics that require data analysis (e.g., the vocabulary rarity metric), you need to first use ```data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_*``` to analyze and index the dataset, similar to the GPT-3 pre-training case described above in 1.3.1. ## 2. Random layerwise token dropping (random-LTD) @@ -44,14 +44,14 @@ When you want to pretrain/fine-tune a transformer-based model, it is always a go ### 2.3 How to use random-LTD #### 2.3.1 GPT-3 and BERT pretraining -The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply random-LTD to GPT-3 and BERT pretraining. +The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to apply random-LTD to GPT-3 and BERT pretraining. `examples_deepspeed/data_efficiency/gpt/pretrain` and `examples_deepspeed/data_efficiency/bert/pretrain` include the example pretraining scripts with random-LTD feature. Several changes are needed to enable random-LTD during pretraining: (1) User need to provide a DeepSpeed json config file which includes configurations for random-LTD (see [list of configuration](/docs/config-json/#data-efficiency) for details). We provide tested example configurations in `examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh` and `examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh`. (2) After initializing the DeepSpeed engine via `deepspeed.initialize`, user needs to use the `convert_to_random_ltd` API to convert and wrap the model layers in order to enable the random-LTD feature. We provide an example implementation of this change in `megatron/training.py` function `setup_model_and_optimizer`. (3) In order for random-LTD to understand the input argument mapping of the forward function, user need to change all the input arguments (except the hidden_states input) into keyword/named argument. 
For example, in `megatron/model/transformer.py` we changed the forward function from `def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):` to `def forward(self, hidden_states, attention_mask=None, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):`. (4) When saving model checkpoints, (especially if the state dictionary has non-traditional structure) user needs to use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to original model layers. We provide an example implementation of this change in `megatron/model/language_model.py`. For eval/finetuning of the pretrained model, see [previous section](#131-gpt-3-and-bert-pretraining) about how to use our example scripts. #### 2.3.2 GPT-2 and ViT finetuning -The `data_efficiency` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply random-LTD to GPT-2 and ViT finetuning. +The `data_efficiency` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to apply random-LTD to GPT-2 and ViT finetuning. Just like pretraining case, similar changes are required to enable random-LTD for finetuning: (1) DeepSpeed json config file. (2) Use the `convert_to_random_ltd` API to convert and wrap the model layers. (3) When saving model checkpoints, use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to original model layers. @@ -92,9 +92,9 @@ iter 5474 | LR [0.0001]| val_acc 97.97000122070312 | layer_token 305784192 ## 3. Composing curriculum learning and random-LTD to achieve more ### 3.1 GPT-3 and BERT pretraining -The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to compose curriculum learning random-LTD, and apply both of them to GPT-3 and BERT pretraining. +The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to compose curriculum learning random-LTD, and apply both of them to GPT-3 and BERT pretraining. The changes needed are the same as described in previous two sections, since DeepSpeed Data Efficiency already handles the complexity when composing the two techniques. However, one thing to note is that since both random-LTD and some of the curriculum learning metrics will change the sequence length, it could require some extra code to calculate the effective sequence length at each step. We provide an example implementation of this change in `megatron/training.py` function `train` where we calculate the `actual_seq_length`. #### 3.2 GPT-2 finetuning -The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to compose curriculum learning random-LTD for GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. +The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to compose curriculum learning random-LTD for GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. 
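The random-LTD integration steps above boil down to a small amount of glue code. A rough sketch follows, assuming the helpers are importable from DeepSpeed's data-pipeline module; the import path, the `MyTransformerLayer` class, and the pre-built `model` are assumptions for illustration, and the tutorial's reference implementation is in Megatron-DeepSpeed's `megatron/training.py` and `megatron/model/language_model.py`.

```python
import deepspeed
# Assumed import path -- verify against your DeepSpeed version.
from deepspeed.runtime.data_pipeline.data_routing.helper import (
    convert_to_random_ltd,
    remove_random_ltd_state_dict,
)

# Step (2): initialize the engine, then wrap the transformer layers so that
# random-LTD can drop a random subset of tokens in the wrapped layers.
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,                      # assumed: your transformer model
    model_parameters=model.parameters(),
    config="ds_config.json",          # must contain the random_ltd section
)
engine = convert_to_random_ltd(engine, MyTransformerLayer)  # assumed layer class

# ... training loop ...

# Step (4): strip the random-LTD wrappers before checkpointing so the saved
# state dict matches the original (unwrapped) model layers.
clean_state_dict = remove_random_ltd_state_dict(engine.module.state_dict())
```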
diff --git a/docs/_tutorials/deepnvme.md b/docs/_tutorials/deepnvme.md index 4ed528412eae..a6d4545815dc 100644 --- a/docs/_tutorials/deepnvme.md +++ b/docs/_tutorials/deepnvme.md @@ -2,10 +2,10 @@ title: "DeepNVMe" tags: training inference IO large-model --- -This tutorial will show how to use [DeepNVMe](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md) for data transfers between persistent storage and tensors residing in host or device memory. DeepNVMe improves the performance and efficiency of I/O operations in Deep Learning applications through powerful optimizations built on Non-Volatile Memory Express (NVMe) Solid State Drives (SSDs), Linux Asynchronous I/O (`libaio`), and NVIDIA Magnum IOTM GPUDirect® Storage (GDS). +This tutorial will show how to use [DeepNVMe](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md) for data transfers between persistent storage and tensors residing in host or device memory. DeepNVMe improves the performance and efficiency of I/O operations in Deep Learning applications through powerful optimizations built on Non-Volatile Memory Express (NVMe) Solid State Drives (SSDs), Linux Asynchronous I/O (`libaio`), and NVIDIA Magnum IOTM GPUDirect® Storage (GDS). ## Requirements -Ensure your environment is properly configured to use DeepNVMe. First, you need to install DeepSpeed version >= [0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0). Next, ensure that the DeepNVMe operators are available in the DeepSpeed installation. The `async_io` operator is required for any DeepNVMe functionality, while the `gds` operator is required only for GDS functionality. You can confirm availability of each operator by inspecting the output of `ds_report` to check that compatible status is [OKAY]. Below is a snippet of `ds_report` output confirming the availability of both `async_io` and `gds` operators. +Ensure your environment is properly configured to use DeepNVMe. First, you need to install DeepSpeed version >= [0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0). Next, ensure that the DeepNVMe operators are available in the DeepSpeed installation. The `async_io` operator is required for any DeepNVMe functionality, while the `gds` operator is required only for GDS functionality. You can confirm availability of each operator by inspecting the output of `ds_report` to check that compatible status is [OKAY]. Below is a snippet of `ds_report` output confirming the availability of both `async_io` and `gds` operators. ![deepnvme_ops_report](/assets/images/deepnvme_ops_report.png) @@ -171,15 +171,15 @@ True ## Putting it together We hope that the above material helps you to get started with DeepNVMe. You can also use the following links to see DeepNVMe usage in real-world Deep Learning applications. -1. [Parameter swapper](https://github.com/microsoft/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py#L111-L117) in [ZeRO-Inference](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) and [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). -2. 
[Optimizer swapper](https://github.com/microsoft/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L36-L38) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). -3. [Gradient swapper](https://github.com/microsoft/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L41-L43) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). -4. Simple file read and write [operations](https://github.com/microsoft/DeepSpeedExamples/blob/master/deepnvme/file_access/README.md). +1. [Parameter swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py#L111-L117) in [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) and [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +2. [Optimizer swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L36-L38) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +3. [Gradient swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L41-L43) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +4. Simple file read and write [operations](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/deepnvme/file_access/README.md). - +2. [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/): used for offloading [parameters](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py#L111-L117), [gradients](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L41-L43), and [optimizer](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L36-L38). +3. Simple file read and write [operations](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/deepnvme/file_access/README.md). --> ## Acknowledgements diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md index 6b116cb87463..e1cb704fc229 100644 --- a/docs/_tutorials/domino.md +++ b/docs/_tutorials/domino.md @@ -3,4 +3,4 @@ title: "Domino" tags: training --- -Domino achieves near-complete communication hiding behind computation for tensor parallel training. Please find our [Domino-tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in DeepSpeedExample repo. +Domino achieves near-complete communication hiding behind computation for tensor parallel training. 
Please find our [Domino-tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in DeepSpeedExample repo. diff --git a/docs/_tutorials/ds-sequence.md b/docs/_tutorials/ds-sequence.md index 815b99d6de35..ffcb94946e6a 100755 --- a/docs/_tutorials/ds-sequence.md +++ b/docs/_tutorials/ds-sequence.md @@ -3,7 +3,7 @@ title: "Getting Started with DeepSpeed-Ulysses for Training Transformer Models w tags: training --- -In this tutorial we describe how to enable DeepSpeed-Ulysses. DeepSpeed-Ulysses is a simple but highly communication and memory efficient mechanism sequence parallelism approach for training of large transformer models with massive sequence lengths. It partitions input tensors along the sequence dimension and uses a communication-efficient all-2-all collective for distributed attention computations. Additionally, DeepSpeed-Ulysses incorporates advanced modeling and system optimizations, such as Flash attention, sparse attention, and ZeRO optimizer, to optimize both computational efficiency and memory usage. Training with DeepSpeed sequence parallelism allows both model size and sequence length to scale near indefinitely unbounded by single GPU memory limitation and at a high fraction of peak compute performance. Currently, DeepSpeed-Ulysses can handle sequences up to 1 million in length (10 times the size of a complete Harry Potter book!) on 64 A100 GPUs. Please read our [DeepSpeed-Ulysses blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) to learn more! +In this tutorial we describe how to enable DeepSpeed-Ulysses. DeepSpeed-Ulysses is a simple but highly communication and memory efficient mechanism sequence parallelism approach for training of large transformer models with massive sequence lengths. It partitions input tensors along the sequence dimension and uses a communication-efficient all-2-all collective for distributed attention computations. Additionally, DeepSpeed-Ulysses incorporates advanced modeling and system optimizations, such as Flash attention, sparse attention, and ZeRO optimizer, to optimize both computational efficiency and memory usage. Training with DeepSpeed sequence parallelism allows both model size and sequence length to scale near indefinitely unbounded by single GPU memory limitation and at a high fraction of peak compute performance. Currently, DeepSpeed-Ulysses can handle sequences up to 1 million in length (10 times the size of a complete Harry Potter book!) on 64 A100 GPUs. Please read our [DeepSpeed-Ulysses blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses) to learn more! ## 1. Installation @@ -12,10 +12,10 @@ You will need to install DeepSpeed v0.10.2 or higher to use the DeepSpeed Sequen ## 2. How to use DeepSpeed-Ulysses in your application? -Integrating DS-Seq into your training code is easy, and in this section we describe how to integrate DeepSpeed-Ulysses through our [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) code repo. +Integrating DS-Seq into your training code is easy, and in this section we describe how to integrate DeepSpeed-Ulysses through our [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) code repo. -* **Replace attention module**: First, you need to update your attention module with DeepSpeed-Ulysses DistributedAttention. 
Here, we use the attention from [Megatron-DeepSpeed ](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/model/transformer.py) which is the causal attention used in GPT-3 like model training. Rewrite the attention block: +* **Replace attention module**: First, you need to update your attention module with DeepSpeed-Ulysses DistributedAttention. Here, we use the attention from [Megatron-DeepSpeed ](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/megatron/model/transformer.py) which is the causal attention used in GPT-3 like model training. Rewrite the attention block: ```python def __init__(): diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 24efc238615a..d4a7496405b9 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -184,7 +184,7 @@ When using DeepSpeed for model training, the profiler can be configured in the d #### Example: Megatron-LM -For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/megatron/Megatron-LM). +For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/megatron/Megatron-LM). An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attention_heads = 32, batch_size = 1024, seq_length = 1024`) is shown below. diff --git a/docs/_tutorials/gan.md b/docs/_tutorials/gan.md index 09572a439eb0..db3734fb3b96 100755 --- a/docs/_tutorials/gan.md +++ b/docs/_tutorials/gan.md @@ -16,7 +16,7 @@ Please go through the [original tutorial](https://pytorch.org/tutorials/beginner ## Enabling DeepSpeed -The codes may be obtained [here](https://github.com/microsoft/DeepSpeedExamples/tree/master/gan). +The codes may be obtained [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/gan). ### Argument Parsing diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index 29c904da9820..1d5899204f53 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -3,17 +3,17 @@ title: "Getting Started with DeepSpeed for Inferencing Transformer based Models" tags: inference --- ->**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** +>**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** DeepSpeed-Inference introduces several features to efficiently serve transformer-based PyTorch models. It supports model parallelism (MP) to fit large models that would otherwise not fit in GPU memory. Even for smaller models, MP can be used to reduce latency for inference. To further reduce latency and cost, we introduce inference-customized kernels. Finally, we propose a novel approach to quantize models, called MoQ, to both shrink the model and reduce the inference cost at production. 
For more details on the inference related optimizations in DeepSpeed, please refer to our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/). -DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). +DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). ## Initializing for Inference For inference with DeepSpeed, use `init_inference` API to load the model for inference. Here, you can specify the MP degree, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or the checkpoint path. -To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True for the compatible models. For models not supported by DeepSpeed, the users can submit a PR that defines a new policy in [replace_policy class](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The policy classes in DeepSpeed create a mapping between the parameters of the original user-supplied layer implementation with DeepSpeed's inference-optimized Transformer layer. +To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True for the compatible models. For models not supported by DeepSpeed, the users can submit a PR that defines a new policy in [replace_policy class](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The policy classes in DeepSpeed create a mapping between the parameters of the original user-supplied layer implementation with DeepSpeed's inference-optimized Transformer layer. 
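To make the initialization step concrete, below is a minimal sketch of loading a compatible Hugging Face model into the inference engine with kernel injection enabled. The model name, MP degree, and dtype are illustrative choices rather than the tutorial's exact example, and newer DeepSpeed releases configure the parallelism degree through a tensor-parallel config instead of `mp_size`.

```python
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # illustrative kernel-injection-compatible model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model with the DeepSpeed-Inference engine; kernel injection swaps
# compatible layers for the inference-optimized transformer kernels.
ds_engine = deepspeed.init_inference(
    model,
    mp_size=1,                        # model-parallel (tensor-slicing) degree
    dtype=torch.half,
    replace_with_kernel_inject=True,
)
model = ds_engine.module

# Requires a CUDA device; the injected kernels target GPU inference.
inputs = tokenizer("DeepSpeed is", return_tensors="pt").to(torch.cuda.current_device())
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```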
```python # create the model diff --git a/docs/_tutorials/large-models-w-deepspeed.md b/docs/_tutorials/large-models-w-deepspeed.md index 8e09cccee1fe..3d0bae0144b4 100644 --- a/docs/_tutorials/large-models-w-deepspeed.md +++ b/docs/_tutorials/large-models-w-deepspeed.md @@ -28,7 +28,7 @@ Since, ZeRO is a replacement to data parallelism, it offers a seamless integrati ## Deciding which technology to use -**3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/microsoft/megatron-deepspeed) repo. For more information on 3D parallelism please checkout the resources below: +**3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/deepspeedai/megatron-deepspeed) repo. For more information on 3D parallelism please checkout the resources below: [3D Parallelism Tutorial](https://www.deepspeed.ai/tutorials/pipeline/) A generic tutorial on how to port your model to use DeepSpeed 3D parallelism diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index 5242c8184db8..286a9a36a926 100644 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -19,7 +19,7 @@ reduction_** from using DeepSpeed. ## Training GPT-2 with the Original Megatron-LM -We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/microsoft/Megatron-DeepSpeed) and made it available as a submodule. To download, execute: +We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/deepspeedai/Megatron-DeepSpeed) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive ``` diff --git a/docs/_tutorials/mixed_precision_zeropp.md b/docs/_tutorials/mixed_precision_zeropp.md index 12ad3556abde..9429b75bac10 100644 --- a/docs/_tutorials/mixed_precision_zeropp.md +++ b/docs/_tutorials/mixed_precision_zeropp.md @@ -3,7 +3,7 @@ title: "Mixed Precision ZeRO++" tags: training ZeRO communication-efficiency large-model --- -Mixed Precision ZeRO++ (MixZ++) is a set of optimization strategies based on [ZeRO](/tutorials/zero/) and [ZeRO++](/tutorials/zeropp/) to improve the efficiency and reduce memory usage for large model training and inference when users use [Low-Rank Adaptation (LoRA)]([/tutorials/zero/](https://arxiv.org/abs/2106.09685)) training. 
MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by up to [3.3x](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31) for the Llama-2-70B model running on 128 V100 GPUs. Read our [DeepSpeed Chat Blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31), [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) and [paper](https://arxiv.org/pdf/2306.10209.pdf) to learn more! +Mixed Precision ZeRO++ (MixZ++) is a set of optimization strategies based on [ZeRO](/tutorials/zero/) and [ZeRO++](/tutorials/zeropp/) to improve the efficiency and reduce memory usage for large model training and inference when users use [Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685) training. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by up to [3.3x](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31) for the Llama-2-70B model running on 128 V100 GPUs. Read our [DeepSpeed Chat Blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31), [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) and [paper](https://arxiv.org/pdf/2306.10209.pdf) to learn more! We recommend that you read the tutorials on [Getting Started](/getting-started/), [ZeRO](/tutorials/zero/) and [Megatron-DeepSpeed](/tutorials/megatron/) before stepping through this tutorial. @@ -16,7 +16,7 @@ Collectively, the optimizations bring better scalability and efficiency to LoRA ## Enabling Mixed Precision ZeRO++ (MixZ++) -A ready to go MixZ++ example has been prepared at [MixZ++ example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh). If you prefer to manually enable MixZ++ in your pipeline, please refer to the instructions below. +A ready to go MixZ++ example has been prepared at [MixZ++ example script](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh). If you prefer to manually enable MixZ++ in your pipeline, please refer to the instructions below. ### DeepSpeed Configuration Changes An example snippet of deepspeed configurations with all MixZ++ optimization enabled is shown below: diff --git a/docs/_tutorials/mixture-of-experts-inference.md b/docs/_tutorials/mixture-of-experts-inference.md index 882ad7aefd1f..675815dd5d57 100644 --- a/docs/_tutorials/mixture-of-experts-inference.md +++ b/docs/_tutorials/mixture-of-experts-inference.md @@ -54,7 +54,7 @@ output = model('Input String') Here, we show a text-generation example using an MoE model for which we can specify the model-parallel size and number of experts.
DeepSpeed inference-engine takes care of creating the different parallelism groups using the tensor-slicing degree, number of experts, and the total number of GPUs used for running the MoE model. Regarding the expert parameters, we first use the expert-parallelism to assign each group of experts to one GPU. If number of GPUs is higher than number of experts, we use expert-slicing to partition each expert vertically/horizontally across the GPUs. -Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/generate_text.sh) for a complete generate-text inference example. +Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/examples_deepspeed/generate_text.sh) for a complete generate-text inference example. ```bash diff --git a/docs/_tutorials/mixture-of-experts-nlg.md b/docs/_tutorials/mixture-of-experts-nlg.md index 6fc7022ba1fb..c4fb072dd82d 100755 --- a/docs/_tutorials/mixture-of-experts-nlg.md +++ b/docs/_tutorials/mixture-of-experts-nlg.md @@ -7,7 +7,7 @@ In this tutorial, we introduce how to apply DeepSpeed Mixture of Experts (MoE) t ## 1. Installation -You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) repo under the MoE folder. +You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) repo under the MoE folder. ## 2. Training NLG+MoE models @@ -15,7 +15,7 @@ You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The To apply MoE to the GPT-style model, we made several changes in Megatron framework, mostly in `megatron/model/` where we add the MoE layers into the model. ### 2.2. Pre-training the Standard MoE model -We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). There are a few new hyperparameters for standard MoE model: +We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). There are a few new hyperparameters for standard MoE model: `--num-experts`: the number of experts per MoE layer. In our experiments we set it to 128. Larger number of experts tend to provide better convergence, but it's a diminishing return. @@ -30,7 +30,7 @@ We provide example training scripts under [examples_deepspeed/MoE](https://githu ### 2.3. Pre-training the PR-MoE model -PR-MoE is a new designed MoE models, standing for Pyramid-Residual-MoE, which improves the parameter efficiency up to 3x as compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). 
There are a few different hyperparameters for PR-MoE model compared to standard MoE: +PR-MoE is a new designed MoE models, standing for Pyramid-Residual-MoE, which improves the parameter efficiency up to 3x as compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). There are a few different hyperparameters for PR-MoE model compared to standard MoE: `--num-experts`: Instead of providing a single number, to enable Pyramid-MoE, you need to provide a list, whose length is the same as the number of MoE layers. We suggest to use more experts in the latter stage (close to output) of the model. @@ -67,4 +67,4 @@ MoS, standing for Mixture-of-Students, is a staged distillation-based technique In addition to the new parameters above, we observe that using the teacher PR-MoE during the entire training process may adversely impact the final student model accuracy. In our experiments, we use a staged distillation method by stopping distillation early in the training process (e.g., after 400K steps) and perform optimization only against the standard language modeling loss for the rest of the training. -We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). +We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). diff --git a/docs/_tutorials/mixture-of-experts.md b/docs/_tutorials/mixture-of-experts.md index a2260d98e49e..d4604b929ff4 100644 --- a/docs/_tutorials/mixture-of-experts.md +++ b/docs/_tutorials/mixture-of-experts.md @@ -13,7 +13,7 @@ For more details on results and further discussion, please see our press release {: .notice--info} As a simple starting point we will show how to apply DeepSpeed MoE to a cifar10 example. Please refer to -our [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) going forward. +our [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) going forward. If you are adding MoE to an existing model you can use the snippet below to help guide you: @@ -104,11 +104,11 @@ fc4 = torch.nn.Linear(84, 10) ``` -For a runnable end-to-end example that covers both the standard MoE architecture as well as the PR-MoE model , please look at the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar). In addition, see the advanced usage section of this tutorial that links to a more comprehensive example for NLG models. 
+For a runnable end-to-end example that covers both the standard MoE architecture as well as the PR-MoE model , please look at the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar). In addition, see the advanced usage section of this tutorial that links to a more comprehensive example for NLG models. ### Combining ZeRO-Offload and DeepSpeed MoE for very large models -To use MoE Layers in DeepSpeed, we rely on two parameter groups that are passed to an optimizer. A concrete example to create such groups is available from the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar). +To use MoE Layers in DeepSpeed, we rely on two parameter groups that are passed to an optimizer. A concrete example to create such groups is available from the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar). The relevant function that creates these param groups is as follows. @@ -134,7 +134,7 @@ model_engine, optimizer, trainloader, __ = deepspeed.initialize( We are working on automating this functionality in the DeepSpeed ZeRO optimizer so the model training code can be simplified further. -To run the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) with ZeRO-Offload (stage 2) and MoE, please set the ds_config flags +To run the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) with ZeRO-Offload (stage 2) and MoE, please set the ds_config flags ```json "zero_optimization": { diff --git a/docs/_tutorials/model-compression.md b/docs/_tutorials/model-compression.md index c8713cb1f616..d11eadc3d726 100644 --- a/docs/_tutorials/model-compression.md +++ b/docs/_tutorials/model-compression.md @@ -25,7 +25,7 @@ If the model is very deep, you may consider using this method. It works much bet Layer reduction can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#layer-reduction)). Users have the freedom to select any depth by `keep_number_layer` and any subset of the network layers by `teacher_layer`. In addition, users also can choose whether to reinitialize the input/output layers from the given model (teacher model) by `other_module_name`. -To apply layer reduction for task-specific compression, we provide an example on how to do so for BERT fine-tuning. Layer reduction is about resetting the depth of network architecture and reinitialization of weight parameters, which happens before the training process. The example includes the following changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): +To apply layer reduction for task-specific compression, we provide an example on how to do so for BERT fine-tuning. Layer reduction is about resetting the depth of network architecture and reinitialization of weight parameters, which happens before the training process. The example includes the following changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)): (1) When initial the model, the number of layers in the model config should be the same as `keep_number_layer` in DeepSpeed config JSON file. For Hugging Face BERT example, set `config.num_hidden_layers = ds_config["compression_training"]["layer_reduction"]["keep_number_layer"]`. 
@@ -33,7 +33,7 @@ To apply layer reduction for task-specific compression, we provide an example on (3) During training, if KD is not used, nothing needs to be done. Otherwise, one needs to consider applying KD with the `teacher_layer` JSON configuration when calculating the difference between teacher’s and student’s output. -One can run our layer reduction example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our layer reduction example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -49,7 +49,7 @@ Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.834029 To apply layer reduction for task-agnostic compression, we provide an example on how to do so in the GPT pre-training stage. -Step 1: Obtain the latest version of the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). +Step 1: Obtain the latest version of the [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). Step 2: Enter `Megatron-DeepSpeed/examples_deepspeed/compression` directory. @@ -97,13 +97,13 @@ Weight quantization can be enabled and configured using the DeepSpeed config JSO (4)`start_bit` and `target_bit`, to simplify the first experiment we suggest to set them the same such that we apply quantization to the target bit once the iteration reaches `schedule_offset`. -There are two changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): +There are two changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)): (1) After initialization of the model, apply `init_compression` function to the model with DeepSpeed JSON configurations. (2) After training, apply `redundancy_clean` function to save the quantized weight. -One can run our weight quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our weight quantization example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -130,13 +130,13 @@ It can improve computation efficiency similar to [weight quantization](#12-weigh Activation quantization can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#activation-quantization)). Some of the components are same as weight quantization, such as `schedule_offset` and `quantization_type`. The key configurations we would like to point out are: -(1)`range_calibration`, user has option to set dynamic or static. When using “dynamic”, the activation quantization groups will be automatically set to be token-wise (for Transformer-based models) and image-wise (for CNN-based models). See more in [our ZeroQuant paper](https://arxiv.org/abs/2206.01861) and the code (`deepspeed/compression/basic_layer.py` in [DeepSpeed](https://github.com/microsoft/DeepSpeed)). +(1)`range_calibration`, user has option to set dynamic or static. When using “dynamic”, the activation quantization groups will be automatically set to be token-wise (for Transformer-based models) and image-wise (for CNN-based models). 
See more in [our ZeroQuant paper](https://arxiv.org/abs/2206.01861) and the code (`deepspeed/compression/basic_layer.py` in [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)). (2)`aq1`/`aq2`, users can expand more groups such as `aq3`, `aq4`, etc. The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our activation quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our activation quantization example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -188,7 +188,7 @@ Sparse pruning can be enabled and configured using the DeepSpeed config JSON fil The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our sparse pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our sparse pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -223,7 +223,7 @@ Row pruning can be enabled and configured using the DeepSpeed config JSON file ( The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our row pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our row pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -260,7 +260,7 @@ Head pruning can be enabled and configured using the DeepSpeed config JSON file The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our head pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our head pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -286,7 +286,7 @@ Channel pruning is a feature designed for two back-to-back CONV2d layers (e.g., Channel pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#channel-pruning)). -One can run our channel pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our channel pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell pip install torch torchvision @@ -315,7 +315,7 @@ When you want to quantize the transformer-based model to INT8 or INT4/INT8 forma **How to use ZeroQuant** -One can run our BERT example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our BERT example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -363,7 +363,7 @@ If you want to significantly compress your models while retaining competitive pe **How to use XTC** -**Installation:** Examples of XTC extreme compression for BERT models are at `compression/bert/bash_script/XTC` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). 
You will need to install the requirements by: +**Installation:** Examples of XTC extreme compression for BERT models are at `compression/bert/bash_script/XTC` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). You will need to install the requirements by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -373,7 +373,7 @@ DeepSpeedExamples/compression/bert$ pip install -r requirements.txt To accommodate users who do not have a fine-tuned model or task-specific model for compression, with the arg `--model_name_or_path yoshitomo-matsubara/bert-base-uncased-${TASK_NAME}` our python script `run_glue_no_trainer.py` automatically downloads the models from Hugging Face. Users can also use their own models with better accuracy as the teacher and the student model initialization. ### 3.1 One-bit or Two-bit BERT-base (12-layer) with 8-bit activation quantization -For the configurations, see `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). In our paper, we used FP32 (`"fp16": {"enabled": false}`) to perform training, while directly applying 8-bit quantization (`"bits": 8`) to the activations and 1-bit quantization (`"start_bits": 1, "target_bits": 1`) to the attention (query, key, val) and feedforward weight matrices (`"modules": ["attention.self", "intermediate", "output.dense"]`) at the beginning of the training (`"schedule_offset": 0`). In addition, we also apply 1-bit quantization to `word_embeddings` as weight quantization. +For the configurations, see `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). In our paper, we used FP32 (`"fp16": {"enabled": false}`) to perform training, while directly applying 8-bit quantization (`"bits": 8`) to the activations and 1-bit quantization (`"start_bits": 1, "target_bits": 1`) to the attention (query, key, val) and feedforward weight matrices (`"modules": ["attention.self", "intermediate", "output.dense"]`) at the beginning of the training (`"schedule_offset": 0`). In addition, we also apply 1-bit quantization to `word_embeddings` as weight quantization. One can run this example by: @@ -387,7 +387,7 @@ And the final result is: Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8293428425878757/0.8396053702196908 ``` -The other important feature we would like to mention is the `quantize_groups` inside `weight_quantization`, which is set to be 1 here to match our XTC paper's FP32 training setup. We find that under FP16 training, smaller number of quantization group (e.g., 1 or 2) could lead to unstable training. Thus, we recommend using larger number of groups (e.g., 64) under FP16. `compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the FP16 example configurations, where `"fp16": {"enabled": true}` and `"weight_quantization": {"shared_parameters": {"quantize_weight_in_forward": false}}` are different from FP32 case. +The other important feature we would like to mention is the `quantize_groups` inside `weight_quantization`, which is set to be 1 here to match our XTC paper's FP32 training setup. We find that under FP16 training, smaller number of quantization group (e.g., 1 or 2) could lead to unstable training. Thus, we recommend using larger number of groups (e.g., 64) under FP16. 
`compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the FP16 example configurations, where `"fp16": {"enabled": true}` and `"weight_quantization": {"shared_parameters": {"quantize_weight_in_forward": false}}` are different from FP32 case. With this config, we quantize the existing fined-tuned models downloaded from Hugging Face. For 2-bit weight quantization, user needs to update the ds_config JSON file. To give a sense of the compression performance of downloaded models compared to our paper, we collect the results (1/2-bit BERT on MNLI and QQP with 18 training epochs) in table below. The difference between this tutorial and paper is because they use different checkpoints. Data augmentation introduces in [TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) will help significantly for smaller tasks (such as mrpc, rte, sst-b and cola). See more details in [our paper](https://arxiv.org/abs/2206.01859). @@ -399,7 +399,7 @@ This section consists of two parts: (a) we first perform a light-weight layer re **3.2.1 Light-weight Layer Reduction** -`compression/bert/config/XTC/ds_config_layer_reduction_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration for reducing the 12-layer BERT-base to a 6-layer one. The student’s layers are initialized from i-layer of the teacher with i= [1, 3 ,5 ,7 ,9 ,11] (note that the layer starts from 0), which is called `Skip-BERT_5` in our XTC paper. In addition, student’s modules including embedding, pooler and classifier are also initialized from teacher. For 5-layer layer reduction, one needs to change the configs in `ds_config_layer_reduction_fp16.json` to `"keep_number_layer": 5`, `"teacher_layer": [2, 4 ,6, 8, 10]`(like in `compression/bert/config/ds_config_TEMPLATE.json`). +`compression/bert/config/XTC/ds_config_layer_reduction_fp16.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the example configuration for reducing the 12-layer BERT-base to a 6-layer one. The student’s layers are initialized from i-layer of the teacher with i= [1, 3 ,5 ,7 ,9 ,11] (note that the layer starts from 0), which is called `Skip-BERT_5` in our XTC paper. In addition, student’s modules including embedding, pooler and classifier are also initialized from teacher. For 5-layer layer reduction, one needs to change the configs in `ds_config_layer_reduction_fp16.json` to `"keep_number_layer": 5`, `"teacher_layer": [2, 4 ,6, 8, 10]`(like in `compression/bert/config/ds_config_TEMPLATE.json`). One can run this example by: @@ -421,7 +421,7 @@ For mnli/qqp, we set `--num_train_epochs 36`, `--learning_rate 5e-5`, and with t **3.2.2 One-bit or Two-bit quantization for 6-layer (5-layer) BERT** -Given the above layer-reduced models ready, we now continue to compress the model with 1/2-bit quantization. `compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration where we set the layer reduction to be true on top of `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json`. In addition to the configuration, we need to update the path for the student model using `--pretrained_dir_student` in the script `compression/bert/bash_script/XTC/layer_reduction_1bit.sh`. User can train with a different teacher model by adding `--pretrained_dir_teacher`. 
+Given the above layer-reduced models ready, we now continue to compress the model with 1/2-bit quantization. `compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the example configuration where we set the layer reduction to be true on top of `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json`. In addition to the configuration, we need to update the path for the student model using `--pretrained_dir_student` in the script `compression/bert/bash_script/XTC/layer_reduction_1bit.sh`. User can train with a different teacher model by adding `--pretrained_dir_teacher`. One can run this example by: diff --git a/docs/_tutorials/monitor.md b/docs/_tutorials/monitor.md index 572e3f4558a7..5e7a6fc4e834 100644 --- a/docs/_tutorials/monitor.md +++ b/docs/_tutorials/monitor.md @@ -81,7 +81,7 @@ The steps to create a custom monitor are as follows: \* Note - Some Monitor backends don't support mixed sample values. Be sure to use your DeepSpeed engine object's `global_samples` attribute in each 3-tuple -For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) example: ```python # Step 1: Import monitor (and DeepSpeed config, if needed) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index e66bba3f818b..e24dc8f86554 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -33,7 +33,7 @@ If you don't already have a copy of the DeepSpeed repository, please clone it now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. ```shell -git clone https://github.com/microsoft/DeepSpeed +git clone https://github.com/deepspeedai/DeepSpeed cd DeepSpeed git submodule update --init --recursive cd DeepSpeedExamples/ @@ -115,7 +115,7 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ (New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL, MPI-based and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using NCCL-based implementation, there is no need to set `cuda_aware`. #### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients -Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. 
+Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. **Watch out!** 1-bit Adam relies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. @@ -136,7 +136,7 @@ You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [Hug ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. +We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun.
More news @@ -66,15 +66,15 @@ In line with Microsoft's mission to solve humanity's most pressing challenges, t ## DeepSpeed Library - The [DeepSpeed](https://github.com/microsoft/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for an easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). + The [DeepSpeed](https://github.com/deepspeedai/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for an easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). ## Model Implementations for Inference (MII) - [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. + [Model Implementations for Inference (MII)](https://github.com/deepspeedai/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. ## DeepSpeed on Azure - DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). + DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). 
For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). # DeepSpeed Adoption @@ -131,15 +131,15 @@ comments. 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). [[paper]](https://arxiv.org/abs/2101.06840) [[slides]](https://www.usenix.org/system/files/atc21_slides_ren-jie.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html). -6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) 7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/). 8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh). 9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009). -10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. 
(2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) +10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) 11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990). 12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl). -13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) -14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/) +13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. 
[arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) +14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/) 15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). 16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597) [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) 17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. (2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017) and [ICML2023](https://icml.cc/Conferences/2023). @@ -148,10 +148,10 @@ comments. 20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023. 21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023. 22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Xiaoxia Wu, Connor Holmes, Zhewei Yao, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. (2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209) and [ML for Sys Workshop at NeurIPS2023](http://mlforsystems.org/) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) -23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) +23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. 
(2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) 24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847) 25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320). -26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) +26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) 27. Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf) 28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/) 29. Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723) diff --git a/examples/README.md b/examples/README.md index 14393ef4545d..c7ff01dcd2d4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,8 +2,8 @@ If you are looking for examples using DeepSpeed please see the following resources: -1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) -2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) +1. [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) +2. [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) 3. 
[DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed) 4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/deepspeed) 5. [DeepSpeed + PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.utilities.deepspeed.html) diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index 66f748ee8f7a..b8e5e8a6fd9a 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -68,7 +68,7 @@ def is_compatible(self, verbose=False): import triton except ImportError: # auto-install of triton is broken on some systems, reverting to manual install for now - # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 + # see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710 if verbose: self.warning(f"please install triton==1.0.0 if you want to use sparse attention") return False diff --git a/setup.py b/setup.py index ec2e9a754d30..20afc651a142 100755 --- a/setup.py +++ b/setup.py @@ -313,7 +313,7 @@ def op_enabled(op_name): url='http://deepspeed.ai', project_urls={ 'Documentation': 'https://deepspeed.readthedocs.io', - 'Source': 'https://github.com/microsoft/DeepSpeed', + 'Source': 'https://github.com/deepspeedai/DeepSpeed', }, install_requires=install_requires, extras_require=extras_require, diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index d39953f40f07..2ae2755086f8 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -85,7 +85,7 @@ def test(self, zero_stage): run_unbalanced_gradients(model, data_loader) -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 +# testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227 @pytest.mark.parametrize("mics_enabled", [True, False]) class TestZero3RepeatForwardLoop(DistributedTest): world_size = 1 @@ -144,8 +144,8 @@ def forward(self, x, y): model.step() -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 -# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 +# testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227 +# also reproduces the https://github.com/deepspeedai/DeepSpeed/pull/1372 @pytest.mark.parametrize("zero_stage", [2, 3]) @pytest.mark.parametrize("freeze_params", [True, False]) class TestZeroToFP32(DistributedTest): @@ -178,7 +178,7 @@ class MyModel(torch.nn.Module): def __init__(self, hidden_dim, n_layers, freeze_params): super().__init__() - # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that + # to reproduce https://github.com/deepspeedai/DeepSpeed/pull/1372 it is important that # the number of total elements is uneven: # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) @@ -1676,7 +1676,7 @@ def test(self, prefetch_ratio, zero_stage=3): # Avoid overwriting client module id -# https://github.com/microsoft/DeepSpeed/issues/6772 +# https://github.com/deepspeedai/DeepSpeed/issues/6772 class TestZero3ClientModuleID(DistributedTest): world_size = 2 diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 21955f5df152..77a8744ab5bc 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -32,7 +32,7 @@ # test that sub-classes get params that aren't 
prematurely partitioned and thus requiring gathering -# fixed by https://github.com/microsoft/DeepSpeed/pull/1202 +# fixed by https://github.com/deepspeedai/DeepSpeed/pull/1202 class GrandPa(torch.nn.Module): def __init__(self, *args): diff --git a/tests/unit/sequence_parallelism/test_ulysses.py b/tests/unit/sequence_parallelism/test_ulysses.py index 821847c44265..bd20900a3d1e 100644 --- a/tests/unit/sequence_parallelism/test_ulysses.py +++ b/tests/unit/sequence_parallelism/test_ulysses.py @@ -61,7 +61,7 @@ def test_alltoall_output_consistency(self, d0: int, d1: int, head_dim: int, num_ outputs = [] seq_dims = [0] #seq first API #TODO: Add support for batch first (that seq_dims=[0,1]) after PR for bs>1 issue with batch first is fixed - ## See discussion in : https://github.com/microsoft/DeepSpeed/issues/5808 + ## See discussion in : https://github.com/deepspeedai/DeepSpeed/issues/5808 for seq_dim in seq_dims: gather_idx = seq_dim #first all2all: sequence parallel to head parallel From f5d9a67e64c50fbf02bf8964d406ed3dcf243370 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 5 Feb 2025 11:48:14 -0800 Subject: [PATCH 5/8] Update CNAME --- docs/CNAME | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/CNAME b/docs/CNAME index 72033bc5f7fe..47cf5e28f3ea 100644 --- a/docs/CNAME +++ b/docs/CNAME @@ -1 +1 @@ -www.deepspeed.ai +deepspeed.ai \ No newline at end of file From 1802d0dccd45b0cc6b6cf380d950de53c52d68de Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:16:37 -0800 Subject: [PATCH 6/8] Update CNAME --- docs/CNAME | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/CNAME b/docs/CNAME index 47cf5e28f3ea..47f170e64eeb 100644 --- a/docs/CNAME +++ b/docs/CNAME @@ -1 +1 @@ -deepspeed.ai \ No newline at end of file +www.deepspeed.ai \ No newline at end of file From 4f06fd8108e12468b16ae4fa5cd74185e4fc0cb2 Mon Sep 17 00:00:00 2001 From: Liangliang Ma <1906710196@qq.com> Date: Thu, 6 Feb 2025 04:17:56 +0800 Subject: [PATCH 7/8] [XPU] max1100 workflow update for docker and software (#7003) 1. update intel oneAPI basekit to 2025.0 2.
update torch/ipex/oneccl to 2.5 --- .github/workflows/xpu-max1100.yml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index 56bff4a88ba9..2d84f8f60571 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -36,7 +36,7 @@ jobs: unit-tests: runs-on: [self-hosted, intel, xpu] container: - image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + image: intel/oneapi-basekit:2025.0.1-0-devel-ubuntu24.04 ports: - 80 options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL @@ -44,18 +44,23 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install prerequisite + shell: bash run: | apt-get update - apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ - pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ - pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ - pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ + apt-get install clinfo libaio-dev python3-pip python3.12-venv -y + python3 -m venv ~/ds_env + source ~/ds_env/bin/activate + pip install torch==2.5.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.5.10+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.5.0+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.20.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ pip install py-cpuinfo numpy pip install .[dev,autotuning] - name: Check container state + shell: bash run: | + source ~/ds_env/bin/activate ldd --version ds_report python3 -c "import torch; print('torch:', torch.__version__, torch)" @@ -64,7 +69,9 @@ jobs: pip list - name: Unit tests + shell: bash run: | + source ~/ds_env/bin/activate cd tests/unit pytest --verbose accelerator/* pytest --verbose autotuning/* From 84beed322e663903ef863313ca8c496c87ff98e6 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Thu, 6 Feb 2025 02:39:34 +0800 Subject: [PATCH 8/8] autotp training(fix dco) (#7004) Same as [this PR](https://github.com/deepspeedai/DeepSpeed/pull/6922). [affeb88](https://github.com/deepspeedai/DeepSpeed/pull/6922/commits/affeb884576936b3a0efc5ce86435da37db1d87e) I noticed the CI updated the DCO check recently. Using the suggested rebase method for sign-off would reintroduce many conflicts, so I opted for a squash merge with sign-off instead. 
thanks: ) Signed-off-by: inkcherry --- deepspeed/__init__.py | 33 +- deepspeed/comm/comm.py | 6 + deepspeed/comm/torch.py | 4 + deepspeed/inference/engine.py | 1 - deepspeed/module_inject/__init__.py | 2 +- deepspeed/module_inject/auto_tp.py | 89 +-- deepspeed/module_inject/layers.py | 661 +++++++++++++++--- deepspeed/module_inject/load_checkpoint.py | 6 +- deepspeed/module_inject/replace_module.py | 6 +- deepspeed/runtime/config.py | 2 + deepspeed/runtime/engine.py | 120 +++- deepspeed/runtime/tensor_parallel/__init__.py | 7 + deepspeed/runtime/tensor_parallel/config.py | 81 +++ .../runtime/tensor_parallel/tp_manager.py | 66 ++ deepspeed/runtime/utils.py | 45 +- deepspeed/utils/groups.py | 123 +++- .../model_parallelism/test_autotp_training.py | 574 +++++++++++++++ 17 files changed, 1662 insertions(+), 164 deletions(-) create mode 100644 deepspeed/runtime/tensor_parallel/__init__.py create mode 100644 deepspeed/runtime/tensor_parallel/config.py create mode 100644 deepspeed/runtime/tensor_parallel/tp_manager.py create mode 100644 tests/unit/model_parallelism/test_autotp_training.py diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index a8d15cd5332b..fd1f421b8954 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -37,7 +37,7 @@ from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from .module_inject import replace_transformer_layer, revert_transformer_layer +from .module_inject import replace_transformer_layer, revert_transformer_layer, set_autotp_mode from .utils import log_dist, OnDevice, logger from .comm.comm import init_distributed @@ -364,3 +364,34 @@ def init_inference(model, config=None, **kwargs): engine = InferenceEngine(model, config=ds_inference_config) return engine + + +def tp_model_init(model, tp_size, dtype): + """ + Initialize the model for tensor parallelism. + + Args: + model (torch.nn.Module): The model to be initialized. + tp_size (int): The tensor parallelism size. + dtype (torch.dtype): The data type to be used for the model. + + Returns: + torch.nn.Module: The initialized model with tensor parallelism. + """ + # avoid re-entry + assert not hasattr( + model, 'ds_autotp_parsed'), "ds_autotp_parsed' attribute already exists in the model, re-entry is not allowed." + + set_autotp_mode(training=True) + + from deepspeed.runtime.tensor_parallel import TpTrainingManager + # The expected usage here is for it to be invoked by transformers package. 
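+ # Illustrative call-site sketch (hypothetical; names such as AutoModelForCausalLM are
+ # assumptions and not part of this change):
+ #   model = AutoModelForCausalLM.from_pretrained(model_name)
+ #   model = deepspeed.tp_model_init(model, tp_size=4, dtype=torch.bfloat16)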
+ + #TODO: We should provide a custom TP mapping solution without using autoTP + #as modifying the autoTP logic may be more difficult for users compared to configuring it + + model = TpTrainingManager(model=model, tp_size=tp_size, dtype=dtype).module + + setattr(model, 'ds_autotp_parsed', True) + + return model diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 759ad7d7e089..800bc6078c3a 100755 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -224,6 +224,12 @@ def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='bro return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) +@timed_op +def broadcast_object_list(object_list, src, group=None, device=None): + global cdb + return cdb.broadcast_object_list(object_list=object_list, src=src, group=group, device=device) + + @timed_op def all_gather(tensor_list, tensor, diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 5461ae18d1f0..efa0640fb87b 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -205,6 +205,10 @@ def broadcast(self, tensor, src, group=None, async_op=False): else: return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) + @disable_compiler_collective + def broadcast_object_list(self, object_list, src, group=None, device=None): + return torch.distributed.broadcast_object_list(object_list=object_list, src=src, group=group, device=device) + @disable_compiler_collective def all_gather(self, tensor_list, tensor, group=None, async_op=False): if DS_COMM_ALL_GATHER_OFF: diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 3089d0c557a4..a529d9343228 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -15,7 +15,6 @@ from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine from deepspeed.utils.timer import SynchronizedWallClockTimer from deepspeed.runtime.compiler import is_compile_supported - from ..runtime.state_dict_factory import SDLoaderFactory from ..runtime.weight_quantizer import WeightQuantization from ..module_inject import replace_transformer_layer, generic_injection diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py index 4bdabf383b26..9fc2f979a04b 100755 --- a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -6,5 +6,5 @@ from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection from .module_quantize import quantize_transformer_layer from .replace_policy import HFBertLayerPolicy -from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize +from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize, set_autotp_mode from .policy import DSPolicy diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 8bdcf6faa053..d148c26968b3 100755 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -11,10 +11,12 @@ from typing import Optional import torch from deepspeed import comm as dist -from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce +from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce, Yuan_LinearAllreduce, Yuan_LinearLayer, GateUpPack_LinearLayer, Conv_LinearALlreduce, fused_LinearLayer, conv_LinearLayer from deepspeed.accelerator import get_accelerator -from .fusedqkv_utils import require_tp_fused_qkvw, 
prepare_tp_fused_qkvw, shard_value_with_share_qk, shard_chunk_mlp +from .fusedqkv_utils import require_tp_fused_qkvw from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list +from deepspeed.utils import groups +from deepspeed.module_inject.layers import is_autotp_training_mode def move(tensor, device, copy=True): @@ -333,10 +335,18 @@ def tp_parser(model): return policy_list def set_tensor_parallel_config(self, mp_size, mp_group): + + if is_autotp_training_mode(): + self.mp_group = groups.get_tensor_model_parallel_group() + self.mp_size = groups.get_tensor_model_parallel_world_size() + return + self.mp_size = mp_size self.mp_group = mp_group def _replace(self, child, name, conv_linear_layer): + # This function should clearly define the routing rules for specific layers + # and avoid any complex shard-related logic. if getattr(child, "replaced", False) == True: return device_name = 'cpu' if self.keep_module_on_host else get_accelerator().current_device_name() @@ -352,14 +362,15 @@ def _replace(self, child, name, conv_linear_layer): # For Yuan model if 'Yuan' in str(self.module): if 'v_proj' in name: - weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(), - dist.get_world_size(), True) - return LinearLayer(weight=weight, bias=bias) + return Yuan_LinearLayer(child, self.mp_group) + elif 'o_proj' in name: - weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(), - dist.get_world_size(), False) - return LinearAllreduce(weight, bias, self.mp_group) - # For Arctic model, bypass to all_reduce replacement for w2 weights + return Yuan_LinearAllreduce(child, self.mp_group) + + # For MLP including chunk layer. + if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)): + return GateUpPack_LinearLayer(child, self.mp_group) + # For Arctic model, bypass to all_reduce replacement for w2 weights arctic_w2_all_reduce_linear = False if 'Arctic' in str(self.module) and 'w2' in name: arctic_w2_all_reduce_linear = True @@ -367,65 +378,25 @@ def _replace(self, child, name, conv_linear_layer): down_proj = False if 'down_proj' in name: down_proj = True - # For MLP including chunk layer. 
- if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)): - weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size()) - return LinearLayer(weight=weight, bias=bias) if name in self.all_reduce_linears or arctic_w2_all_reduce_linear or down_proj: - # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] - # else [weight_shape[0], weight_shape[1] // mp_size] + setattr(child, "replaced", True) if self.conv_linear_layer: - child.weight.data = child.weight.data.transpose(-1, -2).contiguous() - data = child.weight.data.split(get_shard_size_list( - weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size, name), - dim=1) - data_dc = move(data[mp_replace.gpu_index], device_name, return_new_copy).detach() - del data + return Conv_LinearALlreduce(child, self.mp_group, name=name) + elif name == "lm_head" or name == 'embed_out': + return LmHeadLinearAllreduce(child, self.mp_group) - setattr(child, "replaced", True) - if name == "lm_head" or name == 'embed_out': - return LmHeadLinearAllreduce( - torch.nn.parameter.Parameter(data_dc, requires_grad=False), dist.get_rank(), dist.get_world_size(), - child.bias if child.bias is None else torch.nn.parameter.Parameter( - move(child.bias, device_name, return_new_copy)), self.mp_group) - return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \ - torch.nn.parameter.Parameter(move(child.bias, device_name, return_new_copy)), self.mp_group) + return LinearAllreduce(child, self.mp_group, name=name) else: - # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] - # else [weight_shape[0] // mp_size, weight_shape[1]] + setattr(child, "replaced", True) if self.conv_linear_layer: - child.weight.data = child.weight.data.transpose(-1, -2).contiguous() - - if require_tp_fused_qkvw(name, self.mp_size): + conv_LinearLayer(child, self.mp_group) + elif require_tp_fused_qkvw(name, self.mp_size): #Check and handle fused qkv for TP - #The copy is a regular copy, The shape of dst and src is the same - data_dc = move( - prepare_tp_fused_qkvw(self.module, child.weight.data, self.mp_size, mp_replace.gpu_index), - device_name, return_new_copy) - - bias_data_dc = None if child.bias is None else move( - prepare_tp_fused_qkvw(self.module, child.bias.data, self.mp_size, mp_replace.gpu_index), - device_name, return_new_copy) - else: - data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size, name), - dim=1 if self.conv_linear_layer else 0) - data_dc = move(data[mp_replace.gpu_index], device_name, return_new_copy).detach() - del data - - if child.bias is not None: - bias_data = child.bias.data.split(get_shard_size_list( - weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name), - dim=0) - bias_data = move(bias_data[mp_replace.gpu_index], device_name, return_new_copy) - bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) - del bias_data - else: - bias_data_dc = None + return fused_LinearLayer(child, self.mp_group, fused_module=self.module) - setattr(child, "replaced", True) - return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc) + return LinearLayer(child, self.mp_group, name=name) def _slice_embedding(self, child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 
2f884ba4fb09..c410bf900c31 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -7,10 +7,578 @@ from deepspeed import comm as dist from torch import nn from torch.nn import functional as F - from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list +from abc import ABC, abstractmethod +from typing import Iterable, Any, Optional, List, Tuple +from .fusedqkv_utils import shard_value_with_share_qk, shard_chunk_mlp, prepare_tp_fused_qkvw +from deepspeed.runtime.tensor_parallel import AUTOTP_MODE +from copy import deepcopy +from typing import Union + +DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.INFERENCE +DS_IS_REPLACED_MODULE = 'ds_is_replaced_module' +DS_TENSOR_MODEL_PARALLEL = 'tensor_model_parallel' + + +def get_auto_tp_mode(): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE + + +def is_autotp_training_mode(): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING + + +def set_autotp_mode(training=False): + """ + Set the DEEPSPEED_AUTOTP_MODE based on the training flag + """ + global DEEPSPEED_AUTOTP_MODE + if training: + DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.TRAINING + else: + DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.INFERENCE + + +def move(tensor, device): + # TODO: consider the timing of deletion + # to save host resources when DP > 1。 + + if tensor.is_meta: + return torch.empty_like(tensor, device=device) + else: + # Using new tensors help in freeing memory (after split for example) was done before by calling clone(). + # Using copy=True instead of clone() will help in case of cpu --> cpu. + # Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced. + cloned_tensor = tensor.to(device, copy=True) + + # free the memory of the original tensor to reduce memory peak + # Equivalent to directly deleting the tensor reference outside the function. + # see https://github.com/microsoft/DeepSpeed/pull/4353 + tensor.data = torch.empty(0, device=tensor.device) + return cloned_tensor + + +class RowParallel(torch.autograd.Function): + """ + A custom autograd function for performing row-wise parallelism. + """ + + @staticmethod + def symbolic(graph, input): + """Symbolic function for tracing.""" + return input + + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: torch.Tensor, is_inference_mode: bool) -> torch.Tensor: + """ + Forward pass. + """ + ctx.group = group + if group == None: + return input + if is_inference_mode: + dist.inference_all_reduce(input, group=group) + else: + dist.all_reduce(input.contiguous(), group=group) + return input + + @staticmethod + def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor, None]: + """ + Backward pass. + """ + return None, grad_output, None + + +class ColumnParallel(torch.autograd.Function): + """ + Custom autograd function for column-wise parallelism. + """ + + @staticmethod + def symbolic(graph, input): + """Symbolic function for tracing.""" + return dist.all_reduce(input.contiguous(), dist.get_tensor_model_parallel_group()) + + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: torch.Tensor) -> torch.Tensor: + """ + Forward pass. + """ + ctx.group = group + return input + + @staticmethod + def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor]: + """ + Backward pass. 
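+ All-reduces grad_output across the tensor-parallel group; when no group is set it
+ acts as an identity on the gradient.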
+ """ + if ctx.group == None: + return None, grad_output + + dist.all_reduce(grad_output.contiguous(), group=ctx.group) + return None, grad_output + + +class Replaced_Layer(nn.Module, ABC): + """ + A base class for model layers with tensor parallelism support. + This class is designed to be extended by specific layers that require distributed + operations and parameter gather/partitioning during inference or training. + + Attributes: + mode (str): The mode of operation[INFERENCE or TRAINING], default is "INFERENCE". + mp_group (Optional[dist.ProcessGroup]): The process group used for model parallelism. + tp_world_size (int): The world size of tensor parallelism, i.e., the number of parallel workers. + tp_index (int): The rank (ID) of the current worker in tensor parallelism. + support_training (bool): Flag indicating whether the layer supports training (default: False). + name (Optional[str]): The name of the layer, if provided. + """ + + def __init__(self, mp_group: Optional[dist.ProcessGroup], **kwargs: Any): + """ + Initializes the Replaced_Layer with optional model parallelism group and layer name. + + Args: + mp_group (Optional[dist.ProcessGroup]): The process group for model parallelism. + If None, no model parallelism is set. + """ + super().__init__() + self.support_training: bool = False + if mp_group is not None: + self.mp_group = mp_group + self.tp_world_size: int = dist.get_world_size(self.mp_group) + self.tp_index: int = dist.get_rank(mp_group) + + # backward compatibility + self.world_size = self.tp_world_size + self.rank = self.tp_index + + self.name = getattr(self, 'name', None) + if kwargs.get('name') is not None: + self.name = kwargs.get('name') # Set the layer name if provided. + + @abstractmethod + def forward(self, input): + """ + Forward pass method. Must be implemented by subclasses to define layer-specific operations. + """ + pass + + @abstractmethod + def gather_params(self, params_list): + """ + Gathers parameters across devices for distributed training. Must be implemented by subclasses in "TRAINING" mode. + """ + pass + + @abstractmethod + def partition(self, params_list: List[torch.Tensor]): + """ + Partitions the parameters for tensor parallelism. + It is necessary to ensure that this function only involves the logic of params partitioning. + """ + pass + + def config_tp_params(self, weight): + """ + Configures the weight tensor for training with tensor parallelism. This includes enabling gradients + and associating necessary methods for parameter gathering and partitioning. + + Args: + weight (Optional[torch.Tensor]): The weight tensor to configure for tensor parallelism. + If None, no action is taken. + """ + # # The RNG states have already been synchronized in init_inference. + if self.is_training_mode(): + assert self.support_training, "No implementation of backward." + if weight is not None: + if self.is_training_mode(): + if weight.requires_grad is None: + weight.requires_grad = True + else: + weight.requires_grad = False + setattr(weight, DS_TENSOR_MODEL_PARALLEL, True) + setattr(weight, DS_IS_REPLACED_MODULE, True) + weight.gather_params = self.gather_params + weight.partition = self.partition + + def is_training_mode(self): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING + + def __deepcopy__(self, memo): + # This function is designed for + # 'mp_group' (a 'ProcessGroup') cannot be pickled during deepcopy in some usage. 
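+ # In effect every attribute is deep-copied except 'mp_group', which is shared by
+ # reference, so copy.deepcopy(layer).mp_group is the same ProcessGroup object.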
+ cls = self.__class__ + new_obj = cls.__new__(cls) + + for key, value in vars(self).items(): + if key == 'mp_group': + new_obj.mp_group = self.mp_group + else: + setattr(new_obj, key, deepcopy(value, memo)) + + memo[id(self)] = new_obj + return new_obj + + def extra_repr(self): + if self.weight is not None: + out_features, in_features = self.weight.shape if self.weight is not None else (None, None) + dtype = self.weight.dtype if self.weight is not None else None + extra_repr_str = "in_features={}, out_features={}, bias={}, dtype={}".format( + in_features, out_features, self.bias is not None, dtype) + return extra_repr_str + + +class GatherReplacedLayerParams: + """ + A context manager for gathering parameters of a replaced layer, enabling partitioning and gathering functionality + based on the configuration of the model. + """ + + def __init__(self, + params: Union[Iterable[torch.Tensor], torch.Tensor], + module: torch.nn.Module, + enabled: bool = True): + """ + Initialize the context manager to handle parameter gathering and partitioning for a replaced layer. + + Args: + params (Iterable or torch.Tensor): A collection or single parameter to manage. + module (torch.nn.Module): The module that these parameters belong to. + enabled (bool): Flag indicating whether the parameter management is enabled (default: True). + """ + self.enabled = enabled + self.module = module + if not enabled: + return + + # Ensure params is a list, whether it's a single param or iterable (e.g., model.parameters()) + if isinstance(params, Iterable) and not isinstance(params, torch.Tensor): + self.params: List[torch.Tensor] = list(params) # Convert generators to a list for multiple iterations + else: + self.params: List[torch.Tensor] = [params] # Wrap single parameter in a list for uniform processing + + # Check if the parameters belong to a replaced layer (indicated by a specific attribute) + if not any(self._is_replaced_module_weight(p) for p in params): + self.enabled = False + return + + def _is_replaced_module_weight(self, param: torch.Tensor) -> bool: + """ + Helper function to determine if a parameter belongs to a replaced module. + + Args: + param (torch.Tensor): The parameter to check. + + Returns: + bool: True if the parameter belongs to a replaced module, False otherwise. + """ + return getattr(param, DS_IS_REPLACED_MODULE, False) + + def __enter__(self) -> None: + """ + Enter the context manager. If enabled, gather parameters for the replaced module. + """ + if self.enabled: + self.params[0].gather_params(self.params) + + def __exit__(self, exc_type, exc_value, traceback) -> None: + """ + Exit the context manager. If enabled, partition the parameters for the replaced module. + """ + #TODO : Check whether there are any missing attributes. 
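+ # On exit the gathered full parameters are re-partitioned (the inverse of __enter__),
+ # e.g. `with GatherReplacedLayerParams(params, module): use_full_weights()` restores
+ # the TP shards once the block ends.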
+ if self.enabled: + self.params[0].partition(self.params) + + +class LinearAllreduce(Replaced_Layer): + + def __init__(self, module, mp_group, **kwargs): + super(LinearAllreduce, self).__init__(mp_group, **kwargs) + self.weight = module.weight + self.bias = module.bias + + self.partition([self.weight, self.bias]) + self.support_training = True + self.config_tp_params(self.weight) + if self.bias is not None: + self.config_tp_params(self.bias) + + def forward(self, input): + output = torch.matmul(input, self.weight.transpose(-1, -2)) + output = RowParallel.apply(self.mp_group, output, not self.is_training_mode()) + if self.bias is not None: + output += self.bias + return output + + @torch.no_grad() + def gather_params(self, params_list): + + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't gather bias + return + params_list[idx].data_partition = param.data + param = param.transpose(0, 1).contiguous() + output_param = torch.empty(self.tp_world_size * param.shape[0], + param.shape[1], + dtype=param.dtype, + device=param.device) + dist.all_gather_into_tensor(output_param, param, group=self.mp_group) + params_list[idx].data = output_param.transpose(0, 1).contiguous() + return + + @torch.no_grad() + def partition(self, params_list): + + if not self.is_training_mode(): + self.uneven_partition(params_list) + return + + else: + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't slipt bias + return + _partition = torch.chunk(param, self.tp_world_size, dim=-1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + def uneven_partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't slipt bias + return + assert self.name is not None, "The module name must be provided in the initialization." + _partition = params_list[idx].split(get_shard_size_list(params_list[idx].shape[1], self.tp_world_size, + self.name), + dim=1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + params_list[idx].data = _partition + + +#remove kwargs from partition. +class LinearLayer(Replaced_Layer): + + def __init__(self, module, mp_group, skip_partition=False, **kwargs): + super(LinearLayer, self).__init__(mp_group, **kwargs) + self.weight = module.weight + self.bias = module.bias + if not skip_partition: + self.partition([self.weight, self.bias]) + self.support_training = True + self.config_tp_params(self.weight) + if self.bias is not None: + self.config_tp_params(self.bias) + + def forward(self, input): + input = ColumnParallel.apply(self.mp_group, input) + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.bias is not None: + output += self.bias + return output + + @torch.no_grad() + def gather_params(self, params_list): + # Does not support uneven shard. 
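+ # Each rank holds a [out_features // tp_world_size, in_features] weight shard; the
+ # all_gather below reconstructs the full [out_features, in_features] weight.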
+ for idx, param in enumerate(params_list): + + params_list[idx].data_partition = param.data + output_param = torch.empty(self.tp_world_size * param.shape[0], + param.shape[1], + dtype=param.dtype, + device=param.device) + dist.all_gather_into_tensor(output_param, param, group=self.mp_group) + params_list[idx].data = output_param.contiguous() + + @torch.no_grad() + def partition(self, params_list): + + if not self.is_training_mode(): + self.uneven_partition(params_list) + return + for idx, param in enumerate(params_list): + if param is None: + return + #split bias if provide + _partition = torch.chunk(param, self.tp_world_size, dim=0)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + def uneven_partition(self, params_list): + + for idx, param in enumerate(params_list): + if param is None: + #split bias if provide + return + assert self.name is not None, "The module name must be provided in the initialization." + _partition = params_list[idx].split(get_shard_size_list(params_list[idx].shape[0], self.tp_world_size, + self.name), + dim=0)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + # for bwc + @classmethod + def from_weights(cls, weight_shape=None, dtype=torch.half, weight=None, bias=None): + if weight is not None: + in_features = weight.shape[1] + out_features = weight.shape[0] + linear = nn.Linear(in_features, out_features, bias=(bias is not None)) + linear.weight.data = weight + if bias is not None: + linear.bias.data = bias + else: + in_features = weight_shape[1] + out_features = weight_shape[0] + linear = nn.Linear(in_features, out_features, bias=(bias is not None)) + return cls(linear, skip_partition=True) + + +class FusedModuleWrapper: + + def __init__(self, fused_module: nn.Module): + self.fused_module = fused_module + + def __getattr__(self, module): + return self.fused_module + + +class fused_LinearLayer(LinearLayer): + + def __init__(self, module, mp_group, skip_partition=False, **kwargs): + assert kwargs.get('fused_module') is not None, "'fused_module' is required but not provided" + # Use the warp class to avoid module circular references. 
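+ # (A plain, non-nn.Module wrapper keeps the parent model from being registered as a
+ # submodule of this layer, which would otherwise create a cycle in the module tree.)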
+ self.fused_module = FusedModuleWrapper(kwargs.get('fused_module')) + super().__init__(module, mp_group, skip_partition, **kwargs) + + @torch.no_grad() + def partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None: + return + + _partition = prepare_tp_fused_qkvw(self.fused_module.module, param, self.tp_world_size, self.tp_index) + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + +class conv_LinearLayer(LinearLayer): + + @torch.no_grad() + def partition(self, params_list): + weight = None + bias = None + if len(params_list) == 1: + weight = params_list[0] + elif len(params_list) == 2: + weight, bias = params_list[0], params_list[1] + _partition = weight.data.split(get_shard_size_list(weight.shape[0], self.tp_world_size, self.name), + dim=1)[self.tp_index] + _partition = move(_partition, get_accelerator().current_device_name()).detach() + weight.data = _partition + + if bias is not None: + _partition = bias.data.split(get_shard_size_list(weight.shape[1], self.tp_world_size, self.name), + dim=0)[self.tp_index] + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + bias.data = _partition + + +#override the subclasses related to weight splitting. +class Yuan_LinearAllreduce(LinearAllreduce): + + #Yuan2 + @torch.no_grad() + def partition(self, params_list): + weight, bias = shard_value_with_share_qk(params_list[0].data, params_list[1], self.tp_index, + self.tp_world_size, False) + params_list[0].data = weight + if bias is not None: + params_list[1].data = bias + + +class Yuan_LinearLayer(LinearLayer): + #Yuan2 + @torch.no_grad() + def partition(self, params_list): + weight, bias = shard_value_with_share_qk(params_list[0].data, params_list[1], self.tp_index, + self.tp_world_size, True) + params_list[0].data = move(weight, get_accelerator().current_device_name()).detach() + if bias is not None: + params_list[1].data = move(bias, get_accelerator().current_device_name()).detach() + + +class GateUpPack_LinearLayer(LinearLayer): + # chatGLM2, chatGLM2 + @torch.no_grad() + def partition(self, params_list): + weight, bias = shard_chunk_mlp(params_list[0].data, params_list[1], self.tp_index, self.tp_world_size) + params_list[0].data = move(weight, device=get_accelerator().current_device_name()).detach() + if bias is not None: + params_list[1].data = move(bias, device=get_accelerator().current_device_name()).detach() + + +class Conv_LinearALlreduce(LinearAllreduce): + + @torch.no_grad() + def partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None: + return + param.data = param.data.transpose(-1, -2).contiguous() + + _partition = param.split(get_shard_size_list(param.shape[0], self.tp_world_size, self.name), + dim=1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + +#override the subclasses related to fwd/bwd. +class LmHeadLinearAllreduce(LinearAllreduce): + + def __init__(self, module, mp_group, **kwargs): + # set the fixed name before partition + self.name = "lm_head" + + # In some tied_embedding cases, only the lm head is sharded, while the word embedding is not. + # Reinitialization is used to decouple them and prevent the word embedding from being sharded. + # This should also be effective for cases where both are sharded in tied_embedding scenarios. 
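+ # Concretely, the clone() below gives lm_head its own storage, so partitioning it
+ # cannot silently shard an embedding that ties (shares) the same weight tensor.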
+ + # TODO: Training scenario-related tests, is it necessary to re-implement the vocab parallel module? + module.weight = nn.Parameter(module.weight.clone().detach()) + if hasattr(module, 'bias') and module.bias is not None: + module.bias = nn.Parameter(module.bias.clone().detach()) + super().__init__(module, mp_group, **kwargs) + + def forward(self, input): + input_shard_size = get_shard_size(input.shape[-1], self.tp_world_size, "lm_head") + input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.tp_world_size, "lm_head")[0:self.tp_index]) + output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size], + self.weight.transpose(-1, -2)) + if self.mp_group is not None: + dist.inference_all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output class TensorParallelConv2d(nn.Module): @@ -75,97 +643,6 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: return out -class LinearAllreduce(nn.Module): - - def __init__(self, weight, bias=None, mp_group=None): - super(LinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group - - def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.mp_group is not None: - dist.inference_all_reduce(output, group=self.mp_group) - if self.bias is not None: - output += self.bias - return output - - def extra_repr(self): - out_features, in_features = self.weight.shape if self.weight is not None else (None, None) - dtype = self.weight.dtype if self.weight is not None else None - extra_repr_str = "in_features={}, out_features={}, bias={}, dtype={}".format( - in_features, out_features, self.bias is not None, dtype) - return extra_repr_str - - -class LmHeadLinearAllreduce(nn.Module): - - def __init__( - self, - weight, - rank, - world_size, - bias=None, - mp_group=None, - ): - super(LmHeadLinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group - self.rank = rank - self.world_size = world_size - - def forward(self, input): - input_shard_size = get_shard_size(input.shape[-1], self.world_size, "lm_head") - input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size, "lm_head")[0:self.rank]) - output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size], - self.weight.transpose(-1, -2)) - if self.mp_group is not None: - dist.inference_all_reduce(output, group=self.mp_group) - if self.bias is not None: - output += self.bias - return output - - def extra_repr(self): - out_features, in_features = self.weight.shape if self.weight is not None else (None, None) - dtype = self.weight.dtype if self.weight is not None else None - extra_repr_str = "in_features={}, out_features={}, bias={}, dtype={}".format( - in_features, out_features, self.bias is not None, dtype) - return extra_repr_str - - -class LinearLayer(nn.Module): - - def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): - super(LinearLayer, self).__init__() - if weight is not None: - self.weight = weight - self.bias = bias - else: - self.weight = Parameter( - torch.empty(weight_shape, dtype=dtype, device=get_accelerator().current_device_name())) - - self.bias = Parameter( - torch.empty(weight_shape[0], - dtype=dtype, - device=get_accelerator().current_device_name())) \ - if bias is not None else None - - def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.bias is not None: - output 
+= self.bias - return output - - def extra_repr(self): - out_features, in_features = self.weight.shape - dtype = self.weight.dtype - extra_repr_str = "in_features={}, out_features={}, bias={}, dtype={}".format( - in_features, out_features, self.bias is not None, dtype) - return extra_repr_str - - class Normalize(nn.Module): def __init__(self, dim=None, dtype=torch.float, eps=1e-5, weight=None, bias=None): diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 4d01fdc69869..862628fa7b4b 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -236,7 +236,7 @@ def load_module_recursive(module, prefix='', level=0): child.weight.ds_id in all_ds_ids): prefix1 = all_ds_ids[child.weight.ds_id] if child.__class__ is nn.Linear: - child = LinearLayer(weight=all_ds_ids[child.weight.ds_id]) + child = LinearLayer.from_weights(weight=all_ds_ids[child.weight.ds_id]) setattr(module, name, child) continue child_params = list(child.parameters()) @@ -249,7 +249,9 @@ def load_module_recursive(module, prefix='', level=0): child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) elif child.__class__ in [nn.Linear, ColumnParallelLinear, RowParallelLinear]: - child = LinearLayer(weight_shape=child.weight.shape, dtype=child.weight.dtype, bias=child.bias) + child = LinearLayer.from_weights(weight_shape=child.weight.shape, + dtype=child.weight.dtype, + bias=child.bias) setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 26d242d33e2f..9510f96b89c6 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -15,7 +15,7 @@ from .replace_policy import replace_policies, generic_policies from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading from .layers import TensorParallelOcShardConv2d, TensorParallelIcShardConv2d - +from deepspeed.module_inject.layers import is_autotp_training_mode from deepspeed import comm as dist from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads, set_tp_grain_size @@ -323,7 +323,7 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): else: # copy relevant state from child -> new module - if config.replace_with_kernel_inject: + if not is_autotp_training_mode() and config.replace_with_kernel_inject: new_module = replace_with_policy(child, _policy, config.triangular_masking, @@ -475,7 +475,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size): set_lm_head(replaced_module) print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") - if config.save_mp_checkpoint_path is not None: + if not is_autotp_training_mode() and config.save_mp_checkpoint_path is not None: from collections import OrderedDict import json num_partitions = 8 diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index fb786f29722d..b6dabc161e8c 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -62,6 +62,7 @@ from ..compression.constants import * from .swap_tensor.aio_config import get_aio_config +from .tensor_parallel import get_tensor_parallel_config from .data_pipeline.config import get_data_efficiency_enabled, get_data_efficiency_config, get_curriculum_enabled_legacy, get_curriculum_params_legacy from 
.data_pipeline.constants import * @@ -913,6 +914,7 @@ def _initialize_params(self, param_dict): **param_dict['weight_quantization']) if 'weight_quantization' in param_dict else None self.timers_config = get_timers_config(param_dict) + self.tensor_parallel_config = get_tensor_parallel_config(param_dict) def _batch_assertion(self): diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 97d2afb8b723..986b68dc1bb1 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -37,6 +37,7 @@ from deepspeed.runtime.bf16_optimizer import BF16_Optimizer from deepspeed.linear.optimized_linear import LoRAOptimizedLinear +from deepspeed.module_inject.layers import GatherReplacedLayerParams from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ @@ -75,7 +76,7 @@ from deepspeed.utils.debug import debug_extract_module_and_param_names, debug_clear_module_and_param_names from deepspeed.monitor.monitor import MonitorMaster from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop -from deepspeed.runtime.utils import clip_grad_norm_ +from deepspeed.runtime.utils import clip_grad_norm_, compare_tensors_in_structures from deepspeed.runtime.eigenvalue import Eigenvalue from deepspeed.runtime.data_pipeline.constants import DATA_SAMPLING, \ DATA_ROUTING, DATA_SAMPLING_ENABLED, CURRICULUM_LEARNING, \ @@ -230,7 +231,6 @@ def __init__(self, self._step_applied = False self._global_grad_norm = None self.use_ds_comm = False # False --> Use torch.dist, True --> Use ds.comm backend. - self.checkpoint_engine = None self._is_gradient_accumulation_boundary = None @@ -247,6 +247,8 @@ def __init__(self, self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() + if self.autotp_size() > 1: + self._configure_tensor_parallel_states(model) see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: if self.elasticity_enabled(): @@ -411,6 +413,71 @@ def _optimized_linear_offload_setup(self): else: p.ds_offload = False + def _configure_tensor_parallel_states(self, model): + """ + Configures the tensor parallel states for the model. + This includes setting up the tensor parallel groups, initializing the TP mesh, + and registering a pre-hook to ensure that the Dataloader inputs are consistent across ranks. 
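+ Note: only ZeRO stages 0 and 1 are currently accepted together with autotp; the
+ assertion below enforces this until higher stages are validated.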
+ """ + self._set_client_model(model) + + # sanity check + # currently, the compatibility between 'autotp' and 'zero > 1' has not been validated + assert self.zero_optimization_stage( + ) <= 1, "Currently, the compatibility between 'autotp' and 'zero_stage > 1' has not been validated" + + self.mpu = groups + self.mpu._init_tp_mesh_device(tensor_model_parallel_size=self.autotp_size()) + + self.first_dataloader_check = None + + def check_dataloader_inputs_same_across_ranks(module, args, kwargs): + + def broadcast_and_check(args, bcast_rank, bcast_group): + if isinstance(args, tuple): + args = list(args) + if len(args) > 0: + if self.mpu.get_tensor_model_parallel_rank() == 0: + _src_args = [args] + dist.broadcast_object_list(object_list=_src_args, + src=bcast_rank, + group=bcast_group, + device=get_accelerator().current_device()) + # Rank 0 does not need to compare with itself + is_equal = True + else: + _src_args = [None] + dist.broadcast_object_list(object_list=_src_args, + src=bcast_rank, + group=bcast_group, + device=get_accelerator().current_device()) + + is_equal = compare_tensors_in_structures(args, _src_args[0]) + + equal_tensor = torch.tensor(is_equal, + dtype=self.communication_data_type, + device=get_accelerator().current_device()) + dist.all_reduce(equal_tensor, group=bcast_group) + assert torch.equal( + equal_tensor, + torch.tensor(groups.get_tensor_model_parallel_world_size(), + dtype=self.communication_data_type, + device=get_accelerator().current_device()) + ), "Data inconsistency within the TP group. Please check the Dataloader implementation to ensure consistency." + + bcast_rank = self.mpu.get_tensor_model_parallel_src_rank() + bcast_group = self.mpu.get_tensor_model_parallel_group() + + broadcast_and_check(args, bcast_rank, bcast_group) + broadcast_and_check(kwargs, bcast_rank, bcast_group) + + logger.info(f":The Dataloader has passed the TP group consistency check.") + self.first_dataloader_check.remove() + + self.first_dataloader_check = self.module.register_forward_pre_hook(check_dataloader_inputs_same_across_ranks, + prepend=True, + with_kwargs=True) + def destroy(self): if self.optimizer is not None and hasattr(self.optimizer, 'destroy'): self.optimizer.destroy() @@ -832,6 +899,9 @@ def zero_legacy_stage1(self): def zero_ignore_unused_parameters(self): return self._config.zero_config.ignore_unused_parameters + def autotp_size(self): + return self._config.tensor_parallel_config.autotp_size + def graph_harvesting(self): return self._config.graph_harvesting @@ -3569,6 +3639,52 @@ def _save_zero_checkpoint(self, save_path, tag): ckpt_type = 'zero' if self.zero_optimization() else 'bf16_zero' logger.info(f'{ckpt_type} checkpoint saved {zero_checkpoint_name}') + def _replace_module_consolidated_state_dict(self): + """ + Get a full non-partitioned state_dict with fp16 weights on cpu. + Important: this function must be called on all ranks and not just rank 0. + This is similar to nn.Module.state_dict (modelled after _save_to_state_dict) + This method is used for tensor parallel training. + + Returns: + OrderedDict: The consolidated state dictionary if the current process rank is 0, otherwise None. + """ + #TODO: If we use both Zero3 and tensor parallel simultaneously + # we need to consolidate the gather mechanisms of both. 
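+ # Flow sketch: recurse over submodules, gather each replaced layer's TP shards via
+ # GatherReplacedLayerParams, and copy the full tensors to CPU on rank 0 only
+ # (all other ranks return None).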
+ state_dict = OrderedDict() if dist.get_rank() == 0 else None + + def get_layer_state_dict(module, prefix=""): + with GatherReplacedLayerParams(list(module.parameters(recurse=False)), module, enabled=True): + for name, param in module.named_parameters(recurse=False): + if param is None: + continue + key = prefix + name + if (dist.get_rank() == 0): + state_dict[key] = param.detach().cpu() + # print(key,module, param.detach().cpu().shape) + + for name, child in module.named_children(): + if child is not None: + get_layer_state_dict(child, prefix + name + ".") + + get_layer_state_dict(self.module, prefix="") + + # ensure that all GPU communication tasks are completed before the process exits + get_accelerator().synchronize() + return state_dict + + def _consolidated_16bit_state_dict(self, exclude_frozen_parameters=False): + """ + Consolidate the 16-bit state dictionary. + """ + if self.zero_optimization_stage() == ZeroStageEnum.weights: + return self._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters) + elif self.autotp_size() > 1: + return self._replace_module_consolidated_state_dict() + + raise ValueError("consolidated_16bit_state_dict is only applicable to cases where weights are partitioned, " + "including Zero Stage 3 and tensor parallelism.") + def _zero3_consolidated_16bit_state_dict(self, exclude_frozen_parameters=False): """ Get a full non-partitioned state_dict with fp16 weights on cpu. diff --git a/deepspeed/runtime/tensor_parallel/__init__.py b/deepspeed/runtime/tensor_parallel/__init__.py new file mode 100644 index 000000000000..388239345351 --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .config import AUTOTP_MODE, get_tensor_parallel_config +from .tp_manager import TpTrainingManager diff --git a/deepspeed/runtime/tensor_parallel/config.py b/deepspeed/runtime/tensor_parallel/config.py new file mode 100644 index 000000000000..1300bf9323cd --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/config.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from enum import Enum +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +import torch +from pydantic import Field +from typing import Optional + + +class AUTOTP_MODE(Enum): + TRAINING = "TRAINING" + INFERENCE = "INFERENCE" + + +class TPConfig(DeepSpeedConfigModel): + """ Configure tensor parallelism settings """ + + tp_size: int = 1 + """ Number of devices to split the model across using tensor parallelism. """ + + tp_grain_size: int = 1 + "The variable required by the autoTP parser has not been activated in training yet" + "as it depends on the gather logic that supports uneven partitioning. " + "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size." + + mpu: object = None + """ + A model parallelism unit object that implements + ``get_{model,data}_parallel_{rank,group,world_size}()``. + """ + + tp_group: object = None + + +class TPTrainingConfig(DeepSpeedConfigModel): + + dtype: torch.dtype = torch.float16 + """ + Desired model data type, will convert model to this type. + """ + + autotp_size: int = 0 + """ + In automatic tensor-parallelism training, 'tensor_parallel_size' + When set to 0, indicates that it is disabled. 
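+ For example (as exercised by the unit tests added in this patch):
+ ds_config = {"tensor_parallel": {"autotp_size": 4}, ...}.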
+ """ + tensor_parallel: TPConfig = Field({}, alias="tp") + """ + Configuration for tensor parallelism used to split the model across several + GPUs. Expects a dictionary containing values for :any:`DeepSpeedTPConfig`. + """ + + injection_policy_tuple: Optional[tuple] = None + #The following parameters are required by autoTP parser. + ######################################## + keep_module_on_host: bool = False + """ + When loading checkpoints to model parameters, they are moved to the device. In very large models + this might fill the device and cause OOM. Setting this flag to true, will keep checkpoints on + host and not move them directly to the device (giving an option to quantize checkpoint data before + moving it to the device for example). + """ + + replace_with_kernel_inject: bool = Field(False, alias="kernel_inject") + """ + Set to true to inject inference kernels for models such as, Bert, GPT2, + GPT-Neo and GPT-J. Otherwise, the injection_dict provides the names of two + linear layers as a tuple: + `(attention_output projection, transformer output projection)` + """ + ######################################## + + +def get_tensor_parallel_config(ds_config): + + if 'tensor_parallel' in ds_config: + return TPTrainingConfig(**ds_config['tensor_parallel']) + return TPTrainingConfig() diff --git a/deepspeed/runtime/tensor_parallel/tp_manager.py b/deepspeed/runtime/tensor_parallel/tp_manager.py new file mode 100644 index 000000000000..cf0b5a75c92a --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/tp_manager.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from .config import TPTrainingConfig, TPConfig +from deepspeed.utils import groups +import deepspeed.comm as dist + + +class TpTrainingManager(): + + def __init__(self, model, tp_size, dtype): + self.module = model + self.config = self._initialize_config(dtype) + + from deepspeed.module_inject.auto_tp import AutoTP + from deepspeed import get_accelerator + + # Parse model configuration + parser_dict = AutoTP.tp_parser(model) + print("AutoTP: ", parser_dict) + + # Initialize TP configuration and model + self._initialize_tp_config(tp_size) + self._get_model_config_generate() + + # Synchronize random number generator state across devices + _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) + dist.broadcast(_rng_state, groups.get_tensor_model_parallel_src_rank(), self.tp_config.tp_group) + get_accelerator().set_rng_state(_rng_state.cpu()) + + # Apply injection policies + self._apply_policies(parser_dict) + + def _initialize_config(self, dtype): + """Initialize and return the DeepSpeed TP training configuration.""" + config = TPTrainingConfig() + config.dtype = dtype + return config + + def _apply_policies(self, parser_dict): + """Apply injection policies to the parsed modules.""" + for client_module, injection_policy in parser_dict: + self.config.injection_policy_tuple = injection_policy + self._apply_injection_policy(self.config, client_module) + + def _apply_injection_policy(self, config, client_module=None): + from deepspeed.module_inject import replace_transformer_layer + """Apply the given injection policy to a client module.""" + if isinstance(self.module, torch.nn.Module): + replace_transformer_layer(client_module, self.module, None, self.config, self.model_config) + + def _initialize_tp_config(self, tp_size): + """Perform TP configuration initialization.""" + self.tp_config = TPConfig() + 
self.tp_config.tp_size = tp_size + + groups._init_tp_mesh_device(tp_size) + self.tp_config.tp_group = groups.get_tensor_model_parallel_group() + self.config.tensor_parallel = self.tp_config + + def _get_model_config_generate(self): + """Generate and apply HF model configuration.""" + self.model_config = getattr(self.module, 'config', None) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index f48adb58c9bf..91fe7cbdcc96 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -22,7 +22,7 @@ from torch._six import inf except ModuleNotFoundError: from torch import inf - +from typing import Union, List, Dict from deepspeed import comm as dist from deepspeed.moe.utils import is_moe_param from deepspeed.utils import groups, logger @@ -1101,3 +1101,46 @@ def move_back_key(state, key): move_back_key(state, "exp_avg") if "exp_avg_sq" in state: move_back_key(state, "exp_avg_sq") + + +def compare_tensors_in_structures(inputs1: Union[List, Dict], inputs2: Union[List, Dict]) -> bool: + """ + Compare two lists or dictionaries for equality, including any tensors they may contain. + + Args: + inputs1: First input, either a list or a dictionary. + inputs2: Second input, either a list or a dictionary. + + Returns: + True if inputs1 and inputs2 are equal; False otherwise. + """ + if type(inputs1) != type(inputs2): # Ensure types match + return False + + if isinstance(inputs1, list) and isinstance(inputs2, list): + if len(inputs1) != len(inputs2): + return False + for val1, val2 in zip(inputs1, inputs2): + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + val1 = val1.to(get_accelerator().current_device()) + val2 = val2.to(get_accelerator().current_device()) + if not torch.equal(val1, val2): + return False + elif val1 != val2: + return False + return True + + elif isinstance(inputs1, dict) and isinstance(inputs2, dict): + if inputs1.keys() != inputs2.keys(): + return False + for key in inputs1: + val1 = inputs1[key].to(get_accelerator().current_device()) + val2 = inputs2[key].to(get_accelerator().current_device()) + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + if not torch.equal(val1, val2): + return False + elif val1 != val2: + return False + return True + + return False diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index e9550a0ec25a..6dc750035061 100755 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -46,8 +46,6 @@ # All to All quantized graident communication groups _ALL_TO_ALL_GROUP = {} -_DATA_PARALLEL_GROUP = None - mesh_device = None @@ -64,6 +62,127 @@ def _ensure_divisibility(numerator, denominator): assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) +# ======== Start: Tensor Parallel Group Attributes ======== + +# Intra-layer model parallel group that the current rank belongs to. +_TENSOR_MODEL_PARALLEL_GROUP = None + +# Model parallel group (both intra- and pipeline) that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. +_DATA_PARALLEL_GROUP = None + +# These values enable us to change the mpu sizes on the fly. 
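+# (Illustration, assuming the usual row-major mesh ordering: with 8 ranks and
+# tensor_model_parallel_size=2, TP groups are [0,1],[2,3],[4,5],[6,7] and the
+# corresponding DP groups are [0,2,4,6],[1,3,5,7].)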
+_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_TENSOR_MODEL_PARALLEL_RANK = None + + +def _init_tp_mesh_device(tensor_model_parallel_size=1, data_parallel_size=None): + """Initialize model data parallel groups.""" + + global _DATA_PARALLEL_GROUP + global _MODEL_PARALLEL_GROUP + global _TENSOR_MODEL_PARALLEL_GROUP + + if _TENSOR_MODEL_PARALLEL_GROUP is not None: + return + + if data_parallel_size is None: + data_parallel_size = dist.get_world_size() // tensor_model_parallel_size + + mesh_device = dist.initialize_mesh_device((data_parallel_size, tensor_model_parallel_size), + ("data_parallel", "tensor_parallel")) + _TENSOR_MODEL_PARALLEL_GROUP = mesh_device.get_group(mesh_dim="tensor_parallel") + _DATA_PARALLEL_GROUP = mesh_device.get_group(mesh_dim="data_parallel") + + # They are always equal only in 2D (DP + TP) parallelism. + # _MODEL_PARALLEL_GROUP is assigned the same value as _TENSOR_MODEL_PARALLEL_GROUP + # to allow for future potential changes. + _MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP + + return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_group(): + """Get the tensor model parallel group the caller rank belongs to.""" + + assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \ + 'intra_layer_model parallel group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def set_tensor_model_parallel_world_size(world_size): + """Set the tensor model parallel size""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + return dist.get_world_size(group=get_tensor_model_parallel_group()) + + +def get_model_parallel_world_size(): + return get_tensor_model_parallel_world_size() + + +def set_tensor_model_parallel_rank(rank): + """Set tensor model parallel rank.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = rank + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: + return _MPU_TENSOR_MODEL_PARALLEL_RANK + return dist.get_rank(group=get_tensor_model_parallel_group()) + + +def get_model_parallel_rank(): + return get_tensor_model_parallel_rank() + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + global_rank = dist.get_rank() + local_world_size = get_tensor_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return dist.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return 
dist.get_rank(group=get_data_parallel_group()) + + +# ======== End: Tensor Parallel Group Attributes ======== + + # Not currently used. Helper function to create a model (tensor) parallel group. def _create_model_parallel(model_parallel_size_): """ diff --git a/tests/unit/model_parallelism/test_autotp_training.py b/tests/unit/model_parallelism/test_autotp_training.py new file mode 100644 index 000000000000..fc1f0624ec87 --- /dev/null +++ b/tests/unit/model_parallelism/test_autotp_training.py @@ -0,0 +1,574 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import deepspeed.comm as dist +import torch +import math +from copy import deepcopy + +from unit.common import DistributedTest, preferred_dtype +import deepspeed +from deepspeed.accelerator import get_accelerator +from unit.simple_model import SimpleModel, random_dataloader +from deepspeed.utils import groups +from contextlib import contextmanager +from torch import nn +from deepspeed.module_inject.layers import LinearAllreduce, LinearLayer, set_autotp_mode +from unit.checkpoint.common import compare_lr_scheduler_states, compare_optimizer_states +import os + + +def skip_on_device(): + if get_accelerator().device_name() == 'xpu': + pytest.skip(f"XPU requires a higher version for test") + + +class SequentialLinearModel(torch.nn.Module): + + def __init__(self, hidden_dim, empty_grad=False, nlayers=1): + super(SequentialLinearModel, self).__init__() + self.linears = torch.nn.ModuleList( + [torch.nn.Linear(hidden_dim, hidden_dim, bias=None) for i in range(nlayers)]) + if empty_grad: + self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=None) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + self.empty_grad = empty_grad + + def forward(self, x, y): + if len(self.linears) == 1: + x = self.linears[0](x) + else: + for i, l in enumerate(self.linears): + x = self.linears[i](x) + return self.cross_entropy_loss(x, y) + + +@contextmanager +def should_assert_with_msg(expected_message): + try: + yield + except AssertionError as e: + if dist.get_rank() == 0: + print(expected_message) + print(str(e)) + if str(e) == expected_message: + pass + else: + raise e + + +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpParallelStates(DistributedTest): + world_size = 4 + + def test(self, tp_size: int): + skip_on_device() + set_autotp_mode(training=True) + + dp_size = 4 / tp_size + hidden_dim = 128 + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0 + } + } + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + assert groups.get_tensor_model_parallel_world_size() == tp_size + assert groups.get_data_parallel_world_size() == dp_size + + +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpDataloaderCorrectness(DistributedTest): + world_size = 4 + reuse_dist_env = True + + def test(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": 
True} + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=3, + hidden_dim=hidden_dim, + device=model.device, + dtype=preferred_dtype()) + dist.barrier() + with should_assert_with_msg( + "Data inconsistency within the TP group. Please check the Dataloader implementation to ensure consistency." + ): + for batch in data_loader: + # batch[0].requires_grad = requires_grad + batch[0] += dist.get_rank() + model(batch[0], batch[1]) + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=3, + hidden_dim=hidden_dim, + device=model.device, + dtype=preferred_dtype()) + for batch in data_loader: + dist.broadcast(batch[0], + src=groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + dist.broadcast(batch[1], + src=groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + model(batch[0], batch[1]) + + +def process_linear_layer(hidden_dim, input): + torch.manual_seed(42) + torch_linear = nn.Linear(hidden_dim, + hidden_dim, + dtype=preferred_dtype(), + device=get_accelerator().current_device(), + bias=None) + torch_out = torch_linear(input) + torch_loss = torch_out.sum() + torch_loss.backward() + return torch_linear, torch_out + + +@pytest.mark.sequential +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpLayerFwdBwd(DistributedTest): + world_size = 4 + reuse_dist_env = True + + def testRowParallel(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + batch_size_per_device = 1 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + input = torch.randn(batch_size_per_device, + hidden_dim, + dtype=preferred_dtype(), + requires_grad=True, + device=get_accelerator().current_device()) + + dist.broadcast(input, + groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + + torch_linear, torch_out = process_linear_layer(hidden_dim, input) + linear = LinearAllreduce(deepcopy(torch_linear), groups.get_tensor_model_parallel_group()) + + input_ = torch.chunk(input, tp_size, dim=-1)[groups.get_tensor_model_parallel_rank()] + out = linear(input_.to(get_accelerator().current_device())) + loss = out.sum() + loss.backward() + + torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=1)[groups.get_tensor_model_parallel_rank()] + assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3) + assert torch.allclose(out, torch_out.to(get_accelerator().current_device()), atol=1e-3) + + def testColumnParallel(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + batch_size_per_device = 1 + set_autotp_mode(training=True) + config_dict = { + 
"train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + input = torch.randn(batch_size_per_device, + hidden_dim, + dtype=preferred_dtype(), + requires_grad=True, + device=get_accelerator().current_device()) + dist.broadcast(input, + groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + + torch_linear, torch_out = process_linear_layer(hidden_dim, input) + + linear = LinearLayer(deepcopy(torch_linear), groups.get_tensor_model_parallel_group()) + + out = linear(input.to(get_accelerator().current_device())) + loss = out.sum() + loss.backward() + + cur_device_out = torch.chunk(torch_out, tp_size, dim=-1)[groups.get_tensor_model_parallel_rank()] + torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=0)[groups.get_tensor_model_parallel_rank()] + assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3) + assert torch.allclose(cur_device_out.to(get_accelerator().current_device()).contiguous(), + out.contiguous(), + atol=1e-3) + + +@pytest.mark.sequential +class TestParamsGather(DistributedTest): + world_size = 4 + reuse_dist_env = True + + @pytest.mark.parametrize("layer_type", ["linear", "linearallreduce"]) + def test(self, layer_type): + skip_on_device() + tp_size = 4 + hidden_dim = 128 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + torch.manual_seed(42) + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + + torch_linear = nn.Linear(hidden_dim, hidden_dim, dtype=preferred_dtype(), device="cpu", bias=None) + total_params = sum(p.numel() for p in torch_linear.parameters()) + + tp_layer = None + if layer_type == "linear": + tp_layer = LinearLayer(torch_linear, groups.get_tensor_model_parallel_group()) + elif layer_type == "linearallreduce": + tp_layer = LinearAllreduce(torch_linear, groups.get_tensor_model_parallel_group()) + else: + raise ValueError(f"Invalid linear type: {config_dict['linear_type']}") + + tp_params = sum(p.numel() for p in tp_layer.parameters()) + + assert total_params // tp_size == tp_params + for name, param in tp_layer.named_parameters(recurse=False): + param.gather_params([param]) + + is_same_weights = all( + torch.equal(param1, param2) for param1, param2 in zip(tp_layer.parameters(), torch_linear.parameters())) + + assert is_same_weights + + params1 = sum(p.numel() for p in tp_layer.parameters()) + assert total_params == params1 + + for name, param in tp_layer.named_parameters(recurse=False): + param.partition([param]) + + tp_params2 = sum(p.numel() for p in tp_layer.parameters()) 
+ + assert total_params // tp_size == tp_params2 + + +def dummy_init_engine(config): + # This is a dummy initialization function for the DeepSpeed engine. + # We only need to use the config to initialize the distributed settings for the test. + model = SequentialLinearModel(hidden_dim=8) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config) + + +def prepare_tp_model(hidden_dim, nlayers, linear_indices, allreduce_indices, group, return_global_copy=False): + model = SequentialLinearModel(hidden_dim=hidden_dim, nlayers=nlayers).to(preferred_dtype()) + base_model = None + if return_global_copy: + base_model = deepcopy(model) + for i in linear_indices: + layer = LinearLayer(model.linears[i], group) + model.linears[i] = layer + + for i in allreduce_indices: + layer = LinearAllreduce(model.linears[i], group) + model.linears[i] = layer + + return model, base_model + + +@pytest.mark.parametrize("zero_stage", [0, 1]) +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestSave(DistributedTest): + + world_size = 4 + reuse_dist_env = True + + def test_save_original_weight(self, tp_size: int, zero_stage: int): + skip_on_device() + hidden_dim = 64 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": zero_stage, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + dummy_init_engine(config_dict) + torch.manual_seed(42) + + model, base_model = prepare_tp_model(hidden_dim, + 8, [2, 5], [3, 6], + groups.get_tensor_model_parallel_group(), + return_global_copy=True) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + + cur_params_numel = sum(p.numel() for p in model.parameters()) + base_params_numel = sum(p.numel() for p in base_model.parameters()) + assert cur_params_numel < base_params_numel + + tp_state_dict = model._consolidated_16bit_state_dict() + + def compare_state_dicts(state_dict1, state_dict2): + if state_dict1.keys() != state_dict2.keys(): + print("The state_dicts have different keys!") + return False + + for key in state_dict1: + if not torch.allclose(state_dict1[key], state_dict2[key], atol=1e-3): + assert state_dict1[key].device == "cpu" + print(f"Parameters for {key} are different!") + return False + + return True + + base_state_dict = base_model.state_dict() + if dist.get_rank() == 0: + # we should consider the case when zero3 is used in the future. + assert compare_state_dicts(base_state_dict, tp_state_dict), f"State_dict is not the same!" 
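+        # Only rank 0 materializes the consolidated 16-bit state_dict; other ranks return None.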
+        else:
+            assert tp_state_dict is None, f"only rank0 should have the state_dict"
+
+    def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int):
+        skip_on_device()
+        hidden_dim = 64
+        set_autotp_mode(training=True)
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-3
+                }
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+            },
+            "tensor_parallel": {
+                "autotp_size": tp_size
+            },
+            "scheduler": {
+                "type": "WarmupLR",
+                "params": {
+                    "warmup_min_lr": 0,
+                    "warmup_max_lr": 0.001,
+                    "warmup_num_steps": 1000
+                }
+            }
+        }
+
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True}
+        elif preferred_dtype() is torch.bfloat16:
+            config_dict["bf16"] = {"enabled": True}
+
+        dummy_init_engine(config_dict)
+
+        trained_model, _ = prepare_tp_model(hidden_dim, 8, [2, 5], [3, 6], groups.get_tensor_model_parallel_group())
+        loaded_model, _ = prepare_tp_model(hidden_dim, 8, [2, 5], [3, 6], groups.get_tensor_model_parallel_group())
+
+        trained_model, _, _, _ = deepspeed.initialize(model=trained_model,
+                                                      model_parameters=trained_model.parameters(),
+                                                      config=config_dict)
+        torch.manual_seed(42)
+
+        data_loader = random_dataloader(model=trained_model,
+                                        total_samples=3,
+                                        hidden_dim=hidden_dim,
+                                        device=trained_model.device,
+                                        dtype=preferred_dtype())
+        ckpt_path = os.path.join(tmpdir, 'tp_saved_checkpoint')
+        for i, batch in enumerate(data_loader):
+            batch[0].requires_grad = True
+            loss = trained_model(batch[0], batch[1])
+            loss = loss
+            trained_model.backward(loss)
+            trained_model.step()
+        trained_model.save_checkpoint(ckpt_path)
+
+        loaded_model, _, _, _ = deepspeed.initialize(model=loaded_model,
+                                                     model_parameters=loaded_model.parameters(),
+                                                     config=config_dict)
+        loaded_model.load_checkpoint(ckpt_path, load_optimizer_states=True, load_lr_scheduler_states=True)
+        compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16=(preferred_dtype() == torch.float16))
+        compare_lr_scheduler_states(trained_model, loaded_model)
+
+
+@pytest.mark.parametrize("zero_stage", [0, 1])
+@pytest.mark.parametrize("tp_size", [2, 4])
+class TestTpGradNorm(DistributedTest):
+
+    world_size = 4
+    reuse_dist_env = True
+
+    def test(self, tp_size: int, zero_stage: int):
+        skip_on_device()
+        hidden_dim = 64
+        set_autotp_mode(training=True)
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-6
+                }
+            },
+            "tensor_parallel": {
+                "autotp_size": tp_size
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+            }
+        }
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True}
+        elif preferred_dtype() is torch.bfloat16:
+            if zero_stage == 0:
+                pytest.skip(
+                    "This test has overflowing data and needs an overflow-skip mechanism in BF16_optimizer"
+                )
+            config_dict["bf16"] = {"enabled": True}
+
+        torch.manual_seed(42)
+
+        dummy_init_engine(config=config_dict)
+        tp_model, base_model = prepare_tp_model(hidden_dim,
+                                                8, [2, 5], [3, 6],
+                                                groups.get_tensor_model_parallel_group(),
+                                                return_global_copy=True)
+
+        base_model, base_optimizer, _, _ = deepspeed.initialize(model=base_model,
+                                                                model_parameters=base_model.parameters(),
+                                                                config=config_dict)
+        data_loader = random_dataloader(model=base_model,
+                                        total_samples=20,
+                                        hidden_dim=hidden_dim,
+                                        device=base_model.device,
+                                        dtype=preferred_dtype())
+
+        for i, batch in enumerate(data_loader):
+            batch[0].requires_grad = True
+            loss = base_model(batch[0],
batch[1]) + loss = loss + base_model.backward(loss) + base_model.step() + + base_norm = base_optimizer._global_grad_norm + + base_model.destroy() + + tp_model, tp_optimizer, _, _ = deepspeed.initialize(model=tp_model, + model_parameters=tp_model.parameters(), + config=config_dict) + for i, batch in enumerate(data_loader): + batch[0].requires_grad = True + loss = tp_model(batch[0], batch[1]) + loss = loss + tp_model.backward(loss) + tp_model.step() + + tp_norm = tp_optimizer._global_grad_norm + + assert math.isclose(base_norm, tp_norm, abs_tol=1e-3) + tp_params_numel = sum(p.numel() for p in tp_model.parameters()) + base_params_numel = sum(p.numel() for p in base_model.parameters()) + assert tp_params_numel < base_params_numel
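+        # The grad-norm comparison above is meaningful because both training loops consumed
+        # the same dataloader, while the final assertions confirm the TP engine was really
+        # training on sharded weights (fewer local parameters than the dense baseline).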