From 57c93b22244935419ddf79cb7b6f6f1003d752ed Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:11:08 -0800 Subject: [PATCH 1/8] docs: add hf ckpt to faq, and include verl apis in the website --- .readthedocs.yaml | 4 +++- README.md | 3 ++- docs/faq/faq.rst | 5 +++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 2df37965..1ba674e3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,4 +13,6 @@ sphinx: python: install: - - requirements: docs/requirements-docs.txt \ No newline at end of file + - requirements: docs/requirements-docs.txt + - method: pip + path: . diff --git a/README.md b/README.md index bcf74724..2807b302 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,8 @@ verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The - [Logic R1](https://github.com/Unakar/Logic-RL): a reproduced DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. - [deepscaler](https://github.com/agentica-project/deepscaler): iterative context scaling with GRPO - [critic-rl](https://github.com/HKUNLP/critic-rl): Teaching Language Models to Critique via Reinforcement Learning -- [Easy-R1](https://github.com/hiyouga/EasyR1): Multi-Modality RL +- [Easy-R1](https://github.com/hiyouga/EasyR1): Multi-Modality RL training framework +- [Self-rewarding correction for mathematical reasoning](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with generative reward models ## Contribution Guide Contributions from the community are welcome! diff --git a/docs/faq/faq.rst b/docs/faq/faq.rst index d27dc9f6..dc0bbd2f 100644 --- a/docs/faq/faq.rst +++ b/docs/faq/faq.rst @@ -55,3 +55,8 @@ Please set the following environment variable. The env var must be set before th export VLLM_ATTENTION_BACKEND=XFORMERS If in doubt, print this env var in each rank to make sure it is properly set. + +Checkpoints +------------------------ + +If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``scripts/model_merger.py``. 
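A minimal sketch of the checkpoint conversion described in the new FAQ entry, assuming an FSDP-sharded actor checkpoint and illustrative flag names — the exact arguments of ``scripts/model_merger.py`` are not confirmed here, so check ``python scripts/model_merger.py --help`` before running:

    # Hypothetical example: merge sharded actor weights from one training step
    # into a Hugging Face safetensors model. Paths and flags below are assumptions.
    python scripts/model_merger.py \
        --backend fsdp \
        --local_dir checkpoints/verl_example/global_step_100/actor \
        --target_dir checkpoints/verl_example/huggingface

The resulting directory can then be loaded with the usual Hugging Face ``from_pretrained`` flow.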
From c607ad54d0b26e901f4f13cc2c233c733b3a6134 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:20:57 -0800 Subject: [PATCH 2/8] fix rust in env --- .readthedocs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1ba674e3..29b33041 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,7 @@ build: os: ubuntu-22.04 tools: python: "3.8" + rust: "1.70" sphinx: configuration: docs/conf.py From 21ffd047d3cd95adf7ce892c14385d9759992eec Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:26:53 -0800 Subject: [PATCH 3/8] fix tokenizer --- docs/requirements-docs.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 439f85ef..49ecc0af 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -6,4 +6,7 @@ sphinx-markdown-tables # theme default rtd # crate-docs-theme -sphinx-rtd-theme \ No newline at end of file +sphinx-rtd-theme + +# pin tokenizers version to avoid env_logger version req +tokenizers==0.19.1 From 444bbb21f1d607ca38c0b56042065bade4a16cc2 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:33:55 -0800 Subject: [PATCH 4/8] fix type --- verl/protocol.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/verl/protocol.py b/verl/protocol.py index 737ec140..c1c10a74 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -84,7 +84,7 @@ def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> Ten return tensor_dict1 -def union_numpy_dict(tensor_dict1: dict[np.ndarray], tensor_dict2: dict[np.ndarray]) -> dict[np.ndarray]: +def union_numpy_dict(tensor_dict1: Dict[np.ndarray], tensor_dict2: Dict[np.ndarray]) -> Dict[np.ndarray]: for key, val in tensor_dict2.items(): if key in tensor_dict1: assert isinstance(tensor_dict2[key], np.ndarray) @@ -448,19 +448,17 @@ def union(self, other: 'DataProto') -> 'DataProto': return self def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None): - """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch + r"""Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details. + Args: - mini_batch_size (int): mini-batch size when iterating the dataset. We require that - ``batch.batch_size[0] % mini_batch_size == 0`` + mini_batch_size (int): mini-batch size when iterating the dataset. We require that ``batch.batch_size[0] % mini_batch_size == 0``. epochs (int): number of epochs when iterating the dataset. - dataloader_kwargs: internally, it returns a DataLoader over the batch. - The dataloader_kwargs is the kwargs passed to the DataLoader + dataloader_kwargs (Any): internally, it returns a DataLoader over the batch. The dataloader_kwargs is the kwargs passed to the DataLoader. Returns: - Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is - ``self.batch.batch_size * epochs // mini_batch_size`` + Iterator: an iterator that yields a mini-batch data at a time. 
The total number of iteration steps is ``self.batch.batch_size * epochs // mini_batch_size`` """ assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0" # we can directly create a dataloader from TensorDict From 38890111f361dafc0649cbfc6841ca070e3d5790 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:38:56 -0800 Subject: [PATCH 5/8] fix type --- verl/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/verl/protocol.py b/verl/protocol.py index c1c10a74..e0656e69 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -84,7 +84,7 @@ def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> Ten return tensor_dict1 -def union_numpy_dict(tensor_dict1: Dict[np.ndarray], tensor_dict2: Dict[np.ndarray]) -> Dict[np.ndarray]: +def union_numpy_dict(tensor_dict1: Dict[str, np.ndarray], tensor_dict2: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: for key, val in tensor_dict2.items(): if key in tensor_dict1: assert isinstance(tensor_dict2[key], np.ndarray) From fa5d923197bf929f8413f2a114d2e8c94d17a407 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:46:32 -0800 Subject: [PATCH 6/8] fix type --- verl/protocol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/verl/protocol.py b/verl/protocol.py index e0656e69..463fb6ee 100644 --- a/verl/protocol.py +++ b/verl/protocol.py @@ -97,7 +97,7 @@ def union_numpy_dict(tensor_dict1: Dict[str, np.ndarray], tensor_dict2: Dict[str return tensor_dict1 -def list_of_dict_to_dict_of_list(list_of_dict: list[dict]): +def list_of_dict_to_dict_of_list(list_of_dict: List[Dict]): if len(list_of_dict) == 0: return {} keys = list_of_dict[0].keys() @@ -148,7 +148,7 @@ def unfold_batch_dim(data: 'DataProto', batch_dims=2): return DataProto(batch=tensor, non_tensor_batch=non_tensor_new, meta_info=data.meta_info) -def collate_fn(x: list['DataProtoItem']): +def collate_fn(x: List['DataProtoItem']): batch = [] non_tensor_batch = [] for data in x: From 6c97e56150fa3649d4ac5734a7e492bd2aa2f432 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 10:50:57 -0800 Subject: [PATCH 7/8] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2807b302..ae1c095e 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The - [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning agent training framework - [Logic R1](https://github.com/Unakar/Logic-RL): a reproduced DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. 
- [deepscaler](https://github.com/agentica-project/deepscaler): iterative context scaling with GRPO -- [critic-rl](https://github.com/HKUNLP/critic-rl): Teaching Language Models to Critique via Reinforcement Learning +- [Teaching Language Models to Critique via Reinforcement Learning](https://github.com/HKUNLP/critic-rl): LLM critics for code generation - [Easy-R1](https://github.com/hiyouga/EasyR1): Multi-Modality RL training framework - [Self-rewarding correction for mathematical reasoning](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with generative reward models From 169e226bf2876c57f6dc63f4f17b1d28da43515d Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 28 Feb 2025 13:28:03 -0800 Subject: [PATCH 8/8] add Search-R1 --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index ae1c095e..b1d9ba08 100644 --- a/README.md +++ b/README.md @@ -118,19 +118,20 @@ If you find the project helpful, please cite: verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and supported by Anyscale, Bytedance, LMSys.org, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, and University of Hong Kong. ## Awesome work using verl -- [Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization](https://arxiv.org/abs/2410.09302) -- [Flaming-hot Initiation with Regular Execution Sampling for Large Language Models](https://arxiv.org/abs/2410.21236) -- [Process Reinforcement Through Implicit Rewards](https://github.com/PRIME-RL/PRIME/) -- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of DeepSeek R1 Zero recipe for reasoning tasks -- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning agent training framework -- [Logic R1](https://github.com/Unakar/Logic-RL): a reproduced DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. +- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of **DeepSeek R1 Zero** recipe for reasoning tasks +- [PRIME](https://github.com/PRIME-RL/PRIME): Process reinforcement through implicit rewards +- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning **agent** training framework +- [Logic-RL](https://github.com/Unakar/Logic-RL): a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. 
- [deepscaler](https://github.com/agentica-project/deepscaler): iterative context scaling with GRPO -- [Teaching Language Models to Critique via Reinforcement Learning](https://github.com/HKUNLP/critic-rl): LLM critics for code generation -- [Easy-R1](https://github.com/hiyouga/EasyR1): Multi-Modality RL training framework -- [Self-rewarding correction for mathematical reasoning](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with generative reward models +- [critic-rl](https://github.com/HKUNLP/critic-rl): LLM critics for code generation +- [Easy-R1](https://github.com/hiyouga/EasyR1): **Multi-modal** RL training framework +- [self-rewarding-reasoning-LLM](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with **generative reward models** +- [Search-R1](https://github.com/PeterGriffinJin/Search-R1): RL with reasoning and **searching (tool-call)** interleaved LLMs +- [DQO](https://arxiv.org/abs/2410.09302): Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization +- [FIRE](https://arxiv.org/abs/2410.21236): Flaming-hot initiation with regular execution sampling for large language models ## Contribution Guide -Contributions from the community are welcome! +Contributions from the community are welcome! Please checkout our [roadmap](https://github.com/volcengine/verl/issues/22) and [release plan](https://github.com/volcengine/verl/issues/354). ### Code formatting We use yapf (Google style) to enforce strict code formatting when reviewing PRs. To reformat you code locally, make sure you installed **latest** `yapf`