
Enable optional checkpoint at loading #819

Merged: 12 commits, Feb 7, 2025
16 changes: 16 additions & 0 deletions tests/integration_tests.py
@@ -418,6 +418,22 @@ def build_test_list():
"test_generate",
ngpu=2,
),
OverrideDefinitions(
Contributor:
There are two tests missing:

  1. unit tests (on CPU). Passing the test here doesn't mean the behavior is correct. We should add a unit test similar to https://github.com/pytorch/torchtitan/blob/690f299d37c5f6d34273762c0d650888a754d3c0/tests/unit_tests/test_dataset_checkpointing.py
  2. The test here only covers the cmd line arg override, but it could be problematic if specified in toml. We need to add a test similar to
    def test_parse_pp_split_points(self):
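A config-parsing test along those lines might look roughly like the sketch below; the toml fixture, the test name, and the --job.config_file flag follow the existing test_parse_pp_split_points pattern and are assumptions here, not code from this PR:

import tempfile

from torchtitan.config_manager import JobConfig


def test_parse_exclude_from_loading():
    # Write a minimal toml that sets the new option, then check it parses into a list.
    toml_keys = ["optimizer", "dataloader"]
    with tempfile.NamedTemporaryFile(mode="w", suffix=".toml") as f:
        f.write('[checkpoint]\nexclude_from_loading = "%s"\n' % ",".join(toml_keys))
        f.flush()
        config = JobConfig()
        config.parse_args(["--job.config_file", f.name])
    assert config.checkpoint.exclude_from_loading == toml_keys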

Contributor Author:
Added the test here with comments. In the optional checkpoint test, we save at [dp:4] and load at [dp:2, tp:2]; the dataloader should be excluded in loading, otherwise it would raise an error for the dp_degree mismatch.

    [
        [
            "--checkpoint.enable_checkpoint",
            "--training.steps 10",
        ],
        [
            "--checkpoint.enable_checkpoint",
            "--checkpoint.exclude_from_loading lr_scheduler,dataloader,optimizer",
            "--training.tensor_parallel_degree 2",
            "--training.steps 20",
        ],
    ],
    "Optional checkpoint",
    "optional_checkpoint",
),
Contributor Author:
Added the integration test here, especially to check that optionally excluding the dataloader from checkpoint loading avoids the dp_degree mismatch error between saving and loading.

]
return integration_tests_flavors
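For context, a minimal illustration of the mismatch the second test step avoids; the per-dp-rank keying of the saved dataloader state is an assumption made purely for illustration:

# Illustrative only: assume the dataloader checkpoints its state keyed by dp rank.
state_saved_at_dp4 = {f"dp_rank_{r}": b"..." for r in range(4)}   # first run, dp:4
ranks_expected_at_dp2 = {f"dp_rank_{r}" for r in range(2)}        # resume, dp:2 tp:2
# Ranks 2 and 3 have no consumer after resharding, so the resumed run cannot line
# up the saved state with its own dp world size, hence the dp_degree mismatch error.
missing = set(state_saved_at_dp4) - ranks_expected_at_dp2
print(missing)  # {'dp_rank_2', 'dp_rank_3'}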

21 changes: 15 additions & 6 deletions torchtitan/checkpoint.py
@@ -170,11 +170,8 @@ def __init__(
which is guaranteed for the model by correct pipeline splitting and for the optimizer by the flattening
support described in (1).

3. LR schedulers also index model states like optimizers and would need to be flattened properly to support
Contributor Author (@mori360, Feb 4, 2025):
lr_scheduler flattening was handled in #794

Contributor:
We should add a comment here to say the lr_scheduler resharding assumes that all lr_schedulers are the same.

resharding. Unfortunately, the implementations of different lr_schedulers do not follow a clear pattern like
optimizers do, so it's hard to write a generic 'flattener' utility.

TODO: This is currently unsolved and needs a fix.
3. LR schedulers also index model states like optimizers. Here we flatten the lr_schedulers by the ssumption that
Contributor:
Suggested change
3. LR schedulers also index model states like optimizers. Here we flatten the lr_schedulers by the ssumption that
3. LR schedulers also index model states like optimizers. Here we flatten the lr_schedulers with the assumption that

all lr_schedulers have the same state_dict.
"""
self.states = states
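A rough sketch of what flattening under that assumption amounts to (illustrative names only; the actual handling lives in this PR and #794): since every lr_scheduler is assumed to have the same state_dict, saving one copy and restoring it to all schedulers on load is sufficient.

class FlattenedLRSchedulers:
    """Illustrative wrapper: checkpoint one scheduler's state and restore it to all."""

    def __init__(self, lr_schedulers):
        self.lr_schedulers = lr_schedulers

    def state_dict(self):
        # Valid only under the assumption that every scheduler's state is identical.
        return self.lr_schedulers[0].state_dict()

    def load_state_dict(self, state_dict):
        for scheduler in self.lr_schedulers:
            scheduler.load_state_dict(state_dict)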

@@ -203,6 +200,11 @@ def __init__(

self.model_weights_only = ckpt_config.model_weights_only
self.export_dtype = TORCH_DTYPE_MAP[ckpt_config.export_dtype]
self.exclude_from_loading = (
[item.strip() for item in ckpt_config.exclude_from_loading]
Contributor:
can we do this strip in the definition of string_list?

if ckpt_config.exclude_from_loading
else []
Contributor:
why this branch? Isn't it already a list after split in string_list?

)

self.mp = None
if async_mode == AsyncMode.DISABLED:
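If the strip and empty-string handling moved into string_list, as the two comments above suggest, one possible shape is the following sketch (an illustration, not the code in this PR):

def string_list(raw_arg: str) -> list[str]:
    # Strip whitespace around each item and drop empty entries,
    # so "" -> [] and "a, b," -> ["a", "b"].
    return [item.strip() for item in raw_arg.split(",") if item.strip()]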
@@ -435,10 +437,17 @@ def load(self, step: int = -1) -> bool:
}
logger.info(f"Loading the checkpoint at step {step}.")
begin = time.monotonic()
shadow_states = {
Contributor:
can you explain more about the naming? I'd call it states_to_load

k: v for k, v in states.items() if k not in self.exclude_from_loading
}
for exclude_key in self.exclude_from_loading:
if exclude_key != "" and exclude_key not in states:
Contributor:
we should filter "" (and any empty space) out in string_list

raise ValueError(f"{exclude_key} not found in state_dict, skipping")
Contributor:
what do you mean by "skipping" when you raise an exception. Technically it should be "failing"?

dcp.load(
states,
shadow_states,
checkpoint_id=self._create_checkpoint_id(step),
)
states.update(shadow_states)
logger.info(
f"Finished loading the checkpoint in {time.monotonic() - begin:.2f} seconds."
)
20 changes: 20 additions & 0 deletions torchtitan/config_manager.py
@@ -511,6 +511,16 @@ def __init__(self):
default=-1,
help="Load the checkpoint at the specified step. If -1, load the latest checkpoint.",
)
self.parser.add_argument(
Contributor Author:
Currently checkpoint.exclude only supports excluding at loading; shall we use an argument like exclude_from_loading?

Contributor:
yes, exclude_from_loading is more explicit.

"--checkpoint.exclude_from_loading",
type=string_list,
Contributor:
shall we still do .strip and empty check in string_list?

default="",
Contributor:
The default should be []? If default is "", you'll always end up with [""] after string_split.
See https://docs.python.org/3.3/library/stdtypes.html

help="""
Exclude specific keys from being loaded from the checkpoint.
Provide a comma-separated list of keys to exclude, e.g. 'optimizer,lr_scheduler,dataloader'.
This will load the model only, excluding the specified keys.
""",
)
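The point the reviewer raises about the default is plain Python string semantics, easy to confirm:

# str.split never returns an empty list, so default="" would become [""] downstream
# instead of meaning "no keys excluded".
assert "".split(",") == [""]
assert "optimizer,lr_scheduler,dataloader".split(",") == [
    "optimizer",
    "lr_scheduler",
    "dataloader",
]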
# activation checkpointing configs
self.parser.add_argument(
"--activation_checkpoint.mode",
@@ -618,6 +628,13 @@ def parse_args(self, args_list: list = sys.argv[1:]):
exp["pipeline_parallel_split_points"] = string_list(
exp["pipeline_parallel_split_points"]
)
if (
"checkpoint" in args_dict
and "exclude_from_loading" in args_dict["checkpoint"]
and isinstance(args_dict["checkpoint"]["exclude_from_loading"], str)
):
ckpt = args_dict["checkpoint"]
ckpt["exclude_from_loading"] = string_list(ckpt["exclude_from_loading"])

# override args dict with cmd_args
cmd_args_dict = self._args_to_two_level_dict(cmd_args)
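To make the toml path concrete, a hedged example of what this normalization handles; importing string_list from config_manager and the exact dict shape are assumptions based on the surrounding code:

from torchtitan.config_manager import string_list

# A toml file containing
#   [checkpoint]
#   exclude_from_loading = "lr_scheduler,dataloader"
# surfaces here as a plain string under args_dict["checkpoint"], which the branch
# above converts to a list before the cmd-line override step.
args_dict = {"checkpoint": {"exclude_from_loading": "lr_scheduler,dataloader"}}
args_dict["checkpoint"]["exclude_from_loading"] = string_list(
    args_dict["checkpoint"]["exclude_from_loading"]
)
assert args_dict["checkpoint"]["exclude_from_loading"] == ["lr_scheduler", "dataloader"]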
@@ -665,6 +682,9 @@ def parse_args_from_command_line(
# since the inferred type is just 'list' and it ends up flattening
# e.g. from ["layers.0", "layers.1"] into ["l", "a", "y", "e", "r", "s", ".0", ...]
aux_parser.add_argument("--" + arg, type=string_list)
elif arg == "checkpoint.exclude_from_loading":
# same as above for checkpoint.exclude_from_loading
Contributor:
Suggested change
# same as above for checkpoint.exclude_from_loading
# similar to the case above

aux_parser.add_argument("--" + arg, type=string_list)
else:
aux_parser.add_argument("--" + arg, type=type(val))
