Merge pull request #116 from invoke-ai/vae-mixed-precision
Improve mixed_precision docs
RyanJDick authored Apr 25, 2024
2 parents f0963fc + d0abb4e commit d8a954b
Showing 7 changed files with 102 additions and 38 deletions.
32 changes: 12 additions & 20 deletions src/invoke_training/_shared/stable_diffusion/validation.py
@@ -38,8 +38,6 @@ def generate_validation_images_sd(  # noqa: C901
     """Generate validation images for the purpose of tracking image generation behaviour on fixed prompts throughout
     training.
     """
-    logger.info("Generating validation images.")
-
     # Record original model devices so that we can restore this state after running the pipeline with CPU model
     # offloading.
     unet_device = unet.device
@@ -68,6 +66,9 @@ def generate_validation_images_sd(  # noqa: C901
 
     validation_images = ValidationImages(images=[], epoch=epoch, step=step)
 
+    validation_step_dir = os.path.join(out_dir, "validation", f"epoch_{epoch:0>8}-step_{step:0>8}")
+    logger.info(f"Generating validation images ({validation_step_dir}).")
+
     # Run inference.
     with torch.no_grad():
         for prompt_idx in range(len(config.validation_prompts)):
@@ -96,15 +97,10 @@ def generate_validation_images_sd(  # noqa: C901
             )
 
             # Save images to disk.
-            validation_dir = os.path.join(
-                out_dir,
-                "validation",
-                f"epoch_{epoch:0>8}-step_{step:0>8}",
-                f"prompt_{prompt_idx:0>4}",
-            )
-            os.makedirs(validation_dir)
+            validation_prompt_dir = os.path.join(validation_step_dir, f"prompt_{prompt_idx:0>4}")
+            os.makedirs(validation_prompt_dir)
             for image_idx, image in enumerate(images):
-                image_path = os.path.join(validation_dir, f"{image_idx:0>4}.jpg")
+                image_path = os.path.join(validation_prompt_dir, f"{image_idx:0>4}.jpg")
                 validation_images.images.append(
                     ValidationImage(file_path=image_path, prompt=positive_prompt, image_idx=image_idx)
                 )
@@ -160,8 +156,6 @@ def generate_validation_images_sdxl(  # noqa: C901
     """Generate validation images for the purpose of tracking image generation behaviour on fixed prompts throughout
     training.
     """
-    logger.info("Generating validation images.")
-
     # Record original model devices so that we can restore this state after running the pipeline with CPU model
     # offloading.
     unet_device = unet.device
@@ -189,6 +183,9 @@ def generate_validation_images_sdxl(  # noqa: C901
 
     validation_images = ValidationImages(images=[], epoch=epoch, step=step)
 
+    validation_step_dir = os.path.join(out_dir, "validation", f"epoch_{epoch:0>8}-step_{step:0>8}")
+    logger.info(f"Generating validation images ({validation_step_dir}).")
+
     # Run inference.
     with torch.no_grad():
         for prompt_idx in range(len(config.validation_prompts)):
@@ -217,15 +214,10 @@ def generate_validation_images_sdxl(  # noqa: C901
             )
 
             # Save images to disk.
-            validation_dir = os.path.join(
-                out_dir,
-                "validation",
-                f"epoch_{epoch:0>8}-step_{step:0>8}",
-                f"prompt_{prompt_idx:0>4}",
-            )
-            os.makedirs(validation_dir)
+            validation_prompt_dir = os.path.join(validation_step_dir, f"prompt_{prompt_idx:0>4}")
+            os.makedirs(validation_prompt_dir)
             for image_idx, image in enumerate(images):
-                image_path = os.path.join(validation_dir, f"{image_idx:0>4}.jpg")
+                image_path = os.path.join(validation_prompt_dir, f"{image_idx:0>4}.jpg")
                 validation_images.images.append(
                     ValidationImage(file_path=image_path, prompt=positive_prompt, image_idx=image_idx)
                 )
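The refactor above replaces the per-prompt path construction with a shared `validation_step_dir` that is also logged up front, so the log now points at the exact output directory instead of the previous generic "Generating validation images." message. A minimal sketch of the resulting on-disk layout (the `out_dir`, epoch, step, and index values below are hypothetical):

```python
import os

# Hypothetical run values, for illustration only.
out_dir = "output/my_run"
epoch, step = 3, 1250
prompt_idx, image_idx = 0, 0

# Mirrors the refactored logic: one directory per validation step, with a
# subdirectory per validation prompt.
validation_step_dir = os.path.join(out_dir, "validation", f"epoch_{epoch:0>8}-step_{step:0>8}")
validation_prompt_dir = os.path.join(validation_step_dir, f"prompt_{prompt_idx:0>4}")
image_path = os.path.join(validation_prompt_dir, f"{image_idx:0>4}.jpg")

print(image_path)
# output/my_run/validation/epoch_00000003-step_00001250/prompt_0000/0000.jpg
```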
17 changes: 14 additions & 3 deletions src/invoke_training/pipelines/_experimental/sd_dpo_lora/config.py
@@ -151,9 +151,20 @@ class SdDirectPreferenceOptimizationLoraConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use ('no','fp16','bf16 or 'fp8'). This value is passed to Hugging Face Accelerate.
-    See accelerate.Accelerator for more details.
-    """
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See `accelerate.Accelerator` for more details.
+    """  # noqa: E501
 
     xformers: bool = False
     """If true, use xformers for more efficient attention blocks.
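For reference, this value is forwarded to Hugging Face Accelerate, which then drives the dtype used for the frozen models. A minimal sketch of the common pattern (the mode string and the dtype mapping below are illustrative, not this repository's exact code):

```python
import torch
from accelerate import Accelerator

mixed_precision = "bf16"  # one of "no", "fp16", "bf16", "fp8"; assumes a bf16-capable GPU
accelerator = Accelerator(mixed_precision=mixed_precision)

# Map the mode to a torch dtype for casting the non-trainable models; the
# trainable parameters stay in float32 for numerical stability.
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
    weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16
```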
19 changes: 16 additions & 3 deletions src/invoke_training/pipelines/stable_diffusion/lora/config.py
@@ -111,9 +111,22 @@ class SdLoraConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use ('no','fp16','bf16 or 'fp8'). This value is passed to Hugging Face Accelerate.
-    See accelerate.Accelerator for more details.
-    """
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See
+    [`accelerate.Accelerator.mixed_precision`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.mixed_precision)
+    for more details.
+    """  # noqa: E501
 
     xformers: bool = False
     """If true, use xformers for more efficient attention blocks.
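The cast-the-frozen-models, keep-trainables-in-float32 split that the new docstring describes can be illustrated as follows. A self-contained sketch; the two `nn.Linear` modules are stand-ins for the pipeline's real models, not its actual code:

```python
import torch

def cast_non_trainable(module: torch.nn.Module, dtype: torch.dtype) -> None:
    """Cast a fully frozen model to a reduced-precision dtype in place."""
    assert not any(p.requires_grad for p in module.parameters()), "module has trainable params"
    module.to(dtype=dtype)

# Frozen components (e.g. the VAE and text encoder) run in reduced precision...
frozen_model = torch.nn.Linear(4, 4).requires_grad_(False)  # stand-in for a frozen model
cast_non_trainable(frozen_model, torch.bfloat16)

# ...while trainable weights (e.g. the LoRA layers) remain float32.
trainable_model = torch.nn.Linear(4, 4)  # stand-in for trainable weights
assert all(p.dtype == torch.float32 for p in trainable_model.parameters())
```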
17 changes: 14 additions & 3 deletions src/invoke_training/pipelines/stable_diffusion/textual_inversion/config.py
@@ -117,11 +117,22 @@ class SdTextualInversionConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use. This value is passed to Hugging Face Accelerate.
-    See
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See
     [`accelerate.Accelerator.mixed_precision`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.mixed_precision)
     for more details.
-    """
+    """  # noqa: E501
 
     xformers: bool = False
     """If `True`, use xformers for more efficient attention blocks.
19 changes: 16 additions & 3 deletions src/invoke_training/pipelines/stable_diffusion_xl/lora/config.py
@@ -111,9 +111,22 @@ class SdxlLoraConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use ('no','fp16','bf16 or 'fp8'). This value is passed to Hugging Face Accelerate.
-    See accelerate.Accelerator for more details.
-    """
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See
+    [`accelerate.Accelerator.mixed_precision`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.mixed_precision)
+    for more details.
+    """  # noqa: E501
 
     xformers: bool = False
     """If true, use xformers for more efficient attention blocks.
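Following the recommendations repeated in these docstrings, a caller could choose the mode by probing the GPU rather than hard-coding it. A hedged sketch, not part of this commit:

```python
import torch

def pick_mixed_precision() -> str:
    """Heuristic: prefer bf16 where supported, fall back to fp16 on CUDA, else disable."""
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return "bf16"
    if torch.cuda.is_available():
        return "fp16"
    return "no"

print(pick_mixed_precision())  # e.g. "bf16" on Ampere or newer, "fp16" on older CUDA GPUs, "no" on CPU
```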
19 changes: 16 additions & 3 deletions src/invoke_training/pipelines/stable_diffusion_xl/lora_and_textual_inversion/config.py
@@ -145,9 +145,22 @@ class SdxlLoraAndTextualInversionConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use ('no','fp16','bf16 or 'fp8'). This value is passed to Hugging Face Accelerate.
-    See accelerate.Accelerator for more details.
-    """
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See
+    [`accelerate.Accelerator.mixed_precision`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.mixed_precision)
+    for more details.
+    """  # noqa: E501
 
     xformers: bool = False
     """If true, use xformers for more efficient attention blocks.
17 changes: 14 additions & 3 deletions src/invoke_training/pipelines/stable_diffusion_xl/textual_inversion/config.py
@@ -117,11 +117,22 @@ class SdxlTextualInversionConfig(BasePipelineConfig):
     """
 
     mixed_precision: Literal["no", "fp16", "bf16", "fp8"] = "no"
-    """The mixed precision mode to use. This value is passed to Hugging Face Accelerate.
-    See
+    """The mixed precision mode to use.
+
+    If mixed precision is enabled, then all non-trainable parameters will be cast to the specified precision. The
+    trainable parameters are always kept in float32 precision to avoid issues with numerical stability.
+
+    Recommendations:
+
+    - `"no"`: Use this mode if you have plenty of VRAM available.
+    - `"bf16"`: Use this mode if you have limited VRAM and a GPU that supports bfloat16.
+    - `"fp16"`: Use this mode if you have limited VRAM and a GPU that does not support bfloat16.
+    - `"fp8"`: You are likely to run into numerical stability issues with this mode. Only use this mode if you know what you are doing and are willing to work through some issues.
+
+    This value is passed to Hugging Face Accelerate. See
     [`accelerate.Accelerator.mixed_precision`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.mixed_precision)
     for more details.
-    """
+    """  # noqa: E501
 
     xformers: bool = False
     """If `True`, use xformers for more efficient attention blocks.
