From 5766babd7df0f346517c2ce197970c271f4bf866 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 21:31:50 -0500 Subject: [PATCH 1/8] Flatten TrainingOutputConfig. --- ..._refinement_sd_pokemon_1x24gb_example.yaml | 3 +-- .../dpo_lora_sd_pickapic_1x24gb_example.yaml | 3 +-- ...inetune_lora_sd_pokemon_1x8gb_example.yaml | 3 +-- ...tune_lora_sdxl_pokemon_1x24gb_example.yaml | 3 +-- ...etune_lora_sdxl_pokemon_1x8gb_example.yaml | 3 +-- ...tual_inversion_sd_gnome_1x8gb_example.yaml | 3 +-- ...l_inversion_sdxl_gnome_1x24gb_example.yaml | 3 +-- .../config/shared/training_output_config.md | 1 - mkdocs.yml | 1 - .../config/pipelines/base_pipeline_config.py | 12 ++++++---- .../config/shared/training_output_config.py | 22 ------------------- .../dpo/diffusion_dpo_lora_sd.py | 8 +++---- .../stable_diffusion/finetune_lora_sd.py | 4 ++-- .../stable_diffusion/textual_inversion_sd.py | 8 +++---- .../stable_diffusion_xl/finetune_lora_sdxl.py | 4 ++-- .../textual_inversion_sdxl.py | 8 +++---- 16 files changed, 31 insertions(+), 58 deletions(-) delete mode 100644 docs/reference/config/shared/training_output_config.md delete mode 100644 src/invoke_training/config/shared/training_output_config.py diff --git a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml index abc58171..e39890e6 100644 --- a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml @@ -4,8 +4,7 @@ type: DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD seed: 1 -output: - base_output_dir: output/dpo +base_output_dir: output/dpo optimizer: learning_rate: 1e-4 diff --git a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml index 9b566a60..70264d8e 100644 --- a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml @@ -7,8 +7,7 @@ type: DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD seed: 1 -output: - base_output_dir: output/dpo +base_output_dir: output/dpo optimizer: learning_rate: 1e-4 diff --git a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml index 0f835525..ec9efcd2 100644 --- a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml @@ -9,8 +9,7 @@ type: FINETUNE_LORA_SD seed: 1 -output: - base_output_dir: output/finetune_lora_sd_pokemon/ +base_output_dir: output/finetune_lora_sd_pokemon/ optimizer: learning_rate: 1.0 diff --git a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml index 5f930080..d3d6dc77 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml @@ -8,8 +8,7 @@ # purposes. type: FINETUNE_LORA_SDXL seed: 1 -output: - base_output_dir: output/finetune_lora_sdxl_pokemon/ +base_output_dir: output/finetune_lora_sdxl_pokemon/ optimizer: learning_rate: 1.0 diff --git a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml index 4b8b6548..3b75a1fe 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml @@ -9,8 +9,7 @@ # - Achieve reasonable results *quickly* (<15mins) for demo purposes. type: FINETUNE_LORA_SDXL seed: 1 -output: - base_output_dir: output/finetune_lora_sdxl_pokemon/ +base_output_dir: output/finetune_lora_sdxl_pokemon/ optimizer: learning_rate: 1.0 diff --git a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml index 1e5e62b3..161145e5 100644 --- a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml +++ b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml @@ -4,8 +4,7 @@ type: TEXTUAL_INVERSION_SD seed: 1 -output: - base_output_dir: output/ti_sd_bruce_the_gnome +base_output_dir: output/ti_sd_bruce_the_gnome optimizer: learning_rate: 4e-3 diff --git a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml index f7595c60..9725b8a2 100644 --- a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml +++ b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml @@ -4,8 +4,7 @@ type: TEXTUAL_INVERSION_SDXL seed: 1 -output: - base_output_dir: output/ti_sdxl_bruce_the_gnome +base_output_dir: output/ti_sdxl_bruce_the_gnome optimizer: learning_rate: 2e-3 diff --git a/docs/reference/config/shared/training_output_config.md b/docs/reference/config/shared/training_output_config.md deleted file mode 100644 index 204d67a1..00000000 --- a/docs/reference/config/shared/training_output_config.md +++ /dev/null @@ -1 +0,0 @@ -::: invoke_training.config.shared.training_output_config diff --git a/mkdocs.yml b/mkdocs.yml index 4d0532db..1e3ae20b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -52,7 +52,6 @@ nav: - data_loader_config: reference/config/shared/data/data_loader_config.md - dataset_config: reference/config/shared/data/dataset_config.md - transform_config: reference/config/shared/data/transform_config.md - - training_output_config: reference/config/shared/training_output_config.md - optimizer_config: reference/config/shared/optimizer_config.md - Contributing: - contributing/development_environment.md diff --git a/src/invoke_training/config/pipelines/base_pipeline_config.py b/src/invoke_training/config/pipelines/base_pipeline_config.py index ad0d4fd2..35d95699 100644 --- a/src/invoke_training/config/pipelines/base_pipeline_config.py +++ b/src/invoke_training/config/pipelines/base_pipeline_config.py @@ -1,7 +1,7 @@ +import typing from typing import Optional from invoke_training.config.shared.config_base_model import ConfigBaseModel -from invoke_training.config.shared.training_output_config import TrainingOutputConfig class BasePipelineConfig(ConfigBaseModel): @@ -14,8 +14,12 @@ class BasePipelineConfig(ConfigBaseModel): set to `null`, training will be non-deterministic. """ - output: TrainingOutputConfig - """Configuration for the training run outputs (output directory, log format, checkpoint format, etc.). + base_output_dir: str + """The output directory where the training outputs (model checkpoints, logs, intermediate predictions) will be + written. A subdirectory will be created with a timestamp for each new training run. + """ - See [`TrainingOutputConfig`][invoke_training.config.shared.training_output_config.TrainingOutputConfig] for details. + report_to: typing.Literal["all", "tensorboard", "wandb", "comet_ml"] = "tensorboard" + """The integration to report results and logs to. This value is passed to Hugging Face Accelerate. See + `accelerate.Accelerator.log_with` for more details. """ diff --git a/src/invoke_training/config/shared/training_output_config.py b/src/invoke_training/config/shared/training_output_config.py deleted file mode 100644 index 8ffd430a..00000000 --- a/src/invoke_training/config/shared/training_output_config.py +++ /dev/null @@ -1,22 +0,0 @@ -import typing - -from invoke_training.config.shared.config_base_model import ConfigBaseModel - - -class TrainingOutputConfig(ConfigBaseModel): - """Configuration for a training run's output.""" - - base_output_dir: str - """The output directory where the training outputs (model checkpoints, logs, intermediate predictions) will be - written. A subdirectory will be created with a timestamp for each new training run. - """ - - report_to: typing.Optional[typing.Literal["all", "tensorboard", "wandb", "comet_ml"]] = "tensorboard" - """The integration to report results and logs to ('all', 'tensorboard', 'wandb', or 'comet_ml'). This value is - passed to Hugging Face Accelerate. See accelerate.Accelerator.log_with for more details. - """ - - save_model_as: typing.Literal["ckpt", "pt", "safetensors"] = "safetensors" - """The file type to save the model as. Note that "ckpt" and "pt" are alternative file extensions for the same file - format. - """ diff --git a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py index 84925580..31465b1b 100644 --- a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py +++ b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py @@ -187,12 +187,12 @@ def run_training(config: DirectPreferenceOptimizationLoRASDConfig): # noqa: C90 # ) # Create a timestamped directory for all outputs. - out_dir = os.path.join(config.output.base_output_dir, f"{time.time()}") + out_dir = os.path.join(config.base_output_dir, f"{time.time()}") ckpt_dir = os.path.join(out_dir, "checkpoints") os.makedirs(ckpt_dir) accelerator = initialize_accelerator( - out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.output.report_to + out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.report_to ) logger = initialize_logging(__name__, accelerator) @@ -426,14 +426,14 @@ def prep_peft_model(model, lr: float | None = None): epoch_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_epoch", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) step_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_step", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) diff --git a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py index 73a05ebe..af36b0fd 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py @@ -219,12 +219,12 @@ def run_training(config: FinetuneLoRASDConfig): # noqa: C901 # ) # Create a timestamped directory for all outputs. - out_dir = os.path.join(config.output.base_output_dir, f"{time.time()}") + out_dir = os.path.join(config.base_output_dir, f"{time.time()}") ckpt_dir = os.path.join(out_dir, "checkpoints") os.makedirs(ckpt_dir) accelerator = initialize_accelerator( - out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.output.report_to + out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.report_to ) logger = initialize_logging(__name__, accelerator) diff --git a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py index e037672c..725f5aa7 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py @@ -122,12 +122,12 @@ def _initialize_placeholder_tokens( def run_training(config: TextualInversionSDConfig): # noqa: C901 # Create a timestamped directory for all outputs. - out_dir = os.path.join(config.output.base_output_dir, f"{time.time()}") + out_dir = os.path.join(config.base_output_dir, f"{time.time()}") ckpt_dir = os.path.join(out_dir, "checkpoints") os.makedirs(ckpt_dir) accelerator = initialize_accelerator( - out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.output.report_to + out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.report_to ) logger = initialize_logging(__name__, accelerator) @@ -264,14 +264,14 @@ def run_training(config: TextualInversionSDConfig): # noqa: C901 epoch_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_epoch", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) step_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_step", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py index 5e3d9d7a..d8daae88 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py @@ -287,12 +287,12 @@ def run_training(config: FinetuneLoRASDXLConfig): # noqa: C901 # ) # Create a timestamped directory for all outputs. - out_dir = os.path.join(config.output.base_output_dir, f"{time.time()}") + out_dir = os.path.join(config.base_output_dir, f"{time.time()}") ckpt_dir = os.path.join(out_dir, "checkpoints") os.makedirs(ckpt_dir) accelerator = initialize_accelerator( - out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.output.report_to + out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.report_to ) logger = initialize_logging(__name__, accelerator) diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py index de0b4ef9..30764ef5 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py @@ -143,12 +143,12 @@ def _initialize_placeholder_tokens( def run_training(config: TextualInversionSDXLConfig): # noqa: C901 # Create a timestamped directory for all outputs. - out_dir = os.path.join(config.output.base_output_dir, f"{time.time()}") + out_dir = os.path.join(config.base_output_dir, f"{time.time()}") ckpt_dir = os.path.join(out_dir, "checkpoints") os.makedirs(ckpt_dir) accelerator = initialize_accelerator( - out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.output.report_to + out_dir, config.gradient_accumulation_steps, config.mixed_precision, config.report_to ) logger = initialize_logging(__name__, accelerator) @@ -294,14 +294,14 @@ def run_training(config: TextualInversionSDXLConfig): # noqa: C901 epoch_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_epoch", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) step_checkpoint_tracker = CheckpointTracker( base_dir=ckpt_dir, prefix="checkpoint_step", - extension=f".{config.output.save_model_as}", + extension=".safetensors", max_checkpoints=config.max_checkpoints, ) From 2bf88f409df6363e8fd8af888a2c98100cf36936 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 21:36:47 -0500 Subject: [PATCH 2/8] Remove unused train_unet_non_attention_blocks config param. --- .../config/pipelines/finetune_lora_config.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/invoke_training/config/pipelines/finetune_lora_config.py b/src/invoke_training/config/pipelines/finetune_lora_config.py index 8f348ebd..6fc5d831 100644 --- a/src/invoke_training/config/pipelines/finetune_lora_config.py +++ b/src/invoke_training/config/pipelines/finetune_lora_config.py @@ -60,12 +60,6 @@ class LoRATrainingConfig(BasePipelineConfig): """The learning rate to use for the UNet model. If set, this overrides the optimizer's default learning rate. """ - train_unet_non_attention_blocks: bool = False - """Whether to inject LoRA layers into the non-attention UNet blocks for training. Enabling will produce a more - expressive LoRA model at the cost of slower training, higher training VRAM requirements, and a larger LoRA weight - file. - """ - lora_rank_dim: int = 4 """The rank dimension to use for the LoRA layers. Increasing the rank dimension increases the model's expressivity, but also increases the size of the generated LoRA model. From f28c0fb58d1e91538a1f2473aeebcf3dfa8edf6f Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 22:01:50 -0500 Subject: [PATCH 3/8] Flatten OptimizerConfig. --- ...inetune_lora_sd_pokemon_1x8gb_example.yaml | 10 ++--- ...tune_lora_sdxl_pokemon_1x24gb_example.yaml | 10 ++--- ...etune_lora_sdxl_pokemon_1x8gb_example.yaml | 12 +++--- ...tual_inversion_sd_gnome_1x8gb_example.yaml | 7 ++-- ...l_inversion_sdxl_gnome_1x24gb_example.yaml | 7 ++-- .../config/_experimental/dpo/config.py | 3 +- .../config/pipelines/finetune_lora_config.py | 16 ++++++-- .../pipelines/textual_inversion_config.py | 26 ++++++------- .../shared/optimizer/optimizer_config.py | 37 ++++++------------- .../dpo/diffusion_dpo_lora_sd.py | 8 ++-- .../_shared/optimizer/optimizer_utils.py | 22 +++++------ .../stable_diffusion/finetune_lora_sd.py | 8 ++-- .../stable_diffusion/textual_inversion_sd.py | 6 +-- .../stable_diffusion_xl/finetune_lora_sdxl.py | 8 ++-- .../textual_inversion_sdxl.py | 6 +-- 15 files changed, 86 insertions(+), 100 deletions(-) diff --git a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml index ec9efcd2..5deb8aff 100644 --- a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml @@ -12,13 +12,11 @@ seed: 1 base_output_dir: output/finetune_lora_sd_pokemon/ optimizer: + optimizer_type: Prodigy learning_rate: 1.0 - - optimizer: - optimizer_type: Prodigy - weight_decay: 0.01 - use_bias_correction: True - safeguard_warmup: True + weight_decay: 0.01 + use_bias_correction: True + safeguard_warmup: True data_loader: type: IMAGE_CAPTION_SD_DATA_LOADER diff --git a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml index d3d6dc77..cded2b83 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml @@ -11,13 +11,11 @@ seed: 1 base_output_dir: output/finetune_lora_sdxl_pokemon/ optimizer: + optimizer_type: Prodigy learning_rate: 1.0 - - optimizer: - optimizer_type: Prodigy - weight_decay: 0.01 - use_bias_correction: True - safeguard_warmup: True + weight_decay: 0.01 + use_bias_correction: True + safeguard_warmup: True data_loader: type: IMAGE_CAPTION_SD_DATA_LOADER diff --git a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml index 3b75a1fe..558ac985 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml @@ -9,16 +9,14 @@ # - Achieve reasonable results *quickly* (<15mins) for demo purposes. type: FINETUNE_LORA_SDXL seed: 1 -base_output_dir: output/finetune_lora_sdxl_pokemon/ +base_output_dir: output/finetune_lora_sdxl_pokemon/ optimizer: + optimizer_type: Prodigy learning_rate: 1.0 - - optimizer: - optimizer_type: Prodigy - weight_decay: 0.01 - use_bias_correction: True - safeguard_warmup: True + weight_decay: 0.01 + use_bias_correction: True + safeguard_warmup: True data_loader: type: IMAGE_CAPTION_SD_DATA_LOADER diff --git a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml index 161145e5..187d9dc2 100644 --- a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml +++ b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml @@ -7,12 +7,11 @@ seed: 1 base_output_dir: output/ti_sd_bruce_the_gnome optimizer: + optimizer_type: AdamW learning_rate: 4e-3 - lr_warmup_steps: 200 - lr_scheduler: cosine - optimizer: - optimizer_type: AdamW +lr_warmup_steps: 200 +lr_scheduler: cosine data_loader: type: TEXTUAL_INVERSION_SD_DATA_LOADER diff --git a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml index 9725b8a2..93e98087 100644 --- a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml +++ b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml @@ -7,12 +7,11 @@ seed: 1 base_output_dir: output/ti_sdxl_bruce_the_gnome optimizer: + optimizer_type: AdamW learning_rate: 2e-3 - lr_warmup_steps: 200 - lr_scheduler: cosine - optimizer: - optimizer_type: AdamW +lr_warmup_steps: 200 +lr_scheduler: cosine data_loader: type: TEXTUAL_INVERSION_SD_DATA_LOADER diff --git a/src/invoke_training/config/_experimental/dpo/config.py b/src/invoke_training/config/_experimental/dpo/config.py index dc3cd050..e34c516d 100644 --- a/src/invoke_training/config/_experimental/dpo/config.py +++ b/src/invoke_training/config/_experimental/dpo/config.py @@ -5,7 +5,6 @@ from invoke_training.config.pipelines.finetune_lora_config import LoRATrainingConfig from invoke_training.config.shared.config_base_model import ConfigBaseModel from invoke_training.config.shared.data.transform_config import SDImageTransformConfig -from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig class HFHubImagePairPreferenceDatasetConfig(ConfigBaseModel): @@ -37,7 +36,7 @@ class ImagePairPreferenceSDDataLoaderConfig(ConfigBaseModel): class DirectPreferenceOptimizationLoRASDConfig(LoRATrainingConfig): type: Literal["DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD"] = "DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD" - optimizer: OptimizerConfig + data_loader: ImagePairPreferenceSDDataLoaderConfig initial_lora: str | None = None diff --git a/src/invoke_training/config/pipelines/finetune_lora_config.py b/src/invoke_training/config/pipelines/finetune_lora_config.py index 6fc5d831..2804d4fa 100644 --- a/src/invoke_training/config/pipelines/finetune_lora_config.py +++ b/src/invoke_training/config/pipelines/finetune_lora_config.py @@ -1,3 +1,4 @@ +import typing from typing import Annotated, Literal, Optional, Union from pydantic import Field @@ -7,7 +8,7 @@ DreamboothSDDataLoaderConfig, ImageCaptionSDDataLoaderConfig, ) -from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer class LoRATrainingConfig(BasePipelineConfig): @@ -51,6 +52,8 @@ class LoRATrainingConfig(BasePipelineConfig): """Whether to add LoRA layers to the text encoder and train it. """ + optimizer: AdamOptimizer | ProdigyOptimizer = AdamOptimizer() + text_encoder_learning_rate: Optional[float] = None """The learning rate to use for the text encoder model. If set, this overrides the optimizer's default learning rate. @@ -60,6 +63,15 @@ class LoRATrainingConfig(BasePipelineConfig): """The learning rate to use for the UNet model. If set, this overrides the optimizer's default learning rate. """ + lr_scheduler: typing.Literal[ + "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup" + ] = "constant" + + lr_warmup_steps: int = 0 + """The number of warmup steps in the learning rate scheduler. Only applied to schedulers that support warmup. + See lr_scheduler. + """ + lora_rank_dim: int = 4 """The rank dimension to use for the LoRA layers. Increasing the rank dimension increases the model's expressivity, but also increases the size of the generated LoRA model. @@ -153,7 +165,6 @@ class LoRATrainingConfig(BasePipelineConfig): class FinetuneLoRASDConfig(LoRATrainingConfig): type: Literal["FINETUNE_LORA_SD"] = "FINETUNE_LORA_SD" - optimizer: OptimizerConfig data_loader: Annotated[ Union[ImageCaptionSDDataLoaderConfig, DreamboothSDDataLoaderConfig], Field(discriminator="type") ] @@ -161,7 +172,6 @@ class FinetuneLoRASDConfig(LoRATrainingConfig): class FinetuneLoRASDXLConfig(LoRATrainingConfig): type: Literal["FINETUNE_LORA_SDXL"] = "FINETUNE_LORA_SDXL" - optimizer: OptimizerConfig data_loader: Annotated[ Union[ImageCaptionSDDataLoaderConfig, DreamboothSDDataLoaderConfig], Field(discriminator="type") ] diff --git a/src/invoke_training/config/pipelines/textual_inversion_config.py b/src/invoke_training/config/pipelines/textual_inversion_config.py index f333212b..a2d7d48f 100644 --- a/src/invoke_training/config/pipelines/textual_inversion_config.py +++ b/src/invoke_training/config/pipelines/textual_inversion_config.py @@ -1,8 +1,9 @@ +import typing from typing import Literal, Optional from invoke_training.config.pipelines.base_pipeline_config import BasePipelineConfig from invoke_training.config.shared.data.data_loader_config import TextualInversionSDDataLoaderConfig -from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer class TextualInversionTrainingConfig(BasePipelineConfig): @@ -73,6 +74,17 @@ class TextualInversionTrainingConfig(BasePipelineConfig): For example, if you are training on a dataset of images of pokemon, you might use `pokemon sketch white background`. """ + optimizer: AdamOptimizer | ProdigyOptimizer = AdamOptimizer() + + lr_scheduler: typing.Literal[ + "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup" + ] = "constant" + + lr_warmup_steps: int = 0 + """The number of warmup steps in the learning rate scheduler. Only applied to schedulers that support warmup. + See lr_scheduler. + """ + cache_vae_outputs: bool = False """If True, the VAE will be applied to all of the images in the dataset before starting training and the results will be cached to disk. This reduces the VRAM requirements during training (don't have to keep the VAE in VRAM), and @@ -160,12 +172,6 @@ class TextualInversionSDConfig(TextualInversionTrainingConfig): """Must be `TEXTUAL_INVERSION_SD`. This is what differentiates training pipeline types. """ - optimizer: OptimizerConfig - """Configuration for the training optimizer (algorithm, learning rate, etc.). - - See [`OptimizerConfig`][invoke_training.config.shared.optimizer.optimizer_config.OptimizerConfig] for details. - """ - data_loader: TextualInversionSDDataLoaderConfig """The data configuration. @@ -180,12 +186,6 @@ class TextualInversionSDXLConfig(TextualInversionTrainingConfig): """Must be `TEXTUAL_INVERSION_SDXL`. This is what differentiates training pipeline types. """ - optimizer: OptimizerConfig - """Configuration for the training optimizer (algorithm, learning rate, etc.). - - See [`OptimizerConfig`][invoke_training.config.shared.optimizer.optimizer_config.OptimizerConfig] for details. - """ - data_loader: TextualInversionSDDataLoaderConfig """The data configuration. diff --git a/src/invoke_training/config/shared/optimizer/optimizer_config.py b/src/invoke_training/config/shared/optimizer/optimizer_config.py index ae2b8d1b..bcea23d5 100644 --- a/src/invoke_training/config/shared/optimizer/optimizer_config.py +++ b/src/invoke_training/config/shared/optimizer/optimizer_config.py @@ -6,6 +6,12 @@ class AdamOptimizer(ConfigBaseModel): optimizer_type: typing.Literal["AdamW"] = "AdamW" + learning_rate: float = 1e-4 + """Initial learning rate to use (after the potential warmup period). Note that in some training pipelines this can + be overriden for a specific group of params: https://pytorch.org/docs/stable/optim.html#per-parameter-options + (E.g. see `text_encoder_learning_rate` and `unet_learning_rate`) + """ + beta1: float = 0.9 beta2: float = 0.999 weight_decay: float = 1e-2 @@ -15,32 +21,11 @@ class AdamOptimizer(ConfigBaseModel): class ProdigyOptimizer(ConfigBaseModel): optimizer_type: typing.Literal["Prodigy"] = "Prodigy" + learning_rate: float = 1.0 + """The learning rate. For the Prodigy optimizer, the learning rate is adjusted dynamically. A value of 1.0 is + recommended. + """ + weight_decay: float = 0.0 use_bias_correction: bool = False safeguard_warmup: bool = False - - -class OptimizerConfig(ConfigBaseModel): - """Configuration for a training optimizer.""" - - optimizer: typing.Union[AdamOptimizer, ProdigyOptimizer] = AdamOptimizer() - - learning_rate: float = 1e-4 - """Initial learning rate to use (after the potential warmup period). Note that in some training pipelines this can - be overriden for a specific group of params: https://pytorch.org/docs/stable/optim.html#per-parameter-options - (E.g. see `text_encoder_learning_rate` and `unet_learning_rate`) - """ - - lr_scheduler: typing.Literal[ - "linear", - "cosine", - "cosine_with_restarts", - "polynomial", - "constant", - "constant_with_warmup", - ] = "constant" - - lr_warmup_steps: int = 0 - """The number of warmup steps in the learning rate scheduler. Only applied to schedulers that support warmup. - See lr_scheduler. - """ diff --git a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py index 31465b1b..7018959c 100644 --- a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py +++ b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py @@ -388,9 +388,9 @@ def prep_peft_model(model, lr: float | None = None): # (https://github.com/huggingface/accelerate/blame/49cb83a423f2946059117d8bb39b7c8747d29d80/src/accelerate/scheduler.py#L72-L82), # so the scaling here simply reverses that behaviour. lr_scheduler: torch.optim.lr_scheduler.LRScheduler = get_scheduler( - config.optimizer.lr_scheduler, + config.lr_scheduler, optimizer=optimizer, - num_warmup_steps=config.optimizer.lr_warmup_steps * accelerator.num_processes, + num_warmup_steps=config.lr_warmup_steps * accelerator.num_processes, num_training_steps=config.max_train_steps * accelerator.num_processes, ) @@ -498,12 +498,12 @@ def prep_peft_model(model, lr: float | None = None): if training_unet: # When training the UNet, it will always be the first parameter group. log["lr/unet"] = float(lrs[0]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/unet"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] if training_text_encoder: # When training the text encoder, it will always be the last parameter group. log["lr/text_encoder"] = float(lrs[-1]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/text_encoder"] = optimizer.param_groups[-1]["d"] * optimizer.param_groups[-1]["lr"] accelerator.log(log, step=global_step) diff --git a/src/invoke_training/training/_shared/optimizer/optimizer_utils.py b/src/invoke_training/training/_shared/optimizer/optimizer_utils.py index 107543e1..d121557f 100644 --- a/src/invoke_training/training/_shared/optimizer/optimizer_utils.py +++ b/src/invoke_training/training/_shared/optimizer/optimizer_utils.py @@ -1,29 +1,29 @@ import torch from prodigyopt import Prodigy -from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer -def initialize_optimizer(config: OptimizerConfig, trainable_params: list) -> torch.optim.Optimizer: +def initialize_optimizer(config: AdamOptimizer | ProdigyOptimizer, trainable_params: list) -> torch.optim.Optimizer: """Initialize an optimizer based on the provided config.""" - if config.optimizer.optimizer_type == "AdamW": + if config.optimizer_type == "AdamW": optimizer = torch.optim.AdamW( trainable_params, lr=config.learning_rate, - betas=(config.optimizer.beta1, config.optimizer.beta2), - weight_decay=config.optimizer.weight_decay, - eps=config.optimizer.epsilon, + betas=(config.beta1, config.beta2), + weight_decay=config.weight_decay, + eps=config.epsilon, ) - elif config.optimizer.optimizer_type == "Prodigy": + elif config.optimizer_type == "Prodigy": optimizer = Prodigy( trainable_params, lr=config.learning_rate, - weight_decay=config.optimizer.weight_decay, - use_bias_correction=config.optimizer.use_bias_correction, - safeguard_warmup=config.optimizer.safeguard_warmup, + weight_decay=config.weight_decay, + use_bias_correction=config.use_bias_correction, + safeguard_warmup=config.safeguard_warmup, ) else: - raise ValueError(f"'{config.optimizer}' is not a supported optimizer.") + raise ValueError(f"'{config.optimizer_type}' is not a supported optimizer.") return optimizer diff --git a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py index af36b0fd..ea8364cb 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py @@ -398,9 +398,9 @@ def inject_lora_layers(model, lora_config: peft.LoraConfig, lr: float | None = N # (https://github.com/huggingface/accelerate/blame/49cb83a423f2946059117d8bb39b7c8747d29d80/src/accelerate/scheduler.py#L72-L82), # so the scaling here simply reverses that behaviour. lr_scheduler: torch.optim.lr_scheduler.LRScheduler = get_scheduler( - config.optimizer.lr_scheduler, + config.lr_scheduler, optimizer=optimizer, - num_warmup_steps=config.optimizer.lr_warmup_steps * accelerator.num_processes, + num_warmup_steps=config.lr_warmup_steps * accelerator.num_processes, num_training_steps=config.max_train_steps * accelerator.num_processes, ) @@ -504,12 +504,12 @@ def inject_lora_layers(model, lora_config: peft.LoraConfig, lr: float | None = N if config.train_unet: # When training the UNet, it will always be the first parameter group. log["lr/unet"] = float(lrs[0]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/unet"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] if config.train_text_encoder: # When training the text encoder, it will always be the last parameter group. log["lr/text_encoder"] = float(lrs[-1]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/text_encoder"] = optimizer.param_groups[-1]["d"] * optimizer.param_groups[-1]["lr"] accelerator.log(log, step=global_step) diff --git a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py index 725f5aa7..1eafd0ec 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py @@ -233,9 +233,9 @@ def run_training(config: TextualInversionSDConfig): # noqa: C901 # (https://github.com/huggingface/accelerate/blame/49cb83a423f2946059117d8bb39b7c8747d29d80/src/accelerate/scheduler.py#L72-L82), # so the scaling here simply reverses that behaviour. lr_scheduler: torch.optim.lr_scheduler.LRScheduler = get_scheduler( - config.optimizer.lr_scheduler, + config.lr_scheduler, optimizer=optimizer, - num_warmup_steps=config.optimizer.lr_warmup_steps * accelerator.num_processes, + num_warmup_steps=config.lr_warmup_steps * accelerator.num_processes, num_training_steps=config.max_train_steps * accelerator.num_processes, ) @@ -345,7 +345,7 @@ def run_training(config: TextualInversionSDConfig): # noqa: C901 global_step += 1 log = {"train_loss": train_loss, "lr": lr_scheduler.get_last_lr()[0]} - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": # TODO(ryand): Test Prodigy logging. log["lr/d*lr"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py index d8daae88..90175551 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py @@ -480,9 +480,9 @@ def inject_lora_layers(model, lora_config: peft.LoraConfig, lr: float | None = N # (https://github.com/huggingface/accelerate/blame/49cb83a423f2946059117d8bb39b7c8747d29d80/src/accelerate/scheduler.py#L72-L82), # so the scaling here simply reverses that behaviour. lr_scheduler: torch.optim.lr_scheduler.LRScheduler = get_scheduler( - config.optimizer.lr_scheduler, + config.lr_scheduler, optimizer=optimizer, - num_warmup_steps=config.optimizer.lr_warmup_steps * accelerator.num_processes, + num_warmup_steps=config.lr_warmup_steps * accelerator.num_processes, num_training_steps=config.max_train_steps * accelerator.num_processes, ) @@ -599,12 +599,12 @@ def inject_lora_layers(model, lora_config: peft.LoraConfig, lr: float | None = N if config.train_unet: # When training the UNet, it will always be the first parameter group. log["lr/unet"] = float(lrs[0]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/unet"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] if config.train_text_encoder: # When training the text encoder, it will always be the last parameter group. log["lr/text_encoder"] = float(lrs[-1]) - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": log["lr/d*lr/text_encoder"] = optimizer.param_groups[-1]["d"] * optimizer.param_groups[-1]["lr"] accelerator.log(log, step=global_step) diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py index 30764ef5..29dd10a3 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py @@ -264,9 +264,9 @@ def run_training(config: TextualInversionSDXLConfig): # noqa: C901 # (https://github.com/huggingface/accelerate/blame/49cb83a423f2946059117d8bb39b7c8747d29d80/src/accelerate/scheduler.py#L72-L82), # so the scaling here simply reverses that behaviour. lr_scheduler: torch.optim.lr_scheduler.LRScheduler = get_scheduler( - config.optimizer.lr_scheduler, + config.lr_scheduler, optimizer=optimizer, - num_warmup_steps=config.optimizer.lr_warmup_steps * accelerator.num_processes, + num_warmup_steps=config.lr_warmup_steps * accelerator.num_processes, num_training_steps=config.max_train_steps * accelerator.num_processes, ) @@ -390,7 +390,7 @@ def run_training(config: TextualInversionSDXLConfig): # noqa: C901 global_step += 1 log = {"train_loss": train_loss, "lr": lr_scheduler.get_last_lr()[0]} - if config.optimizer.optimizer.optimizer_type == "Prodigy": + if config.optimizer.optimizer_type == "Prodigy": # TODO(ryand): Test Prodigy logging. log["lr/d*lr"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] From bc8927f99d53f9ed43dbfea393eacca428d468e9 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 22:03:45 -0500 Subject: [PATCH 4/8] Rename AdamOptimizer -> AdamOptimizerConfig, ProdigyOptimizer -> ProdigyOptimizerConfig. --- .../config/pipelines/finetune_lora_config.py | 4 ++-- .../config/pipelines/textual_inversion_config.py | 4 ++-- .../config/shared/optimizer/optimizer_config.py | 4 ++-- .../training/_shared/optimizer/optimizer_utils.py | 6 ++++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/invoke_training/config/pipelines/finetune_lora_config.py b/src/invoke_training/config/pipelines/finetune_lora_config.py index 2804d4fa..c9678f41 100644 --- a/src/invoke_training/config/pipelines/finetune_lora_config.py +++ b/src/invoke_training/config/pipelines/finetune_lora_config.py @@ -8,7 +8,7 @@ DreamboothSDDataLoaderConfig, ImageCaptionSDDataLoaderConfig, ) -from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizerConfig, ProdigyOptimizerConfig class LoRATrainingConfig(BasePipelineConfig): @@ -52,7 +52,7 @@ class LoRATrainingConfig(BasePipelineConfig): """Whether to add LoRA layers to the text encoder and train it. """ - optimizer: AdamOptimizer | ProdigyOptimizer = AdamOptimizer() + optimizer: AdamOptimizerConfig | ProdigyOptimizerConfig = AdamOptimizerConfig() text_encoder_learning_rate: Optional[float] = None """The learning rate to use for the text encoder model. If set, this overrides the optimizer's default learning diff --git a/src/invoke_training/config/pipelines/textual_inversion_config.py b/src/invoke_training/config/pipelines/textual_inversion_config.py index a2d7d48f..d752dc77 100644 --- a/src/invoke_training/config/pipelines/textual_inversion_config.py +++ b/src/invoke_training/config/pipelines/textual_inversion_config.py @@ -3,7 +3,7 @@ from invoke_training.config.pipelines.base_pipeline_config import BasePipelineConfig from invoke_training.config.shared.data.data_loader_config import TextualInversionSDDataLoaderConfig -from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizerConfig, ProdigyOptimizerConfig class TextualInversionTrainingConfig(BasePipelineConfig): @@ -74,7 +74,7 @@ class TextualInversionTrainingConfig(BasePipelineConfig): For example, if you are training on a dataset of images of pokemon, you might use `pokemon sketch white background`. """ - optimizer: AdamOptimizer | ProdigyOptimizer = AdamOptimizer() + optimizer: AdamOptimizerConfig | ProdigyOptimizerConfig = AdamOptimizerConfig() lr_scheduler: typing.Literal[ "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup" diff --git a/src/invoke_training/config/shared/optimizer/optimizer_config.py b/src/invoke_training/config/shared/optimizer/optimizer_config.py index bcea23d5..a3ba33d6 100644 --- a/src/invoke_training/config/shared/optimizer/optimizer_config.py +++ b/src/invoke_training/config/shared/optimizer/optimizer_config.py @@ -3,7 +3,7 @@ from invoke_training.config.shared.config_base_model import ConfigBaseModel -class AdamOptimizer(ConfigBaseModel): +class AdamOptimizerConfig(ConfigBaseModel): optimizer_type: typing.Literal["AdamW"] = "AdamW" learning_rate: float = 1e-4 @@ -18,7 +18,7 @@ class AdamOptimizer(ConfigBaseModel): epsilon: float = 1e-8 -class ProdigyOptimizer(ConfigBaseModel): +class ProdigyOptimizerConfig(ConfigBaseModel): optimizer_type: typing.Literal["Prodigy"] = "Prodigy" learning_rate: float = 1.0 diff --git a/src/invoke_training/training/_shared/optimizer/optimizer_utils.py b/src/invoke_training/training/_shared/optimizer/optimizer_utils.py index d121557f..fcb52307 100644 --- a/src/invoke_training/training/_shared/optimizer/optimizer_utils.py +++ b/src/invoke_training/training/_shared/optimizer/optimizer_utils.py @@ -1,10 +1,12 @@ import torch from prodigyopt import Prodigy -from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizer, ProdigyOptimizer +from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizerConfig, ProdigyOptimizerConfig -def initialize_optimizer(config: AdamOptimizer | ProdigyOptimizer, trainable_params: list) -> torch.optim.Optimizer: +def initialize_optimizer( + config: AdamOptimizerConfig | ProdigyOptimizerConfig, trainable_params: list +) -> torch.optim.Optimizer: """Initialize an optimizer based on the provided config.""" if config.optimizer_type == "AdamW": From bd14935729a3d2ddcb4b7234a3006ea94a66367b Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 22:40:42 -0500 Subject: [PATCH 5/8] Flatten SDImageTransformConfig. --- ..._refinement_sd_pokemon_1x24gb_example.yaml | 3 +- .../dpo_lora_sd_pickapic_1x24gb_example.yaml | 3 +- ...inetune_lora_sd_pokemon_1x8gb_example.yaml | 3 +- ...tune_lora_sdxl_pokemon_1x24gb_example.yaml | 3 +- ...etune_lora_sdxl_pokemon_1x8gb_example.yaml | 3 +- ...tual_inversion_sd_gnome_1x8gb_example.yaml | 7 ++- ...l_inversion_sdxl_gnome_1x24gb_example.yaml | 7 ++- .../config/_experimental/dpo/config.py | 16 +++++- .../config/shared/data/data_loader_config.py | 54 +++++++++++++++---- .../config/shared/data/transform_config.py | 16 ------ .../dpo/diffusion_dpo_lora_sd.py | 5 -- .../data_loaders/dreambooth_sd_dataloader.py | 6 +-- .../image_caption_sd_dataloader.py | 6 +-- .../image_pair_preference_sd_dataloader.py | 10 ++-- .../textual_inversion_sd_dataloader.py | 6 +-- .../_shared/stable_diffusion/validation.py | 4 +- .../stable_diffusion/finetune_lora_sd.py | 4 +- .../stable_diffusion/textual_inversion_sd.py | 4 +- .../stable_diffusion_xl/finetune_lora_sdxl.py | 6 +-- .../textual_inversion_sdxl.py | 6 +-- .../test_dreambooth_sd_dataloader.py | 4 -- .../test_image_caption_sd_dataloader.py | 2 - ...est_image_pair_preference_sd_dataloader.py | 6 +-- .../test_textual_inversion_sd_dataloader.py | 2 - 24 files changed, 97 insertions(+), 89 deletions(-) diff --git a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml index e39890e6..4836b389 100644 --- a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml @@ -20,8 +20,7 @@ data_loader: dataset: type: IMAGE_PAIR_PREFERENCE_DATASET dataset_dir: output/pokemon_pairs - image_transforms: - resolution: 512 + resolution: 512 dataloader_num_workers: 4 # General diff --git a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml index 70264d8e..3d7b52e8 100644 --- a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml @@ -22,8 +22,7 @@ data_loader: type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER dataset: type: HF_HUB_IMAGE_PAIR_PREFERENCE_DATASET - image_transforms: - resolution: 512 + resolution: 512 # General model: runwayml/stable-diffusion-v1-5 diff --git a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml index 5deb8aff..ae42964d 100644 --- a/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sd_pokemon_1x8gb_example.yaml @@ -23,8 +23,7 @@ data_loader: dataset: type: HF_HUB_IMAGE_CAPTION_DATASET dataset_name: lambdalabs/pokemon-blip-captions - image_transforms: - resolution: 512 + resolution: 512 dataloader_num_workers: 4 # General diff --git a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml index cded2b83..15ffaa71 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml @@ -22,8 +22,7 @@ data_loader: dataset: type: HF_HUB_IMAGE_CAPTION_DATASET dataset_name: lambdalabs/pokemon-blip-captions - image_transforms: - resolution: 512 + resolution: 512 # General model: stabilityai/stable-diffusion-xl-base-1.0 diff --git a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml index 558ac985..84c8ae6f 100644 --- a/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml +++ b/configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml @@ -23,8 +23,7 @@ data_loader: dataset: type: HF_HUB_IMAGE_CAPTION_DATASET dataset_name: lambdalabs/pokemon-blip-captions - image_transforms: - resolution: 512 + resolution: 512 # General model: stabilityai/stable-diffusion-xl-base-1.0 diff --git a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml index 187d9dc2..a63d8e28 100644 --- a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml +++ b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml @@ -22,10 +22,9 @@ data_loader: captions: type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM preset: object - image_transforms: - resolution: 512 - center_crop: True - random_flip: False + resolution: 512 + center_crop: True + random_flip: False shuffle_caption_transform: null aspect_ratio_buckets: target_resolution: 512 diff --git a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml index 93e98087..2079b5ad 100644 --- a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml +++ b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml @@ -22,10 +22,9 @@ data_loader: captions: type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM preset: object - image_transforms: - resolution: 1024 - center_crop: True - random_flip: False + resolution: 1024 + center_crop: True + random_flip: False shuffle_caption_transform: null dataloader_num_workers: 4 diff --git a/src/invoke_training/config/_experimental/dpo/config.py b/src/invoke_training/config/_experimental/dpo/config.py index e34c516d..d3d39093 100644 --- a/src/invoke_training/config/_experimental/dpo/config.py +++ b/src/invoke_training/config/_experimental/dpo/config.py @@ -4,7 +4,6 @@ from invoke_training.config.pipelines.finetune_lora_config import LoRATrainingConfig from invoke_training.config.shared.config_base_model import ConfigBaseModel -from invoke_training.config.shared.data.transform_config import SDImageTransformConfig class HFHubImagePairPreferenceDatasetConfig(ConfigBaseModel): @@ -27,7 +26,20 @@ class ImagePairPreferenceSDDataLoaderConfig(ConfigBaseModel): Union[HFHubImagePairPreferenceDatasetConfig, ImagePairPreferenceDatasetConfig], Field(discriminator="type") ] - image_transforms: SDImageTransformConfig + resolution: int | tuple[int, int] = 512 + """The resolution for input images. Either a scalar integer representing the square resolution height and width, or + a (height, width) tuple. All of the images in the dataset will be resized to this resolution unless the + `aspect_ratio_buckets` config is set. + """ + + center_crop: bool = True + """If True, input images will be center-cropped to the target resolution. + If False, input images will be randomly cropped to the target resolution. + """ + + random_flip: bool = False + """Whether random flip augmentations should be applied to input images. + """ dataloader_num_workers: int = 0 """Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. diff --git a/src/invoke_training/config/shared/data/data_loader_config.py b/src/invoke_training/config/shared/data/data_loader_config.py index d788e083..67a6f676 100644 --- a/src/invoke_training/config/shared/data/data_loader_config.py +++ b/src/invoke_training/config/shared/data/data_loader_config.py @@ -6,7 +6,6 @@ ImageDirDatasetConfig, ) from invoke_training.config.shared.data.transform_config import ( - SDImageTransformConfig, ShuffleCaptionTransformConfig, TextualInversionCaptionConfig, ) @@ -49,10 +48,23 @@ class ImageCaptionSDDataLoaderConfig(ConfigBaseModel): dataset: ImageCaptionDatasetConfig - image_transforms: SDImageTransformConfig - aspect_ratio_buckets: AspectRatioBucketConfig | None = None + resolution: int | tuple[int, int] = 512 + """The resolution for input images. Either a scalar integer representing the square resolution height and width, or + a (height, width) tuple. All of the images in the dataset will be resized to this resolution unless the + `aspect_ratio_buckets` config is set. + """ + + center_crop: bool = True + """If True, input images will be center-cropped to the target resolution. + If False, input images will be randomly cropped to the target resolution. + """ + + random_flip: bool = False + """Whether random flip augmentations should be applied to input images. + """ + caption_prefix: str | None = None """A prefix that will be prepended to all captions. If None, no prefix will be added. """ @@ -75,13 +87,26 @@ class DreamboothSDDataLoaderConfig(ConfigBaseModel): """The loss weight applied to class dataset examples. Instance dataset examples have an implicit loss weight of 1.0. """ - image_transforms: SDImageTransformConfig - aspect_ratio_buckets: AspectRatioBucketConfig | None = None """The aspect ratio bucketing configuration. If None, aspect ratio bucketing is disabled, and all images will be resized to the same resolution. """ + resolution: int | tuple[int, int] = 512 + """The resolution for input images. Either a scalar integer representing the square resolution height and width, or + a (height, width) tuple. All of the images in the dataset will be resized to this resolution unless the + `aspect_ratio_buckets` config is set. + """ + + center_crop: bool = True + """If True, input images will be center-cropped to the target resolution. + If False, input images will be randomly cropped to the target resolution. + """ + + random_flip: bool = False + """Whether random flip augmentations should be applied to input images. + """ + dataloader_num_workers: int = 0 """Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. """ @@ -100,15 +125,26 @@ class TextualInversionSDDataLoaderConfig(ConfigBaseModel): - [`TextualInversionCaptionPrefixTransformConfig`][invoke_training.config.shared.data.transform_config.TextualInversionCaptionPrefixTransformConfig]: Prepend the textual inversion token(s) to all existing dataset captions. """ # noqa: E501 - image_transforms: SDImageTransformConfig - """The image transforms to apply to all images. - """ - aspect_ratio_buckets: AspectRatioBucketConfig | None = None """The aspect ratio bucketing configuration. If None, aspect ratio bucketing is disabled, and all images will be resized to the same resolution. """ + resolution: int | tuple[int, int] = 512 + """The resolution for input images. Either a scalar integer representing the square resolution height and width, or + a (height, width) tuple. All of the images in the dataset will be resized to this resolution unless the + `aspect_ratio_buckets` config is set. + """ + + center_crop: bool = True + """If True, input images will be center-cropped to the target resolution. + If False, input images will be randomly cropped to the target resolution. + """ + + random_flip: bool = False + """Whether random flip augmentations should be applied to input images. + """ + shuffle_caption_transform: Optional[ShuffleCaptionTransformConfig] = None """The caption shuffling configuration. If None, caption shuffling is disabled. """ diff --git a/src/invoke_training/config/shared/data/transform_config.py b/src/invoke_training/config/shared/data/transform_config.py index ebb206d1..83318826 100644 --- a/src/invoke_training/config/shared/data/transform_config.py +++ b/src/invoke_training/config/shared/data/transform_config.py @@ -5,22 +5,6 @@ from invoke_training.config.shared.config_base_model import ConfigBaseModel -class SDImageTransformConfig(ConfigBaseModel): - resolution: int | tuple[int, int] = 512 - """The resolution for input images. Either a scalar integer representing the square resolution height and width, or - a (height, width) tuple. All of the images in the dataset will be resized to this resolution. - """ - - center_crop: bool = True - """If True, input images will be center-cropped to resolution. - If False, input images will be randomly cropped to resolution. - """ - - random_flip: bool = False - """Whether random flip augmentations should be applied to input images. - """ - - class TextualInversionCaptionTransformConfig(ConfigBaseModel): type: Literal["TEXTUAL_INVERSION_CAPTION_TRANSFORM"] = "TEXTUAL_INVERSION_CAPTION_TRANSFORM" diff --git a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py index 7018959c..54dd9969 100644 --- a/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py +++ b/src/invoke_training/training/_experimental/dpo/diffusion_dpo_lora_sd.py @@ -259,11 +259,6 @@ def run_training(config: DirectPreferenceOptimizationLoRASDConfig): # noqa: C90 vae_output_cache_dir_name = None if config.cache_vae_outputs: raise NotImplementedError("VAE caching is not implemented for Diffusion-DPO training yet.") - # if config.data_loader.image_transforms.random_flip: - # raise ValueError("'cache_vae_outputs' cannot be True if 'random_flip' is True.") - # if not config.data_loader.image_transforms.center_crop: - # raise ValueError("'cache_vae_outputs' cannot be True if 'center_crop' is False.") - # # We use a temporary directory for the cache. The directory will automatically be cleaned up when # # tmp_vae_output_cache_dir is destroyed. # tmp_vae_output_cache_dir = tempfile.TemporaryDirectory() diff --git a/src/invoke_training/training/_shared/data/data_loaders/dreambooth_sd_dataloader.py b/src/invoke_training/training/_shared/data/data_loaders/dreambooth_sd_dataloader.py index a9f8fda9..23a0592a 100644 --- a/src/invoke_training/training/_shared/data/data_loaders/dreambooth_sd_dataloader.py +++ b/src/invoke_training/training/_shared/data/data_loaders/dreambooth_sd_dataloader.py @@ -92,7 +92,7 @@ def build_dreambooth_sd_dataloader( instance_sampler = None class_sampler = None if config.aspect_ratio_buckets is None: - target_resolution = config.image_transforms.resolution + target_resolution = config.resolution # TODO(ryand): Provide a seeded generator. instance_sampler = RandomSampler(instance_dataset) if shuffle else SequentialSampler(instance_dataset) if base_class_dataset is not None: @@ -125,8 +125,8 @@ def build_dreambooth_sd_dataloader( SDImageTransform( resolution=target_resolution, aspect_ratio_bucket_manager=aspect_ratio_bucket_manager, - center_crop=config.image_transforms.center_crop, - random_flip=config.image_transforms.random_flip, + center_crop=config.center_crop, + random_flip=config.random_flip, ) ) else: diff --git a/src/invoke_training/training/_shared/data/data_loaders/image_caption_sd_dataloader.py b/src/invoke_training/training/_shared/data/data_loaders/image_caption_sd_dataloader.py index edc7f320..a713780d 100644 --- a/src/invoke_training/training/_shared/data/data_loaders/image_caption_sd_dataloader.py +++ b/src/invoke_training/training/_shared/data/data_loaders/image_caption_sd_dataloader.py @@ -100,7 +100,7 @@ def build_image_caption_sd_dataloader( # Initialize either the fixed target resolution or aspect ratio buckets. if config.aspect_ratio_buckets is None: - target_resolution = config.image_transforms.resolution + target_resolution = config.resolution aspect_ratio_bucket_manager = None batch_sampler = None else: @@ -125,8 +125,8 @@ def build_image_caption_sd_dataloader( SDImageTransform( resolution=target_resolution, aspect_ratio_bucket_manager=aspect_ratio_bucket_manager, - center_crop=config.image_transforms.center_crop, - random_flip=config.image_transforms.random_flip, + center_crop=config.center_crop, + random_flip=config.random_flip, ) ) else: diff --git a/src/invoke_training/training/_shared/data/data_loaders/image_pair_preference_sd_dataloader.py b/src/invoke_training/training/_shared/data/data_loaders/image_pair_preference_sd_dataloader.py index d353ff0d..5fdc412f 100644 --- a/src/invoke_training/training/_shared/data/data_loaders/image_pair_preference_sd_dataloader.py +++ b/src/invoke_training/training/_shared/data/data_loaders/image_pair_preference_sd_dataloader.py @@ -76,7 +76,7 @@ def build_image_pair_preference_sd_dataloader( else: raise ValueError(f"Unexpected dataset config type: '{type(config.dataset)}'.") - target_resolution = config.image_transforms.resolution + target_resolution = config.resolution all_transforms = [] if vae_output_cache_dir is None: @@ -84,8 +84,8 @@ def build_image_pair_preference_sd_dataloader( SDImageTransform( resolution=target_resolution, aspect_ratio_bucket_manager=None, - center_crop=config.image_transforms.center_crop, - random_flip=config.image_transforms.random_flip, + center_crop=config.center_crop, + random_flip=config.random_flip, image_field_name="image_0", orig_size_field_name="original_size_hw_0", crop_field_name="crop_top_left_yx_0", @@ -95,8 +95,8 @@ def build_image_pair_preference_sd_dataloader( SDImageTransform( resolution=target_resolution, aspect_ratio_bucket_manager=None, - center_crop=config.image_transforms.center_crop, - random_flip=config.image_transforms.random_flip, + center_crop=config.center_crop, + random_flip=config.random_flip, image_field_name="image_1", orig_size_field_name="original_size_hw_1", crop_field_name="crop_top_left_yx_1", diff --git a/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py b/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py index 9869afdc..541286f5 100644 --- a/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py +++ b/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py @@ -147,7 +147,7 @@ def build_textual_inversion_sd_dataloader( # noqa: C901 # Initialize either the fixed target resolution or aspect ratio buckets. if config.aspect_ratio_buckets is None: - target_resolution = config.image_transforms.resolution + target_resolution = config.resolution aspect_ratio_bucket_manager = None batch_sampler = None else: @@ -174,8 +174,8 @@ def build_textual_inversion_sd_dataloader( # noqa: C901 SDImageTransform( resolution=target_resolution, aspect_ratio_bucket_manager=aspect_ratio_bucket_manager, - center_crop=config.image_transforms.center_crop, - random_flip=config.image_transforms.random_flip, + center_crop=config.center_crop, + random_flip=config.random_flip, ) ) else: diff --git a/src/invoke_training/training/_shared/stable_diffusion/validation.py b/src/invoke_training/training/_shared/stable_diffusion/validation.py index b2a4c77f..e0b62f1f 100644 --- a/src/invoke_training/training/_shared/stable_diffusion/validation.py +++ b/src/invoke_training/training/_shared/stable_diffusion/validation.py @@ -72,7 +72,7 @@ def generate_validation_images_sd( pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) - validation_resolution = Resolution.parse(config.data_loader.image_transforms.resolution) + validation_resolution = Resolution.parse(config.data_loader.resolution) # Run inference. with torch.no_grad(): @@ -173,7 +173,7 @@ def generate_validation_images_sdxl( pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) - validation_resolution = Resolution.parse(config.data_loader.image_transforms.resolution) + validation_resolution = Resolution.parse(config.data_loader.resolution) # Run inference. with torch.no_grad(): diff --git a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py index ea8364cb..3f86f8a1 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/finetune_lora_sd.py @@ -285,9 +285,9 @@ def run_training(config: FinetuneLoRASDConfig): # noqa: C901 # Prepare VAE output cache. vae_output_cache_dir_name = None if config.cache_vae_outputs: - if config.data_loader.image_transforms.random_flip: + if config.data_loader.random_flip: raise ValueError("'cache_vae_outputs' cannot be True if 'random_flip' is True.") - if not config.data_loader.image_transforms.center_crop: + if not config.data_loader.center_crop: raise ValueError("'cache_vae_outputs' cannot be True if 'center_crop' is False.") # We use a temporary directory for the cache. The directory will automatically be cleaned up when diff --git a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py index 1eafd0ec..1ee3c9df 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py +++ b/src/invoke_training/training/pipelines/stable_diffusion/textual_inversion_sd.py @@ -185,9 +185,9 @@ def run_training(config: TextualInversionSDConfig): # noqa: C901 # Prepare VAE output cache. vae_output_cache_dir_name = None if config.cache_vae_outputs: - if config.data_loader.image_transforms.random_flip: + if config.data_loader.random_flip: raise ValueError("'cache_vae_outputs' cannot be True if 'random_flip' is True.") - if not config.data_loader.image_transforms.center_crop: + if not config.data_loader.center_crop: raise ValueError("'cache_vae_outputs' cannot be True if 'center_crop' is False.") # We use a temporary directory for the cache. The directory will automatically be cleaned up when diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py index 90175551..6cf9df1c 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/finetune_lora_sdxl.py @@ -362,9 +362,9 @@ def run_training(config: FinetuneLoRASDXLConfig): # noqa: C901 # Prepare VAE output cache. vae_output_cache_dir_name = None if config.cache_vae_outputs: - if config.data_loader.image_transforms.random_flip: + if config.data_loader.random_flip: raise ValueError("'cache_vae_outputs' cannot be True if 'random_flip' is True.") - if not config.data_loader.image_transforms.center_crop: + if not config.data_loader.center_crop: raise ValueError("'cache_vae_outputs' cannot be True if 'center_crop' is False.") # We use a temporary directory for the cache. The directory will automatically be cleaned up when @@ -571,7 +571,7 @@ def inject_lora_layers(model, lora_config: peft.LoraConfig, lr: float | None = N text_encoder_2, unet, weight_dtype, - config.data_loader.image_transforms.resolution, + config.data_loader.resolution, config.prediction_type, ) diff --git a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py index 29dd10a3..7a9535e7 100644 --- a/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py +++ b/src/invoke_training/training/pipelines/stable_diffusion_xl/textual_inversion_sdxl.py @@ -211,9 +211,9 @@ def run_training(config: TextualInversionSDXLConfig): # noqa: C901 # Prepare VAE output cache. vae_output_cache_dir_name = None if config.cache_vae_outputs: - if config.data_loader.image_transforms.random_flip: + if config.data_loader.random_flip: raise ValueError("'cache_vae_outputs' cannot be True if 'random_flip' is True.") - if not config.data_loader.image_transforms.center_crop: + if not config.data_loader.center_crop: raise ValueError("'cache_vae_outputs' cannot be True if 'center_crop' is False.") # We use a temporary directory for the cache. The directory will automatically be cleaned up when @@ -348,7 +348,7 @@ def run_training(config: TextualInversionSDXLConfig): # noqa: C901 text_encoder_2, unet, weight_dtype, - config.data_loader.image_transforms.resolution, + config.data_loader.resolution, config.prediction_type, ) diff --git a/tests/invoke_training/training/_shared/data/data_loaders/test_dreambooth_sd_dataloader.py b/tests/invoke_training/training/_shared/data/data_loaders/test_dreambooth_sd_dataloader.py index e0e3aa53..48b2aea9 100644 --- a/tests/invoke_training/training/_shared/data/data_loaders/test_dreambooth_sd_dataloader.py +++ b/tests/invoke_training/training/_shared/data/data_loaders/test_dreambooth_sd_dataloader.py @@ -2,7 +2,6 @@ from invoke_training.config.shared.data.data_loader_config import AspectRatioBucketConfig, DreamboothSDDataLoaderConfig from invoke_training.config.shared.data.dataset_config import ImageDirDatasetConfig -from invoke_training.config.shared.data.transform_config import SDImageTransformConfig from invoke_training.training._shared.data.data_loaders.dreambooth_sd_dataloader import ( build_dreambooth_sd_dataloader, ) @@ -18,7 +17,6 @@ def test_build_dreambooth_sd_dataloader(image_dir): # noqa: F811 class_caption="test class prompt", # For testing, we just use the same directory for the instance and class datasets. class_dataset=ImageDirDatasetConfig(dataset_dir=str(image_dir)), - image_transforms=SDImageTransformConfig(resolution=512), ) data_loader = build_dreambooth_sd_dataloader(config=config, batch_size=2) @@ -53,7 +51,6 @@ def test_build_dreambooth_sd_dataloader_no_class_dataset(image_dir): # noqa: F8 config = DreamboothSDDataLoaderConfig( instance_caption="test instance prompt", instance_dataset=ImageDirDatasetConfig(dataset_dir=str(image_dir)), - image_transforms=SDImageTransformConfig(resolution=512), ) data_loader = build_dreambooth_sd_dataloader(config=config, batch_size=2) @@ -93,7 +90,6 @@ def test_build_dreambooth_sd_dataloader_with_bucketing(image_dir): # noqa: F811 aspect_ratio_buckets=AspectRatioBucketConfig( target_resolution=256, start_dim=128, end_dim=512, divisible_by=64 ), - image_transforms=SDImageTransformConfig(), ) data_loader = build_dreambooth_sd_dataloader(config=config, batch_size=2, shuffle=False, sequential_batching=True) diff --git a/tests/invoke_training/training/_shared/data/data_loaders/test_image_caption_sd_dataloader.py b/tests/invoke_training/training/_shared/data/data_loaders/test_image_caption_sd_dataloader.py index 1fb8bc9c..5883bf89 100644 --- a/tests/invoke_training/training/_shared/data/data_loaders/test_image_caption_sd_dataloader.py +++ b/tests/invoke_training/training/_shared/data/data_loaders/test_image_caption_sd_dataloader.py @@ -6,7 +6,6 @@ ImageCaptionSDDataLoaderConfig, ) from invoke_training.config.shared.data.dataset_config import HFHubImageCaptionDatasetConfig -from invoke_training.config.shared.data.transform_config import SDImageTransformConfig from invoke_training.training._shared.data.data_loaders.image_caption_sd_dataloader import ( build_image_caption_sd_dataloader, ) @@ -17,7 +16,6 @@ def test_build_image_caption_sd_dataloader(): config = ImageCaptionSDDataLoaderConfig( dataset=HFHubImageCaptionDatasetConfig(dataset_name="lambdalabs/pokemon-blip-captions"), - image_transforms=SDImageTransformConfig(resolution=512), ) data_loader = build_image_caption_sd_dataloader(config, 4) diff --git a/tests/invoke_training/training/_shared/data/data_loaders/test_image_pair_preference_sd_dataloader.py b/tests/invoke_training/training/_shared/data/data_loaders/test_image_pair_preference_sd_dataloader.py index 875933e4..edb3e5e6 100644 --- a/tests/invoke_training/training/_shared/data/data_loaders/test_image_pair_preference_sd_dataloader.py +++ b/tests/invoke_training/training/_shared/data/data_loaders/test_image_pair_preference_sd_dataloader.py @@ -4,7 +4,6 @@ HFHubImagePairPreferenceDatasetConfig, ImagePairPreferenceSDDataLoaderConfig, ) -from invoke_training.config.shared.data.transform_config import SDImageTransformConfig from invoke_training.training._shared.data.data_loaders.image_pair_preference_sd_dataloader import ( build_image_pair_preference_sd_dataloader, ) @@ -13,10 +12,7 @@ def test_build_image_pair_preference_sd_dataloader(): """Smoke test of build_image_pair_preference_sd_dataloader(...).""" - config = ImagePairPreferenceSDDataLoaderConfig( - dataset=HFHubImagePairPreferenceDatasetConfig(), - image_transforms=SDImageTransformConfig(resolution=512), - ) + config = ImagePairPreferenceSDDataLoaderConfig(dataset=HFHubImagePairPreferenceDatasetConfig()) data_loader = build_image_pair_preference_sd_dataloader(config, 4) example = next(iter(data_loader)) diff --git a/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py b/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py index 846cef85..7e019307 100644 --- a/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py +++ b/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py @@ -3,7 +3,6 @@ from invoke_training.config.shared.data.data_loader_config import TextualInversionSDDataLoaderConfig from invoke_training.config.shared.data.dataset_config import ImageDirDatasetConfig from invoke_training.config.shared.data.transform_config import ( - SDImageTransformConfig, TextualInversionPresetCaptionTransformConfig, ) from invoke_training.training._shared.data.data_loaders.textual_inversion_sd_dataloader import ( @@ -19,7 +18,6 @@ def test_build_textual_inversion_sd_dataloader(image_dir): # noqa: F811 config = TextualInversionSDDataLoaderConfig( dataset=ImageDirDatasetConfig(dataset_dir=str(image_dir)), captions=TextualInversionPresetCaptionTransformConfig(preset="object"), - image_transforms=SDImageTransformConfig(resolution=512), ) data_loader = build_textual_inversion_sd_dataloader( From 04d39a42bfa9bafbe80cd06ea72a31b4283b95bd Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 23:00:36 -0500 Subject: [PATCH 6/8] Fix the DPO sample configs. --- .../dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml | 9 ++++----- .../dpo_lora_sd_pickapic_1x24gb_example.yaml | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml index 4836b389..953365ca 100644 --- a/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_refinement_sd_pokemon_1x24gb_example.yaml @@ -7,13 +7,12 @@ seed: 1 base_output_dir: output/dpo optimizer: + optimizer_type: AdamW learning_rate: 1e-4 - lr_warmup_steps: 500 - lr_scheduler: cosine + weight_decay: 1e-2 - optimizer: - optimizer_type: AdamW - weight_decay: 1e-2 +lr_warmup_steps: 500 +lr_scheduler: cosine data_loader: type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER diff --git a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml index 3d7b52e8..5b9ac065 100644 --- a/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml +++ b/configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml @@ -10,13 +10,12 @@ seed: 1 base_output_dir: output/dpo optimizer: + optimizer_type: AdamW learning_rate: 1e-4 - lr_warmup_steps: 200 - lr_scheduler: cosine + weight_decay: 1e-2 - optimizer: - optimizer_type: AdamW - weight_decay: 1e-2 +lr_warmup_steps: 200 +lr_scheduler: cosine data_loader: type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER From e8a536d8939dc73fe247d77c53219ed39e9bb35c Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 23:01:01 -0500 Subject: [PATCH 7/8] Flatten the caption transform configs. --- ...tual_inversion_sd_gnome_1x8gb_example.yaml | 6 +-- ...l_inversion_sdxl_gnome_1x24gb_example.yaml | 6 +-- .../config/shared/data/data_loader_config.py | 26 +++++------ .../config/shared/data/transform_config.py | 43 ------------------- .../textual_inversion_sd_dataloader.py | 26 +++++------ .../test_textual_inversion_sd_dataloader.py | 6 +-- 6 files changed, 30 insertions(+), 83 deletions(-) delete mode 100644 src/invoke_training/config/shared/data/transform_config.py diff --git a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml index a63d8e28..3739f822 100644 --- a/configs/textual_inversion_sd_gnome_1x8gb_example.yaml +++ b/configs/textual_inversion_sd_gnome_1x8gb_example.yaml @@ -19,13 +19,11 @@ data_loader: type: IMAGE_DIR_DATASET dataset_dir: "sample_data/bruce_the_gnome" keep_in_memory: True - captions: - type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM - preset: object + caption_preset: object resolution: 512 center_crop: True random_flip: False - shuffle_caption_transform: null + shuffle_caption_delimiter: null aspect_ratio_buckets: target_resolution: 512 start_dim: 256 diff --git a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml index 2079b5ad..b0ba130f 100644 --- a/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml +++ b/configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml @@ -19,13 +19,11 @@ data_loader: type: IMAGE_DIR_DATASET dataset_dir: "sample_data/bruce_the_gnome" keep_in_memory: True - captions: - type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM - preset: object + caption_preset: object resolution: 1024 center_crop: True random_flip: False - shuffle_caption_transform: null + shuffle_caption_delimiter: null dataloader_num_workers: 4 # General diff --git a/src/invoke_training/config/shared/data/data_loader_config.py b/src/invoke_training/config/shared/data/data_loader_config.py index 67a6f676..b3da6761 100644 --- a/src/invoke_training/config/shared/data/data_loader_config.py +++ b/src/invoke_training/config/shared/data/data_loader_config.py @@ -5,10 +5,6 @@ ImageCaptionDatasetConfig, ImageDirDatasetConfig, ) -from invoke_training.config.shared.data.transform_config import ( - ShuffleCaptionTransformConfig, - TextualInversionCaptionConfig, -) class AspectRatioBucketConfig(ConfigBaseModel): @@ -117,13 +113,19 @@ class TextualInversionSDDataLoaderConfig(ConfigBaseModel): dataset: ImageDirDatasetConfig | ImageCaptionDatasetConfig - captions: TextualInversionCaptionConfig - """The caption configuration. One of: + caption_preset: Literal["style", "object"] | None = None + + caption_templates: list[str] | None = None + """A list of caption templates with a single template argument 'slot' in each. + E.g.: + + - "a photo of a {}" + - "a rendering of a {}" + - "a cropped photo of the {}" + """ - - [`TextualInversionPresetCaptionTransformConfig`][invoke_training.config.shared.data.transform_config.TextualInversionPresetCaptionTransformConfig]: Use preset `object` or `style` caption templates. - - [`TextualInversionCaptionTransformConfig`][invoke_training.config.shared.data.transform_config.TextualInversionCaptionTransformConfig]: Use custom caption templates. - - [`TextualInversionCaptionPrefixTransformConfig`][invoke_training.config.shared.data.transform_config.TextualInversionCaptionPrefixTransformConfig]: Prepend the textual inversion token(s) to all existing dataset captions. - """ # noqa: E501 + # TODO(ryand): Replace this with keep_original_captions config. + apply_caption_prefix: bool = False aspect_ratio_buckets: AspectRatioBucketConfig | None = None """The aspect ratio bucketing configuration. If None, aspect ratio bucketing is disabled, and all images will be @@ -145,8 +147,8 @@ class TextualInversionSDDataLoaderConfig(ConfigBaseModel): """Whether random flip augmentations should be applied to input images. """ - shuffle_caption_transform: Optional[ShuffleCaptionTransformConfig] = None - """The caption shuffling configuration. If None, caption shuffling is disabled. + shuffle_caption_delimiter: str | None = None + """If `None`, then no caption shuffling is applied. If set, then captions are split on this delimiter and shuffled. """ dataloader_num_workers: int = 0 diff --git a/src/invoke_training/config/shared/data/transform_config.py b/src/invoke_training/config/shared/data/transform_config.py deleted file mode 100644 index 83318826..00000000 --- a/src/invoke_training/config/shared/data/transform_config.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Annotated, Literal, Union - -from pydantic import Field - -from invoke_training.config.shared.config_base_model import ConfigBaseModel - - -class TextualInversionCaptionTransformConfig(ConfigBaseModel): - type: Literal["TEXTUAL_INVERSION_CAPTION_TRANSFORM"] = "TEXTUAL_INVERSION_CAPTION_TRANSFORM" - - templates: list[str] - """A list of caption templates with a single template argument 'slot' in each. - E.g.: - - - "a photo of a {}" - - "a rendering of a {}" - - "a cropped photo of the {}" - """ - - -class TextualInversionPresetCaptionTransformConfig(ConfigBaseModel): - type: Literal["TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM"] = "TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM" - - preset: Literal["style", "object"] - - -class TextualInversionCaptionPrefixTransformConfig(ConfigBaseModel): - type: Literal["TEXTUAL_INVERSION_CAPTION_PREFIX_TRANSFORM"] = "TEXTUAL_INVERSION_CAPTION_PREFIX_TRANSFORM" - - -TextualInversionCaptionConfig = Annotated[ - Union[ - TextualInversionCaptionTransformConfig, - TextualInversionPresetCaptionTransformConfig, - TextualInversionCaptionPrefixTransformConfig, - ], - Field(discriminator="type"), -] - - -class ShuffleCaptionTransformConfig(ConfigBaseModel): - delimiter: str = "," - """The delimiter to use for caption splitting.""" diff --git a/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py b/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py index 541286f5..c5683d36 100644 --- a/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py +++ b/src/invoke_training/training/_shared/data/data_loaders/textual_inversion_sd_dataloader.py @@ -8,11 +8,6 @@ HFHubImageCaptionDatasetConfig, ImageDirDatasetConfig, ) -from invoke_training.config.shared.data.transform_config import ( - TextualInversionCaptionPrefixTransformConfig, - TextualInversionCaptionTransformConfig, - TextualInversionPresetCaptionTransformConfig, -) from invoke_training.training._shared.data.data_loaders.image_caption_sd_dataloader import ( build_aspect_ratio_bucket_manager, sd_image_caption_collate_fn, @@ -124,26 +119,29 @@ def build_textual_inversion_sd_dataloader( # noqa: C901 else: raise ValueError(f"Unexpected dataset config type: '{type(config.dataset)}'.") - if isinstance(config.captions, TextualInversionCaptionTransformConfig): + if sum([config.caption_templates is not None, config.caption_preset is not None, config.apply_caption_prefix]) != 1: + raise ValueError("Exactly one of caption_templates, caption_preset, or apply_caption_prefix must be set.") + + if config.caption_templates is not None: # Overwrites the caption field. Typically used with a ImageDirDataset that does not have captions. caption_tf = TemplateCaptionTransform( field_name="caption", placeholder_str=placeholder_token, - caption_templates=config.captions.templates, + caption_templates=config.caption_templates, ) - elif isinstance(config.captions, TextualInversionPresetCaptionTransformConfig): + elif config.caption_preset is not None: # Overwrites the caption field. Typically used with a ImageDirDataset that does not have captions. caption_tf = TemplateCaptionTransform( field_name="caption", placeholder_str=placeholder_token, - caption_templates=get_preset_ti_caption_templates(config.captions.preset), + caption_templates=get_preset_ti_caption_templates(config.caption_preset), ) - elif isinstance(config.captions, TextualInversionCaptionPrefixTransformConfig): + elif config.apply_caption_prefix: # Prefixes the caption field. Must be used with a HFHubImageCaptionDataset or HFDirImageCaptionDataset that # already has captions. caption_tf = CaptionPrefixTransform(caption_field_name="caption", prefix=placeholder_token + " ") else: - raise ValueError(f"Unexpected caption config type: '{type(config.captions)}'.") + raise ValueError("Exactly one of caption_templates, caption_preset, or apply_caption_prefix must be set.") # Initialize either the fixed target resolution or aspect ratio buckets. if config.aspect_ratio_buckets is None: @@ -164,10 +162,8 @@ def build_textual_inversion_sd_dataloader( # noqa: C901 all_transforms = [caption_tf] - if config.shuffle_caption_transform is not None: - all_transforms.append( - ShuffleCaptionTransform(field_name="caption", delimiter=config.shuffle_caption_transform.delimiter) - ) + if config.shuffle_caption_delimiter is not None: + all_transforms.append(ShuffleCaptionTransform(field_name="caption", delimiter=config.shuffle_caption_delimiter)) if vae_output_cache_dir is None: all_transforms.append( diff --git a/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py b/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py index 7e019307..87ad7581 100644 --- a/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py +++ b/tests/invoke_training/training/_shared/data/data_loaders/test_textual_inversion_sd_dataloader.py @@ -2,9 +2,6 @@ from invoke_training.config.shared.data.data_loader_config import TextualInversionSDDataLoaderConfig from invoke_training.config.shared.data.dataset_config import ImageDirDatasetConfig -from invoke_training.config.shared.data.transform_config import ( - TextualInversionPresetCaptionTransformConfig, -) from invoke_training.training._shared.data.data_loaders.textual_inversion_sd_dataloader import ( build_textual_inversion_sd_dataloader, ) @@ -16,8 +13,7 @@ def test_build_textual_inversion_sd_dataloader(image_dir): # noqa: F811 """Smoke test of build_textual_inversion_sd_dataloader(...).""" config = TextualInversionSDDataLoaderConfig( - dataset=ImageDirDatasetConfig(dataset_dir=str(image_dir)), - captions=TextualInversionPresetCaptionTransformConfig(preset="object"), + dataset=ImageDirDatasetConfig(dataset_dir=str(image_dir)), caption_preset="object" ) data_loader = build_textual_inversion_sd_dataloader( From e2801940fce57ac9e8d7429e4958654d1e339c06 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 23 Jan 2024 23:42:25 -0500 Subject: [PATCH 8/8] Delete outdated docs page. --- docs/reference/config/shared/data/transform_config.md | 1 - mkdocs.yml | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) delete mode 100644 docs/reference/config/shared/data/transform_config.md diff --git a/docs/reference/config/shared/data/transform_config.md b/docs/reference/config/shared/data/transform_config.md deleted file mode 100644 index a2dada3c..00000000 --- a/docs/reference/config/shared/data/transform_config.md +++ /dev/null @@ -1 +0,0 @@ -::: invoke_training.config.shared.data.transform_config \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 1e3ae20b..c192a278 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -48,10 +48,8 @@ nav: - Textual Inversion SD Config: reference/config/pipelines/textual_inversion_sd_config.md - Textual Inversion SDXL Config: reference/config/pipelines/textual_inversion_sdxl_config.md - shared: - - data: - - data_loader_config: reference/config/shared/data/data_loader_config.md - - dataset_config: reference/config/shared/data/dataset_config.md - - transform_config: reference/config/shared/data/transform_config.md + - data_loader_config: reference/config/shared/data/data_loader_config.md + - dataset_config: reference/config/shared/data/dataset_config.md - optimizer_config: reference/config/shared/optimizer_config.md - Contributing: - contributing/development_environment.md