Merge pull request #76 from invoke-ai/flat-configs
Flatten configs
RyanJDick authored Jan 24, 2024
2 parents a64403d + e280194 commit 2f74cf3
Show file tree
Hide file tree
Showing 33 changed files with 253 additions and 348 deletions.
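The net effect on an example config, as a rough before/after sketch reconstructed from the diffs below (values taken from the DPO example configs shown further down):

Before (nested):

    output:
      base_output_dir: output/dpo

    optimizer:
      learning_rate: 1e-4
      lr_warmup_steps: 500
      lr_scheduler: cosine

      optimizer:
        optimizer_type: AdamW
        weight_decay: 1e-2

    data_loader:
      image_transforms:
        resolution: 512

After (flattened):

    base_output_dir: output/dpo

    optimizer:
      optimizer_type: AdamW
      learning_rate: 1e-4
      weight_decay: 1e-2

    lr_warmup_steps: 500
    lr_scheduler: cosine

    data_loader:
      resolution: 512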
@@ -4,25 +4,22 @@

 type: DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD
 seed: 1
-output:
-  base_output_dir: output/dpo
+base_output_dir: output/dpo
 
 optimizer:
+  optimizer_type: AdamW
   learning_rate: 1e-4
-  lr_warmup_steps: 500
-  lr_scheduler: cosine
+  weight_decay: 1e-2
 
-  optimizer:
-    optimizer_type: AdamW
-    weight_decay: 1e-2
+lr_warmup_steps: 500
+lr_scheduler: cosine
 
 data_loader:
   type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER
   dataset:
     type: IMAGE_PAIR_PREFERENCE_DATASET
     dataset_dir: output/pokemon_pairs
-  image_transforms:
-    resolution: 512
+  resolution: 512
   dataloader_num_workers: 4
 
 # General
15 changes: 6 additions & 9 deletions configs/_experimental/dpo_lora_sd_pickapic_1x24gb_example.yaml
@@ -7,24 +7,21 @@

 type: DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD
 seed: 1
-output:
-  base_output_dir: output/dpo
+base_output_dir: output/dpo
 
 optimizer:
+  optimizer_type: AdamW
   learning_rate: 1e-4
-  lr_warmup_steps: 200
-  lr_scheduler: cosine
+  weight_decay: 1e-2
 
-  optimizer:
-    optimizer_type: AdamW
-    weight_decay: 1e-2
+lr_warmup_steps: 200
+lr_scheduler: cosine
 
 data_loader:
   type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER
   dataset:
     type: HF_HUB_IMAGE_PAIR_PREFERENCE_DATASET
-  image_transforms:
-    resolution: 512
+  resolution: 512
 
 # General
 model: runwayml/stable-diffusion-v1-5
16 changes: 6 additions & 10 deletions configs/finetune_lora_sd_pokemon_1x8gb_example.yaml
@@ -9,25 +9,21 @@

 type: FINETUNE_LORA_SD
 seed: 1
-output:
-  base_output_dir: output/finetune_lora_sd_pokemon/
+base_output_dir: output/finetune_lora_sd_pokemon/
 
 optimizer:
+  optimizer_type: Prodigy
   learning_rate: 1.0
-
-  optimizer:
-    optimizer_type: Prodigy
-    weight_decay: 0.01
-    use_bias_correction: True
-    safeguard_warmup: True
+  weight_decay: 0.01
+  use_bias_correction: True
+  safeguard_warmup: True
 
 data_loader:
   type: IMAGE_CAPTION_SD_DATA_LOADER
   dataset:
     type: HF_HUB_IMAGE_CAPTION_DATASET
     dataset_name: lambdalabs/pokemon-blip-captions
-  image_transforms:
-    resolution: 512
+  resolution: 512
   dataloader_num_workers: 4
 
 # General
16 changes: 6 additions & 10 deletions configs/finetune_lora_sdxl_pokemon_1x24gb_example.yaml
@@ -8,25 +8,21 @@
 # purposes.
 type: FINETUNE_LORA_SDXL
 seed: 1
-output:
-  base_output_dir: output/finetune_lora_sdxl_pokemon/
+base_output_dir: output/finetune_lora_sdxl_pokemon/
 
 optimizer:
+  optimizer_type: Prodigy
   learning_rate: 1.0
-
-  optimizer:
-    optimizer_type: Prodigy
-    weight_decay: 0.01
-    use_bias_correction: True
-    safeguard_warmup: True
+  weight_decay: 0.01
+  use_bias_correction: True
+  safeguard_warmup: True
 
 data_loader:
   type: IMAGE_CAPTION_SD_DATA_LOADER
   dataset:
     type: HF_HUB_IMAGE_CAPTION_DATASET
     dataset_name: lambdalabs/pokemon-blip-captions
-  image_transforms:
-    resolution: 512
+  resolution: 512
 
 # General
 model: stabilityai/stable-diffusion-xl-base-1.0
16 changes: 6 additions & 10 deletions configs/finetune_lora_sdxl_pokemon_1x8gb_example.yaml
@@ -9,25 +9,21 @@
 # - Achieve reasonable results *quickly* (<15mins) for demo purposes.
 type: FINETUNE_LORA_SDXL
 seed: 1
-output:
-  base_output_dir: output/finetune_lora_sdxl_pokemon/
+base_output_dir: output/finetune_lora_sdxl_pokemon/
 
 optimizer:
+  optimizer_type: Prodigy
   learning_rate: 1.0
-
-  optimizer:
-    optimizer_type: Prodigy
-    weight_decay: 0.01
-    use_bias_correction: True
-    safeguard_warmup: True
+  weight_decay: 0.01
+  use_bias_correction: True
+  safeguard_warmup: True
 
 data_loader:
   type: IMAGE_CAPTION_SD_DATA_LOADER
   dataset:
     type: HF_HUB_IMAGE_CAPTION_DATASET
     dataset_name: lambdalabs/pokemon-blip-captions
-  image_transforms:
-    resolution: 512
+  resolution: 512
 
 # General
 model: stabilityai/stable-diffusion-xl-base-1.0
23 changes: 9 additions & 14 deletions configs/textual_inversion_sd_gnome_1x8gb_example.yaml
@@ -4,31 +4,26 @@

 type: TEXTUAL_INVERSION_SD
 seed: 1
-output:
-  base_output_dir: output/ti_sd_bruce_the_gnome
+base_output_dir: output/ti_sd_bruce_the_gnome
 
 optimizer:
+  optimizer_type: AdamW
   learning_rate: 4e-3
-  lr_warmup_steps: 200
-  lr_scheduler: cosine
 
-  optimizer:
-    optimizer_type: AdamW
+lr_warmup_steps: 200
+lr_scheduler: cosine
 
 data_loader:
   type: TEXTUAL_INVERSION_SD_DATA_LOADER
   dataset:
     type: IMAGE_DIR_DATASET
     dataset_dir: "sample_data/bruce_the_gnome"
     keep_in_memory: True
-  captions:
-    type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM
-    preset: object
-  image_transforms:
-    resolution: 512
-    center_crop: True
-    random_flip: False
-  shuffle_caption_transform: null
+  caption_preset: object
+  resolution: 512
+  center_crop: True
+  random_flip: False
+  shuffle_caption_delimiter: null
   aspect_ratio_buckets:
     target_resolution: 512
     start_dim: 256
23 changes: 9 additions & 14 deletions configs/textual_inversion_sdxl_gnome_1x24gb_example.yaml
@@ -4,31 +4,26 @@

 type: TEXTUAL_INVERSION_SDXL
 seed: 1
-output:
-  base_output_dir: output/ti_sdxl_bruce_the_gnome
+base_output_dir: output/ti_sdxl_bruce_the_gnome
 
 optimizer:
+  optimizer_type: AdamW
   learning_rate: 2e-3
-  lr_warmup_steps: 200
-  lr_scheduler: cosine
 
-  optimizer:
-    optimizer_type: AdamW
+lr_warmup_steps: 200
+lr_scheduler: cosine
 
 data_loader:
   type: TEXTUAL_INVERSION_SD_DATA_LOADER
   dataset:
     type: IMAGE_DIR_DATASET
     dataset_dir: "sample_data/bruce_the_gnome"
     keep_in_memory: True
-  captions:
-    type: TEXTUAL_INVERSION_PRESET_CAPTION_TRANSFORM
-    preset: object
-  image_transforms:
-    resolution: 1024
-    center_crop: True
-    random_flip: False
-  shuffle_caption_transform: null
+  caption_preset: object
+  resolution: 1024
+  center_crop: True
+  random_flip: False
+  shuffle_caption_delimiter: null
   dataloader_num_workers: 4
 
 # General
1 change: 0 additions & 1 deletion docs/reference/config/shared/data/transform_config.md

This file was deleted.

1 change: 0 additions & 1 deletion docs/reference/config/shared/training_output_config.md

This file was deleted.

7 changes: 2 additions & 5 deletions mkdocs.yml
@@ -48,11 +48,8 @@ nav:
     - Textual Inversion SD Config: reference/config/pipelines/textual_inversion_sd_config.md
     - Textual Inversion SDXL Config: reference/config/pipelines/textual_inversion_sdxl_config.md
   - shared:
-    - data:
-      - data_loader_config: reference/config/shared/data/data_loader_config.md
-      - dataset_config: reference/config/shared/data/dataset_config.md
-      - transform_config: reference/config/shared/data/transform_config.md
-    - training_output_config: reference/config/shared/training_output_config.md
+    - data_loader_config: reference/config/shared/data/data_loader_config.md
+    - dataset_config: reference/config/shared/data/dataset_config.md
     - optimizer_config: reference/config/shared/optimizer_config.md
 - Contributing:
   - contributing/development_environment.md
19 changes: 15 additions & 4 deletions src/invoke_training/config/_experimental/dpo/config.py
@@ -4,8 +4,6 @@

 from invoke_training.config.pipelines.finetune_lora_config import LoRATrainingConfig
 from invoke_training.config.shared.config_base_model import ConfigBaseModel
-from invoke_training.config.shared.data.transform_config import SDImageTransformConfig
-from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig
 
 
 class HFHubImagePairPreferenceDatasetConfig(ConfigBaseModel):
@@ -28,7 +26,20 @@ class ImagePairPreferenceSDDataLoaderConfig(ConfigBaseModel):
         Union[HFHubImagePairPreferenceDatasetConfig, ImagePairPreferenceDatasetConfig], Field(discriminator="type")
     ]
 
-    image_transforms: SDImageTransformConfig
+    resolution: int | tuple[int, int] = 512
+    """The resolution for input images. Either a scalar integer representing the square resolution height and width, or
+    a (height, width) tuple. All of the images in the dataset will be resized to this resolution unless the
+    `aspect_ratio_buckets` config is set.
+    """
+
+    center_crop: bool = True
+    """If True, input images will be center-cropped to the target resolution.
+    If False, input images will be randomly cropped to the target resolution.
+    """
+
+    random_flip: bool = False
+    """Whether random flip augmentations should be applied to input images.
+    """
 
     dataloader_num_workers: int = 0
     """Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
@@ -37,7 +48,7 @@ class ImagePairPreferenceSDDataLoaderConfig(ConfigBaseModel):

 class DirectPreferenceOptimizationLoRASDConfig(LoRATrainingConfig):
     type: Literal["DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD"] = "DIRECT_PREFERENCE_OPTIMIZATION_LORA_SD"
-    optimizer: OptimizerConfig
+
     data_loader: ImagePairPreferenceSDDataLoaderConfig
 
     initial_lora: str | None = None
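In the YAML configs, the image transform options that previously sat under image_transforms now go directly on the data loader. A minimal sketch of the flattened section (values are illustrative; per the new resolution docstring, either a scalar or a (height, width) pair should work, the list form assuming the usual YAML-list-to-tuple coercion):

    data_loader:
      type: IMAGE_PAIR_PREFERENCE_SD_DATA_LOADER
      dataset:
        type: IMAGE_PAIR_PREFERENCE_DATASET
        dataset_dir: output/pokemon_pairs
      resolution: 512            # or a (height, width) pair such as [768, 512]
      center_crop: True
      random_flip: False
      dataloader_num_workers: 4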
12 changes: 8 additions & 4 deletions src/invoke_training/config/pipelines/base_pipeline_config.py
@@ -1,7 +1,7 @@
+import typing
 from typing import Optional
 
 from invoke_training.config.shared.config_base_model import ConfigBaseModel
-from invoke_training.config.shared.training_output_config import TrainingOutputConfig
 
 
 class BasePipelineConfig(ConfigBaseModel):
@@ -14,8 +14,12 @@ class BasePipelineConfig(ConfigBaseModel):
     set to `null`, training will be non-deterministic.
     """
 
-    output: TrainingOutputConfig
-    """Configuration for the training run outputs (output directory, log format, checkpoint format, etc.).
+    base_output_dir: str
+    """The output directory where the training outputs (model checkpoints, logs, intermediate predictions) will be
+    written. A subdirectory will be created with a timestamp for each new training run.
+    """
 
-    See [`TrainingOutputConfig`][invoke_training.config.shared.training_output_config.TrainingOutputConfig] for details.
+    report_to: typing.Literal["all", "tensorboard", "wandb", "comet_ml"] = "tensorboard"
+    """The integration to report results and logs to. This value is passed to Hugging Face Accelerate. See
+    `accelerate.Accelerator.log_with` for more details.
     """
18 changes: 11 additions & 7 deletions src/invoke_training/config/pipelines/finetune_lora_config.py
@@ -1,3 +1,4 @@
+import typing
 from typing import Annotated, Literal, Optional, Union
 
 from pydantic import Field
@@ -7,7 +8,7 @@
     DreamboothSDDataLoaderConfig,
     ImageCaptionSDDataLoaderConfig,
 )
-from invoke_training.config.shared.optimizer.optimizer_config import OptimizerConfig
+from invoke_training.config.shared.optimizer.optimizer_config import AdamOptimizerConfig, ProdigyOptimizerConfig
 
 
 class LoRATrainingConfig(BasePipelineConfig):
@@ -51,6 +52,8 @@ class LoRATrainingConfig(BasePipelineConfig):
"""Whether to add LoRA layers to the text encoder and train it.
"""

optimizer: AdamOptimizerConfig | ProdigyOptimizerConfig = AdamOptimizerConfig()

text_encoder_learning_rate: Optional[float] = None
"""The learning rate to use for the text encoder model. If set, this overrides the optimizer's default learning
rate.
@@ -60,10 +63,13 @@ class LoRATrainingConfig(BasePipelineConfig):
"""The learning rate to use for the UNet model. If set, this overrides the optimizer's default learning rate.
"""

train_unet_non_attention_blocks: bool = False
"""Whether to inject LoRA layers into the non-attention UNet blocks for training. Enabling will produce a more
expressive LoRA model at the cost of slower training, higher training VRAM requirements, and a larger LoRA weight
file.
lr_scheduler: typing.Literal[
"linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"
] = "constant"

lr_warmup_steps: int = 0
"""The number of warmup steps in the learning rate scheduler. Only applied to schedulers that support warmup.
See lr_scheduler.
"""

lora_rank_dim: int = 4
@@ -159,15 +165,13 @@ class LoRATrainingConfig(BasePipelineConfig):

 class FinetuneLoRASDConfig(LoRATrainingConfig):
     type: Literal["FINETUNE_LORA_SD"] = "FINETUNE_LORA_SD"
-    optimizer: OptimizerConfig
     data_loader: Annotated[
         Union[ImageCaptionSDDataLoaderConfig, DreamboothSDDataLoaderConfig], Field(discriminator="type")
     ]
 
 
 class FinetuneLoRASDXLConfig(LoRATrainingConfig):
     type: Literal["FINETUNE_LORA_SDXL"] = "FINETUNE_LORA_SDXL"
-    optimizer: OptimizerConfig
     data_loader: Annotated[
         Union[ImageCaptionSDDataLoaderConfig, DreamboothSDDataLoaderConfig], Field(discriminator="type")
     ]
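With these fields, a LoRA pipeline config now picks the optimizer type inside optimizer: and sets the learning-rate schedule beside it rather than inside it. A minimal sketch (values mirror the Prodigy examples above; lr_scheduler defaults to constant and lr_warmup_steps to 0):

    optimizer:
      optimizer_type: Prodigy    # or AdamW
      learning_rate: 1.0
      weight_decay: 0.01
      use_bias_correction: True
      safeguard_warmup: True

    lr_scheduler: constant       # only schedulers that support warmup use lr_warmup_steps
    lr_warmup_steps: 0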