Controlnet vsd #279

Open · wants to merge 24 commits into main
16 changes: 15 additions & 1 deletion DOCUMENTATION.md
@@ -409,7 +409,7 @@ For the first three options, you can check more details in [pipe_stable_diffusio

No specific configuration.

## stable-diffusion-vsd-guidance
### stable-diffusion-vsd-guidance

| name | type | description |
| ---------------------------------- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -420,6 +420,20 @@ No specific configuration.
| anneal_start_step | Optional[int] | If specified, denotes at which step to perform t annealing. Default: 5000 |
| camera_condition_type | str | Which to use as the camera condition for the LoRA model, in ["extrinsics", "mvp"]. Default: "extrinsics" |

### controlnet-guidance

View-dependent prompting is currently disabled for ControlNet guidance.

| name | type | description |
| ---------------------------------- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| control_type | str | Type of control. If "normal", a NormalBae detector estimates the normal map from the input image. If "canny", a Canny edge detector is used, and you may want to specify canny_lower_bound and canny_upper_bound. If "input_normal", the view-space normal from the geometry is used directly. |
| condition_scale | float | Strength of the conditional control input on the ControlNet model. Default: 1.5 |
| ddim_scheduler_name_or_path | str | Name or path of the DDIM scheduler. |
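
As an illustrative sketch only (the scheduler path and Canny bounds are placeholder values, not recommendations), these options map onto a guidance block like:

```yaml
guidance_type: "stable-diffusion-controlnet-guidance"
guidance:
  control_type: "canny"
  canny_lower_bound: 50   # only relevant when control_type is "canny"
  canny_upper_bound: 100
  condition_scale: 1.5
  ddim_scheduler_name_or_path: "runwayml/stable-diffusion-v1-5"
```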

### controlnet-vsd-guidance

No specific configuration beyond those in stable-diffusion-vsd-guidance and controlnet-guidance.
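
For reference, a combined block might look like the following sketch (values are illustrative, mirroring the debugging config added in this PR; camera_condition_type comes from the VSD guidance options):

```yaml
guidance_type: "stable-diffusion-controlnet-vsd-guidance"
guidance:
  control_type: "input_normal"
  condition_scale: 2.0
  guidance_scale: 7.5
  camera_condition_type: "extrinsics"
```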

## Prompt Processors

Prompt processors take a user prompt and compute text embeddings for training. The type of the prompt processor should match that of the guidance.
3 changes: 3 additions & 0 deletions README.md
@@ -399,6 +399,9 @@ python launch.py --config configs/fantasia3d.yaml --train --gpu 0 system.prompt_
# --------- Texture --------- #
# to train PBR texture continued from a geometry checkpoint:
python launch.py --config configs/fantasia3d-texture.yaml --train --gpu 0 system.prompt_processor.prompt="a DSLR photo of an ice cream sundae" system.geometry_convert_from=path/to/geometry/stage/trial/dir/ckpts/last.ckpt
# to train PBR texture from a custom (fixed) geometry, first edit fantasia3d-texture.yaml (see comments there):
python launch.py --config configs/fantasia3d-texture.yaml --train --gpu 0 system.prompt_processor.prompt="a girl" system.geometry.shape_init=mesh:/path/to/custom/geometry/girl.ply
# You can also get stronger control by using ControlNet with the normal map from geometry; see comments in fantasia3d-texture.yaml.
```

**Tips**
162 changes: 162 additions & 0 deletions configs/debugging/fantasia3d-texture-vsd.yaml
@@ -0,0 +1,162 @@
name: "fantasia3d-texture-vsd"
tag: "${rmspace:${system.prompt_processor.prompt},_}"
exp_root_dir: "outputs"
seed: 0

data_type: "random-camera-datamodule"
data:
  batch_size: 1
  width: 512
  height: 512
  camera_distance_range: [3, 3]
  fovy_range: [25, 45]
  camera_perturb: 0.
  center_perturb: 0.
  up_perturb: 0.
  elevation_range: [-10, 45]
  azimuth_range: [-180, 180]
  batch_uniform_azimuth: true
  eval_camera_distance: 3.
  eval_fovy_deg: 45.

system_type: "fantasia3d-vsd-system"
system:
  # do texture training
  texture: true

  # If using geometry from previous training
  # geometry_convert_from: ???
  # geometry_convert_inherit_texture: false
  # geometry_type: "tetrahedra-sdf-grid"
  # geometry:
  #   radius: 1.0 # consistent with coarse
  #   isosurface_resolution: 128
  #   isosurface_deformable_grid: true
  #   pos_encoding_config:
  #     otype: HashGrid
  #     n_levels: 16
  #     n_features_per_level: 2
  #     log2_hashmap_size: 19
  #     base_resolution: 16
  #     per_level_scale: 1.4472692374403782 # max resolution 4096
  #   n_feature_dims: 8 # albedo3 + roughness1 + metallic1 + bump3
  #   fix_geometry: true

  # If using custom mesh
  geometry_type: "custom-mesh"
  geometry:
    shape_init: ???
    shape_init_params: 1.0
    radius: 1.0 # consistent with coarse
    pos_encoding_config:
      otype: HashGrid
      n_levels: 16
      n_features_per_level: 2
      log2_hashmap_size: 19
      base_resolution: 16
      per_level_scale: 1.4472692374403782 # max resolution 4096
    n_feature_dims: 8 # albedo3 + roughness1 + metallic1 + bump3
    shape_init_mesh_up: +y
    shape_init_mesh_front: +z

  material_type: "pbr-material"
  material:
    material_activation: sigmoid
    environment_texture: "load/lights/mud_road_puresky_1k.hdr"
    environment_scale: 2.0
    min_metallic: 0.0
    max_metallic: 0.9
    min_roughness: 0.08
    max_roughness: 0.9
    use_bump: true
  # material_type: "diffuse-with-point-light-material"
  # material:
  #   ambient_only_steps: 200100
  #   soft_shading: true

  background_type: "solid-color-background"

  renderer_type: "nvdiff-rasterizer"

  # prompt_processor_type: "stable-diffusion-prompt-processor"
  # prompt_processor:
  #   pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"
  #   prompt: ???

  # guidance_type: "stable-diffusion-guidance"
  # guidance:
  #   pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"
  #   guidance_scale: 100
  #   weighting_strategy: sds
  #   min_step_percent: 0.02
  #   max_step_percent: 0.50

  # If using controlnet guidance:
  # prompt_processor_type: "stable-diffusion-prompt-processor"
  # prompt_processor:
  #   pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"

  # guidance_type: "stable-diffusion-controlnet-guidance"
  # guidance:
  #   control_type: "input_normal"
  #   min_step_percent: 0.02
  #   max_step_percent: 0.98
  #   condition_scale: 1.5
  #   guidance_scale: 20
  #   use_sds: true

  # If using controlnet vsd guidance:
  prompt_processor_type: "stable-diffusion-prompt-processor"
  prompt_processor:
    pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
    # pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"

  guidance_type: "stable-diffusion-controlnet-vsd-guidance"
  guidance:
    # pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
    # pretrained_model_name_or_path_lora: "runwayml/stable-diffusion-v1-5"
    control_type: "input_normal"
    # control_type: "canny"
    min_step_percent: 0.02
    max_step_percent: 0.50
    condition_scale: 2
    guidance_scale: 7.5
    use_sds: true

  loggers:
    wandb:
      enable: false
      project: "threestudio"

  loss:
    lambda_sds: 1.
    lambda_lora: 1.
    lambda_normal_consistency: 0.

  optimizer:
    name: AdamW
    args:
      betas: [0.9, 0.99]
      eps: 1.e-15
    params:
      geometry:
        lr: 0.01
      material:
        lr: 0.01
      background:
        lr: 0.01
      guidance:
        lr: 0.0001

trainer:
  max_steps: 5000
  log_every_n_steps: 1
  num_sanity_val_steps: 1
  val_check_interval: 100
  enable_progress_bar: true
  precision: 16-mixed

checkpoint:
  save_last: true # save at each validation time
  save_top_k: -1
  every_n_train_steps: ${trainer.max_steps}
19 changes: 17 additions & 2 deletions configs/fantasia3d-texture.yaml
@@ -46,7 +46,7 @@ system:
  # geometry_type: "custom-mesh"
  # geometry:
  #   shape_init: ???
  #   radius: 1.0 # consistent with coarse
  #   shape_init_params: 0.9
  #   pos_encoding_config:
  #     otype: HashGrid
  #     n_levels: 16
@@ -91,13 +91,27 @@ system:

  # guidance_type: "stable-diffusion-controlnet-guidance"
  # guidance:
  #   control_type: "normal"
  #   control_type: "input_normal"
  #   min_step_percent: 0.02
  #   max_step_percent: 0.50
  #   condition_scale: 1.0
  #   guidance_scale: 100
  #   use_sds: true

  # If using controlnet vsd guidance:
  # prompt_processor_type: "stable-diffusion-prompt-processor"
  # prompt_processor:
  #   pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"

  # guidance_type: "stable-diffusion-controlnet-vsd-guidance"
  # guidance:
  #   control_type: "input_normal"
  #   min_step_percent: 0.02
  #   max_step_percent: 0.50
  #   condition_scale: 2.0
  #   guidance_scale: 7.5
  #   use_sds: true

  loggers:
    wandb:
@@ -106,6 +120,7 @@ system:

  loss:
    lambda_sds: 1.
    lambda_lora: 1.
    lambda_normal_consistency: 0.

  optimizer:
1 change: 1 addition & 0 deletions threestudio/models/guidance/__init__.py
@@ -1,5 +1,6 @@
from . import (
    controlnet_guidance,
    controlnet_vsd_guidance,
    deep_floyd_guidance,
    instructpix2pix_guidance,
    stable_diffusion_guidance,
9 changes: 8 additions & 1 deletion threestudio/models/guidance/controlnet_guidance.py
@@ -54,7 +54,7 @@ def configure(self) -> None:
        threestudio.info(f"Loading ControlNet ...")

        controlnet_name_or_path: str
        if self.cfg.control_type == "normal":
        if self.cfg.control_type in ("normal", "input_normal"):
            controlnet_name_or_path = "lllyasviel/control_v11p_sd15_normalbae"
        elif self.cfg.control_type == "canny":
            controlnet_name_or_path = "lllyasviel/control_v11p_sd15_canny"
@@ -293,6 +293,13 @@ def prepare_image_cond(self, cond_rgb: Float[Tensor, "B H W C"]):
            control = control.unsqueeze(-1).repeat(1, 1, 3)
            control = control.unsqueeze(0)
            control = control.permute(0, 3, 1, 2)
        elif self.cfg.control_type == "input_normal":
            cond_rgb[..., 0] = (
                1 - cond_rgb[..., 0]
            )  # Flip the sign on the x-axis to match bae system
            control = cond_rgb.permute(0, 3, 1, 2)
        else:
            raise ValueError(f"Unknown control type: {self.cfg.control_type}")

        return F.interpolate(control, (512, 512), mode="bilinear", align_corners=False)
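
The "input_normal" branch above can be sketched standalone (a plain-Python stand-in for the tensor ops; the [0, 1] normal encoding and the x-channel flip follow the diff, everything else is illustrative):

```python
def prepare_input_normal(cond_rgb):
    """Use a rendered view-space normal map directly as the ControlNet
    condition: flip the x channel (normals encoded in [0, 1]) so it matches
    the BAE convention that control_v11p_sd15_normalbae was trained on.

    cond_rgb: H x W grid (list of rows) of (x, y, z) pixels in [0, 1].
    """
    return [[(1.0 - x, y, z) for (x, y, z) in row] for row in cond_rgb]


# a 2x2 normal map, each pixel an (x, y, z) normal encoded in [0, 1]
normal_map = [[(0.25, 0.5, 1.0), (0.75, 0.5, 1.0)]] * 2
control = prepare_input_normal(normal_map)
print(control[0][0])  # (0.75, 0.5, 1.0)
```

In the actual guidance code this runs on a BHWC tensor, followed by a permute to BCHW and a bilinear resize to 512x512 before being fed to the ControlNet.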
