kakaobrain · vkinakh · Mar 6, 2023 · Mar 6, 2023
diff --git a/README.md b/README.md
@@ -118,9 +118,12 @@ wget https://arena.kakaocdn.net/brainrepo/models/karlo-public/v1.0.0.alpha/4226b
 ## Sampling
 
 ### Gradio demo (T2I and Image variation)
-The following command launches gradio demo for text-to-image generation and image variation. We notice that the second run in the gradio is unexpectedly slower than the usual case in PyTorch>=1.12. We guess that this happens because launching the cuda kernels takes some time, usually up to 2 minutes.
+The following command launches the Gradio demo for text-to-image generation and image variation.
+We notice that the second run in the Gradio demo is unexpectedly slower than usual with PyTorch>=1.12.
+We guess that this happens because launching the CUDA kernels takes some time, usually up to 2 minutes.
+Add `--use_bf16` to run inference in `bfloat16` precision, which lets the demo fit on a single 11GB GPU.
 ```
-python demo/product_demo.py --host 0.0.0.0 --port $PORT --root-dir $KARLO_ROOT_DIR
+python demo/product_demo.py --host 0.0.0.0 --port $PORT --root-dir $KARLO_ROOT_DIR [--use_bf16]
 ```
 
 Samples below are non-cherry picked T2I and image variation examples of random seed 0.
@@ -145,12 +148,14 @@ In each case, the first row shows T2I samples and the second shows the image var
 
 ### T2I command line example
 Here, we include the command line example of T2I. For image variation, you can refer to [karlo/sampler/i2i.py](karlo/sampler/i2i.py) on how to replace the prior into the clip image feature.
+Add `--use_bf16` to run inference in `bfloat16` precision, which lets the model fit on a single 11GB GPU.
 ```python
 python example.py --root-dir=$KARLO_ROOT_DIR \
                   --prompt="A man with a face of avocado, in the drawing style of Rene Magritte" \
                   --output-dir=$OUTPUT_DIR \
                   --max-bsz=2 \
-                  --sampling-type=fast
+                  --sampling-type=fast \
+                  --use_bf16
 ```
 
 ## Licence and Disclaimer

diff --git a/demo/components.py b/demo/components.py
@@ -43,19 +43,21 @@ def __init__(
         max_bsz,
         progressive,
         sampling_type: str,
+        use_bf16: bool = False,
     ):
         self._root_dir = root_dir
         self._max_bsz = max_bsz
         self._progressive = progressive
         self._sampling_type = sampling_type
+        self._use_bf16 = use_bf16
 
         self.load_ckpt()
         self.set_options_from_sampler()
 
         self.result_queue = Queue()
 
     def load_ckpt(self):
-        base_sampler = BaseSampler(root_dir=self._root_dir)
+        base_sampler = BaseSampler(root_dir=self._root_dir, use_bf16=self._use_bf16)
         base_sampler.load_clip(clip_path="ViT-L-14.pt")
         base_sampler.load_prior(
             f"{CKPT_PATH['prior']}",
@@ -65,10 +67,10 @@ def load_ckpt(self):
         base_sampler.load_sr_64_256(f"{CKPT_PATH['sr_256']}")
 
         self.t2i_sampler = T2ISampler(
-            root_dir=self._root_dir, sampling_type=self._sampling_type
+            root_dir=self._root_dir, sampling_type=self._sampling_type, use_bf16=self._use_bf16
         )
         self.i2i_sampler = I2ISampler(
-            root_dir=self._root_dir, sampling_type=self._sampling_type
+            root_dir=self._root_dir, sampling_type=self._sampling_type, use_bf16=self._use_bf16
         )
 
         self.t2i_sampler._clip = base_sampler._clip

diff --git a/demo/product_demo.py b/demo/product_demo.py
@@ -22,12 +22,14 @@ def __init__(
         max_bsz: int,
         progressive: str,
         sampling_type: str,
+        use_bf16: bool = False,
     ):
         sampler = GradioSampler(
             root_dir=root_dir,
             max_bsz=max_bsz,
             progressive=progressive,
             sampling_type=sampling_type,
+            use_bf16=use_bf16,
         )
 
         demo = gr.Blocks()
@@ -102,7 +104,12 @@ def default_parser():
         default="fast",
         choices=("fast", "default"),
     )
-
+    parser.add_argument(
+        "--use_bf16",
+        action="store_true",
+        default=False,
+        help="If true, use bf16 for inference."
+    )
     return parser
 
 
@@ -121,5 +128,6 @@ def default_parser():
         max_bsz=args.max_bsz,
         progressive=args.progressive,
         sampling_type=args.sampling_type,
+        use_bf16=args.use_bf16,
     )
     gradio_demo.demo.launch(server_name=args.host, server_port=args.port)
diff --git a/example.py b/example.py
@@ -37,6 +37,12 @@ def default_parser():
     parser.add_argument(
         "--prompt", type=str, default="A photo of a baby puppy waiting for her mom."
     )
+    parser.add_argument(
+        "--use_bf16",
+        action="store_true",
+        default=False,
+        help="If true, use bf16 for inference."
+    )
     parser.add_argument("--seed", type=int, default=0)
 
     return parser
@@ -58,6 +64,7 @@ def default_parser():
         clip_model_path="ViT-L-14.pt",
         clip_stat_path="ViT-L-14_stats.th",
         sampling_type=args.sampling_type,
+        use_bf16=args.use_bf16,
     )
 
     for i in range(5):

diff --git a/karlo/sampler/i2i.py b/karlo/sampler/i2i.py
@@ -19,14 +19,16 @@ class I2ISampler(BaseSampler):
 
     :param root_dir: directory for model checkpoints.
     :param sampling_type: ["default", "fast"]
+    :param use_bf16: If true, use bf16 for inference.
     """
 
     def __init__(
         self,
         root_dir: str,
         sampling_type: str = "default",
+        use_bf16: bool = False,
     ):
-        super().__init__(root_dir, sampling_type)
+        super().__init__(root_dir, sampling_type, use_bf16)
 
     @classmethod
     def from_pretrained(
@@ -35,11 +37,13 @@ def from_pretrained(
         clip_model_path: str,
         clip_stat_path: str,
         sampling_type: str = "default",
+        use_bf16: bool = False,
     ):
 
         model = cls(
             root_dir=root_dir,
             sampling_type=sampling_type,
+            use_bf16=use_bf16,
         )
         model.load_clip(clip_model_path)
         model.load_decoder(f"{CKPT_PATH['decoder']}")

diff --git a/karlo/sampler/t2i.py b/karlo/sampler/t2i.py
@@ -18,14 +18,16 @@ class T2ISampler(BaseSampler):
 
     :param root_dir: directory for model checkpoints.
     :param sampling_type: ["default", "fast"]
+    :param use_bf16: If true, use bf16 for inference.
     """
 
     def __init__(
         self,
         root_dir: str,
         sampling_type: str = "default",
+        use_bf16: bool = False,
     ):
-        super().__init__(root_dir, sampling_type)
+        super().__init__(root_dir, sampling_type, use_bf16)
 
     @classmethod
     def from_pretrained(
@@ -34,11 +36,13 @@ def from_pretrained(
         clip_model_path: str,
         clip_stat_path: str,
         sampling_type: str = "default",
+        use_bf16: bool = False,
     ):
 
         model = cls(
             root_dir=root_dir,
             sampling_type=sampling_type,
+            use_bf16=use_bf16,
         )
         model.load_clip(clip_model_path)
         model.load_prior(

diff --git a/karlo/sampler/template.py b/karlo/sampler/template.py
@@ -50,6 +50,7 @@ def __init__(
         self,
         root_dir: str,
         sampling_type: str = "fast",
+        use_bf16: bool = False,
     ):
         self._root_dir = root_dir
 
@@ -64,6 +65,7 @@ def __init__(
         self._decoder_cf_scale = sampling_type["decoder_cf_scale"]
 
         self._sr_sm = sampling_type["sr_sm"]
+        self._use_bf16 = use_bf16
 
     def __repr__(self):
         line = ""
@@ -77,10 +79,11 @@ def load_clip(self, clip_path: str):
         clip = CustomizedCLIP.load_from_checkpoint(
             os.path.join(self._root_dir, clip_path)
         )
+        if self._use_bf16:
+            clip.bfloat16()
         clip = torch.jit.script(clip)
         clip.cuda()
         clip.eval()
-
         self._clip = clip
         self._tokenizer = CustomizedTokenizer()
 
@@ -104,6 +107,8 @@ def load_prior(
             os.path.join(self._root_dir, ckpt_path),
             strict=True,
         )
+        if self._use_bf16:
+            prior.bfloat16()
         prior.cuda()
         prior.eval()
         logging.info("done.")
@@ -120,6 +125,8 @@ def load_decoder(self, ckpt_path: str):
             os.path.join(self._root_dir, ckpt_path),
             strict=True,
         )
+        if self._use_bf16:
+            decoder.bfloat16()
         decoder.cuda()
         decoder.eval()
         logging.info("done.")
@@ -133,6 +140,9 @@ def load_sr_64_256(self, ckpt_path: str):
         sr = self._SR256_CLASS.load_from_checkpoint(
             config, os.path.join(self._root_dir, ckpt_path), strict=True
         )
+        # Cast only when requested, like load_clip/load_prior/load_decoder, so the default path keeps the checkpoint's precision.
+        if self._use_bf16:
+            sr.bfloat16()
         sr.cuda()
         sr.eval()
         logging.info("done.")