From 0c482d01669ecacc5e2143d68d83cecb22935f79 Mon Sep 17 00:00:00 2001
From: Enigmatisms <984041003@qq.com>
Date: Wed, 24 Aug 2022 21:19:42 +0800
Subject: [PATCH] Ref NeRF good to go

---
 README.md          |  4 ++--
 Updates.md         |  4 ++++
 mkdir.sh           | 14 ++++++++++++++
 py/addtional.py    |  2 +-
 py/dataset.py      | 14 +++++++++++---
 py/procedures.py   | 43 ++++++++++++++++++++++++++++++-------------
 train.py           | 21 ++++++++++-----------
 train.sh           | 13 +++++++++++++
 train_helmet_v1.sh |  1 -
 9 files changed, 85 insertions(+), 31 deletions(-)
 create mode 100755 mkdir.sh
 create mode 100755 train.sh
 delete mode 100644 train_helmet_v1.sh

diff --git a/README.md b/README.md
index bd5f1be..e44c333 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,9 @@ If you are interested in the implementation of NeRFs and you don't want to read
 - If you are interested in CUDA implementations (there were, once), please refer to: [issue#4](https://github.com/Enigmatisms/NeRF/issues/4) and [issue#6](https://github.com/Enigmatisms/NeRF/issues/6)
 
 Some Ref NeRF results:
-(Latest-commit `e4907564`) Shinny blender "helmet" dataset trained for 3 hours (not completed, PSNR around 18). Oops, gif file to big to upload. Fine, just imagine the output, its better than the older commit (in terms of normal prediction)
+(Latest-commit `e4907564`) Shiny Blender "helmet" dataset trained for 3 hours (not completed, PSNR around 27). Oops, the GIF file is too big to upload. Just imagine the output: it is better than the older commit (in terms of normal prediction).
 
-(Older-commit `847fdb9d`) Shinny blender "helmet" dataset trained for 6-7 hours (not completed, PSNR around 19.5)
+(Older-commit `847fdb9d`) Shiny Blender "helmet" dataset trained for 6-7 hours (not completed, PSNR around 28.5).
 
 ![ezgif-1-8207b1faa2](https://user-images.githubusercontent.com/46109954/185753069-d5cbd05e-1f66-4423-9503-1a5cd126ed89.gif)
 
diff --git a/Updates.md b/Updates.md
index 066e613..8b65264 100644
--- a/Updates.md
+++ b/Updates.md
@@ -2,6 +2,10 @@
 
 ---
 
+### 8.24 Update
+
+It turns out that the PSNR calculation was to blame. My loss function is not MSE (from which PSNR is computed) but SoftL1 (`sqrt(e^2 + ε^2)`), which is much larger than the corresponding MSE, so the reported "PSNR" was misleadingly low (around 19). Computed correctly, the PSNR of the current model (trained for only 7 hours) is around 28.5.
+
 ### 8.19 Update
 
 CVPR 2022 best student honorable mention: [Ref-NeRF: Structured View-Dependent Appearance for Neural Radiance Fields](https://arxiv.org/abs/2112.03907) is implemented in this repo. This repo can turn Ref NeRF part on/off with one flag: `-t`. Ref NeRF is implemented upon (proposal network + NeRF) framework. Currently, the result is not so satisfying as I expected. This may be caused by insufficient time for training (limited training device, 6GB global memory, can only use up to batch size 2^9 (rays), while the paper uses 2^14).
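To make the 8.24 note above concrete: the PSNR formula assumes its input is a true MSE, so feeding it a SoftL1 value (roughly the mean absolute error when the error dwarfs ε) roughly halves the reported dB figure. Below is a minimal standalone sketch of the effect; `psnr_from_mse` is a hypothetical helper standing in for the repo's `LossPSNR`, not code from this patch:

```python
import torch

def soft_l1(pred, target, eps = 0.001):
    # The pre-commit SoftL1: sqrt(e^2 + eps^2), close to |e| when |e| >> eps
    return torch.mean(torch.sqrt(eps ** 2 + (pred - target) ** 2))

def psnr_from_mse(mse):
    # Hypothetical stand-in for LossPSNR: for images in [0, 1], PSNR = -10 * log10(MSE)
    return -10.0 * torch.log10(mse)

pred = torch.rand(3, 64, 64)
target = pred + 0.03 * torch.randn_like(pred)   # ~0.03 RMS reconstruction error
mse = torch.mean((pred - target) ** 2)          # what the commit makes SoftL1Loss return
print(psnr_from_mse(mse))                       # ~30 dB: the correct PSNR
print(psnr_from_mse(soft_l1(pred, target)))     # ~16 dB: deflated, since SoftL1 >> MSE here
```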
diff --git a/mkdir.sh b/mkdir.sh
new file mode 100755
index 0000000..688d293
--- /dev/null
+++ b/mkdir.sh
@@ -0,0 +1,14 @@
+if [ ! -d output ]; then
+    echo "Creating folder \"output\" and \"output/sphere\"";
+    mkdir -p output/sphere
+fi
+
+if [ ! -d check_points ]; then
+    echo "Creating folder \"check_points\"";
+    mkdir check_points
+fi
+
+if [ ! -d model ]; then
+    echo "Creating folder \"model\"";
+    mkdir model
+fi
\ No newline at end of file
diff --git a/py/addtional.py b/py/addtional.py
index d8647e9..4c0a66b 100644
--- a/py/addtional.py
+++ b/py/addtional.py
@@ -41,7 +41,7 @@ def __init__(self, epsilon = 0.001) -> None:
         super().__init__()
         self.eps = epsilon
     def forward(self, pred:torch.Tensor, target:torch.Tensor):
-        return torch.mean(torch.sqrt(self.eps ** 2 + (pred - target) ** 2))
+        return torch.mean((pred - target) ** 2)
 
 class LossPSNR(nn.Module):
     __LOG_10__ = 2.3025851249694824
diff --git a/py/dataset.py b/py/dataset.py
index be274e2..38ab926 100644
--- a/py/dataset.py
+++ b/py/dataset.py
@@ -32,7 +32,7 @@ def forward(self, input:Image.Image):
         return F.resize(input, size, self.interpolation, self.max_size, self.antialias)
 
 class CustomDataSet(data.Dataset):
-    def __init__(self, root_dir, transform, scene_scale = 1.0, is_train = True, use_alpha = False, white_bkg = False):
+    def __init__(self, root_dir, transform, scene_scale = 1.0, is_train = True, use_alpha = False, white_bkg = False, is_cuda = False):
         self.is_train = is_train
         self.root_dir = root_dir
         self.main_dir = root_dir + ("train/" if is_train else "test/")
@@ -43,6 +43,7 @@ def __init__(self, root_dir, transform, scene_scale = 1.0, is_train = True, use_
         self.use_alpha = use_alpha
         self.scene_scale = scene_scale
         self.white_bkg = white_bkg
+        self.is_cuda = is_cuda
         self.cam_fov, self.tfs = self.__get_camera_param()
 
     def __len__(self):
@@ -52,16 +53,23 @@ def __getitem__(self, idx):
         img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
         image = Image.open(img_loc, mode = 'r').convert("RGBA" if self.use_alpha or self.white_bkg else "RGB")
         tensor_image = self.transform(image)
+        if self.is_cuda:
+            tensor_image = tensor_image.cuda()
+            tf = self.tfs[idx].cuda()
+        else:
+            tf = self.tfs[idx].clone()
         if self.white_bkg:
             tensor_image = tensor_image[:3, ...]*tensor_image[-1:, ...] + (1.-tensor_image[-1:, ...])
-        tf = self.tfs[idx].clone()
         tf[:3, -1] *= self.scene_scale
-        return tensor_image, tf
+        return tensor_image.squeeze(0), tf
 
     def r_c(self):
         image, _ = self.__getitem__(0)
         return image.shape[1], image.shape[2]
 
+    def cuda(self, flag = True):
+        self.is_cuda = flag
+
     @staticmethod
     def readFromJson(path:str):
         with open(path, "r") as file:
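Returning CUDA tensors straight from `__getitem__` is what lets `train.py` below drop its per-batch `.cuda().squeeze(0)` calls, but it carries a constraint worth noting: CUDA tensors cannot be created in forked `DataLoader` worker processes, so a dataset in this mode has to be read with `num_workers = 0`. A toy sketch of the pattern, assuming a CUDA device is available; `CudaDataset` is a hypothetical stand-in for `CustomDataSet`:

```python
import torch
from torch.utils import data

class CudaDataset(data.Dataset):
    # Hypothetical stand-in for CustomDataSet's new is_cuda behavior
    def __init__(self, n = 8, is_cuda = False):
        self.imgs = torch.rand(n, 3, 32, 32)
        self.is_cuda = is_cuda
    def __len__(self):
        return len(self.imgs)
    def __getitem__(self, idx):
        img = self.imgs[idx]
        # Move to GPU inside the dataset, mirroring the is_cuda branch above
        return img.cuda() if self.is_cuda else img.clone()

# num_workers must stay 0: forked workers cannot (re)initialize CUDA
loader = data.DataLoader(CudaDataset(is_cuda = True), batch_size = 1, num_workers = 0)
for img in loader:
    assert img.is_cuda   # already on device, no per-batch host-to-device copy
    break
```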
diff --git a/py/procedures.py b/py/procedures.py
index a70121b..cdf729b 100644
--- a/py/procedures.py
+++ b/py/procedures.py
@@ -9,7 +9,7 @@
 from py.ref_model import RefNeRF
 from torchvision import transforms
 from py.dataset import CustomDataSet, AdaptiveResize
-from py.addtional import ProposalNetwork
+from py.addtional import ProposalNetwork, SoftL1Loss, LossPSNR
 from torch.nn.functional import softplus
 from torchvision.utils import save_image
 from py.mip_methods import maxBlurFilter
@@ -107,8 +107,9 @@ def render_only(args, model_path: str, opt_level: str):
     use_white_bkg = args.white_bkg
     opt_mode = args.opt_mode
     use_ref_nerf = args.ref_nerf
-    render_normal = args.render_normal
-    render_depth = args.render_depth
+    eval_poses = args.eval_poses
+    render_normal = args.render_normal and (not eval_poses)
+    render_depth = args.render_depth and (not eval_poses)
     transform_funcs = transforms.Compose([
         AdaptiveResize(img_scale),
         transforms.ToTensor(),
@@ -117,23 +118,28 @@ def render_only(args, model_path: str, opt_level: str):
     cam_fov_test, _ = testset.getCameraParam()
     r_c = testset.r_c()
-    del testset
+    if eval_poses:
+        all_poses = testset.tfs.cuda()
+        loss_func = SoftL1Loss()
+        psnr_func = LossPSNR()
+    else:
+        all_poses = torch.stack([pose_spherical(angle, -30.0, 4.0) for angle in torch.linspace(-180,180,120 + 1)[:-1]], 0).cuda()
+        del testset
     test_focal = fov2Focal(cam_fov_test, r_c)
     if use_ref_nerf:
         from py.ref_model import RefNeRF
-        mip_net = RefNeRF(10, args.ide_level, hidden_unit = 256, perturb_bottle_neck_w = args.bottle_neck_noise, use_srgb = args.use_srgb).cuda()
+        mip_net = RefNeRF(10, args.ide_level, hidden_unit = args.nerf_net_width, perturb_bottle_neck_w = args.bottle_neck_noise, use_srgb = args.use_srgb).cuda()
     else:
         from py.mip_model import MipNeRF
-        mip_net = MipNeRF(10, 4, hidden_unit = 256)
-    prop_net = ProposalNetwork(10, hidden_unit = 256).cuda()
+        mip_net = MipNeRF(10, 4, hidden_unit = args.nerf_net_width)
+    prop_net = ProposalNetwork(10, hidden_unit = args.prop_net_width).cuda()
     if use_amp and opt_mode != "native":
         from apex import amp
         [mip_net, prop_net] = amp.initialize([mip_net, prop_net], None, opt_level = opt_level)
     mip_net.loadFromFile(load_path_mip, use_amp and opt_mode != "native")
     prop_net.loadFromFile(load_path_prop, use_amp and opt_mode != "native")
-    all_poses = torch.stack([pose_spherical(angle, -30.0, 4.0) for angle in torch.linspace(-180,180,120 + 1)[:-1]], 0).cuda()
     mip_net.eval()
     prop_net.eval()
     with torch.no_grad():
@@ -141,22 +147,32 @@ def render_only(args, model_path: str, opt_level: str):
         pose[:3, -1] *= scene_scale
         if opt_mode == "native":
             with autocast():
-                result = render_image(mip_net, prop_net, pose[:-1, :], r_c, test_focal, near_t, far_t, 128,
+                result = render_image(mip_net, prop_net, pose[:3, :], r_c, test_focal, near_t, far_t, 128,
                     white_bkg = use_white_bkg, render_normal = render_normal, render_depth = render_depth)
         else:
-            result = render_image(mip_net, prop_net, pose[:-1, :], r_c, test_focal, near_t, far_t, 128,
+            result = render_image(mip_net, prop_net, pose[:3, :], r_c, test_focal, near_t, far_t, 128,
                 white_bkg = use_white_bkg, render_normal = render_normal, render_depth = render_depth)
-        save_image(list(result.values()), "./output/sphere/result_%03d.png"%(i), nrow = 1 + render_depth + render_depth)
+        if eval_poses:
+            gt_img, _ = testset[i]
+            gt_img = gt_img.cuda()
+            loss = loss_func(result['rgb'], gt_img)
+            psnr = psnr_func(loss)
+            print("Image loss:%.6f\tPSNR:%.4f"%(loss.item(), psnr.item()))
+            result['gt_img'] = gt_img
+        output_dir = "given" if eval_poses else "sphere"
+        save_image(list(result.values()), "./output/%s/result_%03d.png"%(output_dir, i), nrow = 1 + render_normal + render_depth + eval_poses)
 
 def get_parser():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--epochs", type = int, default = 2000, help = "Training lasts for . epochs")
+    parser.add_argument("--epochs", type = int, default = 2400, help = "Number of training epochs")
     parser.add_argument("--sample_ray_num", type = int, default = 1024, help = " rays to sample per training time")
     parser.add_argument("--coarse_sample_pnum", type = int, default = 64, help = "Points to sample in coarse net")
     parser.add_argument("--fine_sample_pnum", type = int, default = 128, help = "Points to sample in fine net")
     parser.add_argument("--eval_time", type = int, default = 5, help = "Tensorboard output interval (train time)")
     parser.add_argument("--output_time", type = int, default = 20, help = "Image output interval (train time)")
     parser.add_argument("--center_crop_iter", type = int, default = 0, help = "Produce center")
+    parser.add_argument("--prop_net_width", type = int, default = 256, help = "Width of proposal network")
+    parser.add_argument("--nerf_net_width", type = int, default = 256, help = "Width of NeRF network")
     parser.add_argument("--near", type = float, default = 2., help = "Nearest sample depth")
     parser.add_argument("--far", type = float, default = 6., help = "Farthest sample depth")
     parser.add_argument("--center_crop_x", type = float, default = 0.5, help = "Center crop x axis ratio")
@@ -165,7 +181,7 @@ def get_parser():
     parser.add_argument("--dataset_name", type = str, default = "lego", help = "Input dataset name in nerf synthetic dataset")
     parser.add_argument("--img_scale", type = float, default = 0.5, help = "Scale of the image")
     parser.add_argument("--scene_scale", type = float, default = 1.0, help = "Scale of the scene")
-    parser.add_argument("--grad_clip", type = float, default = 1e-3, help = "Gradient clipping parameter")
+    parser.add_argument("--grad_clip", type = float, default = -0.01, help = "Gradient clipping parameter (a negative value disables clipping)")
     parser.add_argument("--pe_period_scale", type = float, default = 0.5, help = "Scale of positional encoding")
     # opt related
     parser.add_argument("--opt_mode", type = str, default = "O1", help = "Optimization mode: none, native (torch amp), O1, O2 (apex amp)")
@@ -184,6 +200,7 @@ def get_parser():
     parser.add_argument("-w", "--white_bkg", default = False, action = "store_true", help = "Output white background")
     parser.add_argument("-t", "--ref_nerf", default = False, action = "store_true", help = "Test Ref NeRF")
     parser.add_argument("-u", "--use_srgb", default = False, action = "store_true", help = "Whether to use srgb in the output or not")
+    parser.add_argument("-e", "--eval_poses", default = False, action = "store_true", help = "Render from test-set poses and evaluate against ground truth")
     # long bool options
     parser.add_argument("--render_depth", default = False, action = "store_true", help = "Render depth image")
     parser.add_argument("--render_normal", default = False, action = "store_true", help = "Render normal image")
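A small but easy-to-miss fix above is `pose[:-1, :]` → `pose[:3, :]` in the `render_image` calls. The two slices are identical for the 4x4 poses produced by `pose_spherical`, but `[:3, :]` also stays correct if a pose ever arrives as an already-truncated 3x4 matrix, where `[:-1, :]` would silently drop a row. A standalone illustration (not repo code):

```python
import torch

pose_4x4 = torch.eye(4)                                # homogeneous camera-to-world pose
assert torch.equal(pose_4x4[:-1, :], pose_4x4[:3, :])  # identical on 4x4 input

pose_3x4 = torch.eye(4)[:3]                            # already-truncated 3x4 pose
print(pose_3x4[:-1, :].shape)                          # torch.Size([2, 4]): a row is silently lost
print(pose_3x4[:3, :].shape)                           # torch.Size([3, 4]): shape-safe
```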
diff --git a/train.py b/train.py
index 53463d1..c455d93 100644
--- a/train.py
+++ b/train.py
@@ -69,11 +69,11 @@ def main(args):
         from py.ref_model import RefNeRF, WeightedNormalLoss, BackFaceLoss
         normal_loss_func = WeightedNormalLoss(True)
         bf_loss_func = BackFaceLoss()
-        mip_net = RefNeRF(10, args.ide_level, hidden_unit = 256, perturb_bottle_neck_w = args.bottle_neck_noise, use_srgb = args.use_srgb).cuda()
+        mip_net = RefNeRF(10, args.ide_level, hidden_unit = args.nerf_net_width, perturb_bottle_neck_w = args.bottle_neck_noise, use_srgb = args.use_srgb).cuda()
     else:
         from py.mip_model import MipNeRF
-        mip_net = MipNeRF(10, 4, hidden_unit = 256).cuda()
-    prop_net = ProposalNetwork(10, hidden_unit = 256).cuda()
+        mip_net = MipNeRF(10, 4, hidden_unit = args.nerf_net_width).cuda()
+    prop_net = ProposalNetwork(10, hidden_unit = args.prop_net_width).cuda()
 
     if debugging:
         for submodule in mip_net.modules():
@@ -92,8 +92,10 @@ def main(args):
     ])
     # load datasets
-    trainset = CustomDataSet("../dataset/refnerf/%s/"%(dataset_name), transform_funcs, scene_scale, True, use_alpha = False, white_bkg = use_white_bkg)
-    testset = CustomDataSet("../dataset/refnerf/%s/"%(dataset_name), transform_funcs, scene_scale, False, use_alpha = False, white_bkg = use_white_bkg)
+    trainset = CustomDataSet("../dataset/refnerf/%s/"%(dataset_name), transform_funcs,
+        scene_scale, True, use_alpha = False, white_bkg = use_white_bkg, is_cuda = True)
+    testset = CustomDataSet("../dataset/refnerf/%s/"%(dataset_name), transform_funcs,
+        scene_scale, False, use_alpha = False, white_bkg = use_white_bkg)
     cam_fov_train, train_cam_tf = trainset.getCameraParam()
     r_c = trainset.r_c()
     train_cam_tf = train_cam_tf.cuda()
@@ -126,6 +128,7 @@ def grad_clip_func(parameters, grad_clip):
     test_views = []
     for i in (1, 4):
         test_views.append(testset[i])
+    del testset
     torch.cuda.empty_cache()
 
     # ====== tensorboard summary writer ======
@@ -141,8 +144,6 @@ def grad_clip_func(parameters, grad_clip):
         epoch_timer.tic()
         for i, (train_img, train_tf) in enumerate(train_loader):
             train_timer.tic()
-            train_img = train_img.cuda().squeeze(0)
-            train_tf = train_tf.cuda().squeeze(0)
             now_crop = (center_crop if train_cnt < center_crop_iter else (1., 1.))
             valid_pixels, valid_coords = randomFromOneImage(train_img, now_crop)
@@ -228,7 +229,7 @@ def run(is_ref_model = False):
             with torch.no_grad():
                 eval_timer.tic()
                 test_results = []
-                test_loss = torch.zeros(1).cuda()
+                test_loss = 0.
                 for test_img, test_tf in test_views:
                     test_result = render_image(
                         mip_net, prop_net, test_tf.cuda(), r_c, test_focal, near_t, far_t, fine_sample_pnum,
@@ -242,9 +243,7 @@ def run(is_ref_model = False):
                 print("Evaluation in epoch: %4d / %4d\t, test counter: %d test loss: %.4f\taverage time: %.4lf\tremaining eval time:%s"%(
                     ep, epochs, test_cnt, test_loss.item() / 2, eval_timer.get_mean_time(), eval_timer.remaining_time(epochs - ep - 1)
                 ))
-                images_to_save = []
-                images_to_save.extend(test_results)
-                save_image(images_to_save, "./output/result_%03d.png"%(test_cnt), nrow = 1 + render_normal + render_depth)
+                save_image(test_results, "./output/result_%03d.png"%(test_cnt), nrow = 1 + render_normal + render_depth)
                 # ======== Saving checkpoints ========
                 saveModel(mip_net, "%schkpt_%d_mip.pt"%(default_chkpt_path, train_cnt), {"train_cnt": train_cnt, "epoch": ep}, opt = opt, amp = (amp) if use_amp and opt_mode != "native" else None)
                 saveModel(prop_net, "%schkpt_%d_prop.pt"%(default_chkpt_path, train_cnt), opt = None, amp = (amp) if use_amp and opt_mode != "native" else None)
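The `--grad_clip` default moves from `1e-3` to `-0.01`, using the convention that a negative threshold disables clipping (the deleted `train_helmet_v1.sh` below passed `--grad_clip -0.01` by hand for the same effect). The repo's `grad_clip_func` appears only as hunk context here, so the following is a hypothetical sketch of that convention, not the actual definition:

```python
import torch

def grad_clip_func(parameters, grad_clip: float):
    # Hypothetical sketch: clip the global gradient norm only when the
    # threshold is positive; the new default (-0.01) thus disables clipping.
    if grad_clip > 0.:
        torch.nn.utils.clip_grad_norm_(parameters, grad_clip)
```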
diff --git a/train.sh b/train.sh
new file mode 100755
index 0000000..82683cf
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,13 @@
+if [ "$1" = "" -o "$2" = "" ]; then
+    echo "Usage: ./train.sh <sample_ray_num> <dataset_name>"
+    echo "For example: ./train.sh 1024 car"
+    if [ "a$1" = "a" ]; then
+        echo "Please specify batch size (ray number)"
+    fi
+    if [ "a$2" = "a" ]; then
+        echo "Please specify dataset name (object name, for example: lego)"
+    fi
+    exit
+fi
+
+python3 ./train.py -s -t -u --sample_ray_num $1 --dataset_name $2 --render_depth --render_normal
\ No newline at end of file
diff --git a/train_helmet_v1.sh b/train_helmet_v1.sh
deleted file mode 100644
index 1e33c7b..0000000
--- a/train_helmet_v1.sh
+++ /dev/null
@@ -1 +0,0 @@
-python3 ./train.py -s -t -u --sample_ray_num 512 --grad_clip -0.01 --dataset_name helmet --render_depth --render_normal -l --name helmet_v1
\ No newline at end of file