diff --git a/configs/experiment/m2/feat_v6_no_pose.yaml b/configs/experiment/m2/OLD/feat_v6_no_pose.yaml similarity index 100% rename from configs/experiment/m2/feat_v6_no_pose.yaml rename to configs/experiment/m2/OLD/feat_v6_no_pose.yaml diff --git a/configs/experiment/m2/feat_v6_only_hands_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml similarity index 100% rename from configs/experiment/m2/feat_v6_only_hands_joints.yaml rename to configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml diff --git a/configs/experiment/m2/feat_v6_only_object_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml similarity index 100% rename from configs/experiment/m2/feat_v6_only_object_joints.yaml rename to configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml diff --git a/configs/experiment/m2/feat_v6.yaml b/configs/experiment/m2/feat_v6.yaml index d6f1e2f4b..6e1921b33 100644 --- a/configs/experiment/m2/feat_v6.yaml +++ b/configs/experiment/m2/feat_v6.yaml @@ -40,12 +40,12 @@ trainer: log_every_n_steps: 1 model: + num_classes: 9 # number of activity classification classes compile: false net: # Length of feature vector for a single frame. - # Currently derived from feature version and other hyperparameters. + # Currently derived from the parameterization of dataset vectorizer. 
dim: 297 - num_classes: 9 data: coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json" @@ -60,10 +60,12 @@ data: coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json" coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimates.coco.json" - batch_size: 512 + # batch_size: 512 + # batch_size: 8192 + batch_size: 16384 num_workers: 16 target_framerate: 15 # BBN Hololens2 Framerate - epoch_length: 20000 + epoch_length: 200000 train_dataset: window_size: 25 @@ -106,13 +108,12 @@ data: paths: # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/" - root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/training_root" + root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens" # Convenience variable to where your train/val/test split COCO file datasets # are stored. coco_file_root: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens" -#exp_name: "tcn_training_revive" #logger: # aim: # experiment: ${task_name} diff --git a/configs/experiment/r18/feat_v5.yaml b/configs/experiment/r18/OLD/feat_v5.yaml similarity index 100% rename from configs/experiment/r18/feat_v5.yaml rename to configs/experiment/r18/OLD/feat_v5.yaml diff --git a/configs/experiment/r18/OLD/feat_v6.yaml b/configs/experiment/r18/OLD/feat_v6.yaml new file mode 100644 index 000000000..7bd8aa05a --- /dev/null +++ b/configs/experiment/r18/OLD/feat_v6.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=example +topic: "medical" +task: "r18" +feature_version: 6 + +defaults: + - override /data: ptg + - override /model: ptg + - override /callbacks: default + - override /trainer: gpu + - override /paths: default + - override /logger: aim + +# all parameters below will be merged with parameters from default configurations set above + +tags: ["r18", "ms_tcn", "debug"] + +seed: 12345 + +trainer: + min_epochs: 50 + max_epochs: 500 
+ log_every_n_steps: 1 + + +model: + compile: false + + net: + dim: 297 # length of feature vector when top_k_objects=1 + #dim: 506 # length of feature vector when top_k_objects=2 + +# LIVE HZ +IMAGE_HZ: 30 # zed bags +#IMAGE_HZ: 15 # BBN hololens live +OBJECT_DET_HZ: 15 +POSE_HZ: 4 + +# GENERATE TRAINING DATA +data_gen: + top_k_objects: 1 + pose_repeat_rate: 7.5 # ${IMAGE_HZ} / ${POSE_HZ} + + data_type: "pro" + dataset_kwcoco: "/data/PTG/medical/training/yolo_object_detector/detect/r18_all/r18_all_all_obj_results_with_dets_and_pose.mscoco.json" + train_vid_ids: [1, 2, 4, 5, 6, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, + 22, 23, 24, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, + 42, 43, 44, 45, 46, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] + val_vid_ids: [3, 7, 10, 18, 27, 32, 41] + test_vid_ids: [50, 13, 47, 25] + names_black_gloves: [] + names_blue_gloves: [] + #data_type: "lab" + #dataset_kwcoco: "/data/PTG/medical/training/yolo_object_detector/detect/r18_all_bbn_lab_data/r18_all_bbn_lab_data_all_obj_results.mscoco.json" + #train_vid_ids: [1, 2, 3, 4, 5, 6, 7, 8] + #val_vid_ids: [9] + #test_vid_ids: [10] + #names_black_gloves: [] + #names_blue_gloves: [] + + filter_black_gloves: false + filter_blue_gloves: false + + activity_config_fn: "/home/local/KHQ/hannah.defazio/angel_system/config/activity_labels/medical/r18.yaml" + + # This matches the folder name created in the data generator + exp_ext: "_NEW_ORDER_fix_overlap_gt" # anything unique about this run that isn't already in ``exp_name`` + exp_name: "${task}_${data_gen.data_type}_data_top_${data_gen.top_k_objects}_objs_feat_v${feature_version}_pose_rate_${data_gen.pose_repeat_rate}${data_gen.exp_ext}" + +# TRAINING +data: + num_classes: 6 # activities: includes background + batch_size: 512 + num_workers: 12 + epoch_length: 20000 + window_size: 25 + sample_rate: 2 # ${IMAGE_HZ} / ${OBJECT_DET_HZ} + + # AUGMENTATIONS + all_transforms: + train_order: [] 
#["MoveCenterPts", "NormalizePixelPts"] + test_order: [] #["NormalizePixelPts"] + MoveCenterPts: + feat_version: ${feature_version} + num_obj_classes: 6 # not including background, includes hands + top_k_objects: ${data_gen.top_k_objects} + NormalizeFromCenter: + feat_version: ${feature_version} + num_obj_classes: 6 # not including background, includes hands + top_k_objects: ${data_gen.top_k_objects} + NormalizePixelPts: + feat_version: ${feature_version} + num_obj_classes: 6 # not including background, includes hands + top_k_objects: ${data_gen.top_k_objects} + +exp_name: ${data_gen.exp_name}_win_${data.window_size}_obj_sample_${data.sample_rate} + +paths: + data_dir: "/data/PTG/medical/training/activity_classifier/TCN_data/${task}/${data_gen.exp_name}" + root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/" + +logger: + aim: + experiment: ${exp_name} + capture_terminal_logs: true + +task_name: ${exp_name} \ No newline at end of file diff --git a/configs/experiment/r18/feat_v6.yaml b/configs/experiment/r18/feat_v6.yaml index 7bd8aa05a..3d736b73a 100644 --- a/configs/experiment/r18/feat_v6.yaml +++ b/configs/experiment/r18/feat_v6.yaml @@ -12,9 +12,20 @@ defaults: - override /callbacks: default - override /trainer: gpu - override /paths: default - - override /logger: aim + #- override /logger: aim + - override /logger: csv # all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +# Change this name to something descriptive and unique for this experiment. +# This will differentiate the run logs and output to be separate from other +# experiments that may have been run under the configured `paths.root_dir`. +# Setting this value influences: +# - the name of the directory under `${paths.root_dir}/logs/` in which training +# run files are stored. +# Default is "train" set in the "configs/train.yaml" file. 
+#task_name: tags: ["r18", "ms_tcn", "debug"] @@ -25,87 +36,81 @@ trainer: max_epochs: 500 log_every_n_steps: 1 - model: + num_classes: 6 # number of activity classification classes compile: false - net: - dim: 297 # length of feature vector when top_k_objects=1 - #dim: 506 # length of feature vector when top_k_objects=2 - -# LIVE HZ -IMAGE_HZ: 30 # zed bags -#IMAGE_HZ: 15 # BBN hololens live -OBJECT_DET_HZ: 15 -POSE_HZ: 4 - -# GENERATE TRAINING DATA -data_gen: - top_k_objects: 1 - pose_repeat_rate: 7.5 # ${IMAGE_HZ} / ${POSE_HZ} - - data_type: "pro" - dataset_kwcoco: "/data/PTG/medical/training/yolo_object_detector/detect/r18_all/r18_all_all_obj_results_with_dets_and_pose.mscoco.json" - train_vid_ids: [1, 2, 4, 5, 6, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, - 22, 23, 24, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, - 42, 43, 44, 45, 46, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] - val_vid_ids: [3, 7, 10, 18, 27, 32, 41] - test_vid_ids: [50, 13, 47, 25] - names_black_gloves: [] - names_blue_gloves: [] - #data_type: "lab" - #dataset_kwcoco: "/data/PTG/medical/training/yolo_object_detector/detect/r18_all_bbn_lab_data/r18_all_bbn_lab_data_all_obj_results.mscoco.json" - #train_vid_ids: [1, 2, 3, 4, 5, 6, 7, 8] - #val_vid_ids: [9] - #test_vid_ids: [10] - #names_black_gloves: [] - #names_blue_gloves: [] - - filter_black_gloves: false - filter_blue_gloves: false - - activity_config_fn: "/home/local/KHQ/hannah.defazio/angel_system/config/activity_labels/medical/r18.yaml" - - # This matches the folder name created in the data generator - exp_ext: "_NEW_ORDER_fix_overlap_gt" # anything unique about this run that isn't already in ``exp_name`` - exp_name: "${task}_${data_gen.data_type}_data_top_${data_gen.top_k_objects}_objs_feat_v${feature_version}_pose_rate_${data_gen.pose_repeat_rate}${data_gen.exp_ext}" + # Length of feature vector for a single frame. 
+ # Currently derived from the parameterization of dataset vectorizer. + dim: 297 # TRAINING data: - num_classes: 6 # activities: includes background - batch_size: 512 - num_workers: 12 - epoch_length: 20000 - window_size: 25 - sample_rate: 2 # ${IMAGE_HZ} / ${OBJECT_DET_HZ} - - # AUGMENTATIONS - all_transforms: - train_order: [] #["MoveCenterPts", "NormalizePixelPts"] - test_order: [] #["NormalizePixelPts"] - MoveCenterPts: - feat_version: ${feature_version} - num_obj_classes: 6 # not including background, includes hands - top_k_objects: ${data_gen.top_k_objects} - NormalizeFromCenter: - feat_version: ${feature_version} - num_obj_classes: 6 # not including background, includes hands - top_k_objects: ${data_gen.top_k_objects} - NormalizePixelPts: - feat_version: ${feature_version} - num_obj_classes: 6 # not including background, includes hands - top_k_objects: ${data_gen.top_k_objects} - -exp_name: ${data_gen.exp_name}_win_${data.window_size}_obj_sample_${data.sample_rate} + coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json" + coco_train_objects: "${paths.coco_file_root}/TRAIN-object_detections.coco.json" + coco_train_poses: "${paths.coco_file_root}/TRAIN-pose_estimations.coco.json" + + coco_validation_activities: "${paths.coco_file_root}/VALIDATION-activity_truth.coco.json" + coco_validation_objects: "${paths.coco_file_root}/VALIDATION-object_detections.coco.json" + coco_validation_poses: "${paths.coco_file_root}/VALIDATION-pose_estimations.coco.json" + + coco_test_activities: "${paths.coco_file_root}/TEST-activity_truth.coco.json" + coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json" + coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimations.coco.json" + + batch_size: 16384 + num_workers: 16 + target_framerate: 15 # BBN Hololens2 Framerate + epoch_length: 80000 + + train_dataset: + window_size: 25 + vectorizer: + _target_: tcn_hpl.data.vectorize.classic.Classic + feat_version: 6 + top_k: 1 + 
num_classes: 7 + background_idx: 0 + hand_left_idx: 5 + hand_right_idx: 6 + transform: + transforms: [] # no transforms +# - _target_: tcn_hpl.data.components.augmentations.MoveCenterPts +# hand_dist_delta: 0.05 +# obj_dist_delta: 0.05 +# joint_dist_delta: 0.025 +# im_w: 1280 +# im_h: 720 +# num_obj_classes: 42 +# feat_version: 2 +# top_k_objects: 1 +# - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts +# im_w: 1280 +# im_h: 720 +# num_obj_classes: 42 +# feat_version: 2 +# top_k_objects: 1 + val_dataset: + transform: + transforms: [] # no transforms +# - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts +# im_w: 1280 +# im_h: 720 +# num_obj_classes: 42 +# feat_version: 2 +# top_k_objects: 1 + # Test dataset usually configured the same as val, unless there is some + # different set of transforms that should be used during test/prediction. paths: - data_dir: "/data/PTG/medical/training/activity_classifier/TCN_data/${task}/${data_gen.exp_name}" - root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/" + # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/" + root_dir: "/data/paul.tunison/data/darpa-ptg/train-TCN-R18_bbn_hololens-yolo_v7-mmpose" -logger: - aim: - experiment: ${exp_name} - capture_terminal_logs: true + # Convenience variable to where your train/val/test split COCO file datasets + # are stored. 
+ coco_file_root: ${paths.root_dir} -task_name: ${exp_name} \ No newline at end of file +#logger: +# aim: +# experiment: ${task_name} +# capture_terminal_logs: true diff --git a/tcn_hpl/data/ptg_datamodule.py b/tcn_hpl/data/ptg_datamodule.py index 1a4ef0f6f..ad6ab4936 100644 --- a/tcn_hpl/data/ptg_datamodule.py +++ b/tcn_hpl/data/ptg_datamodule.py @@ -142,8 +142,7 @@ def __init__( # this line allows to access init params with 'self.hparams' attribute # also ensures init params will be stored in ckpt self.save_hyperparameters( - logger=False, - ignore=["train_dataset", "val_dataset", "test_dataset"] + logger=False, ignore=["train_dataset", "val_dataset", "test_dataset"] ) self.data_train: Optional[TCNDataset] = train_dataset diff --git a/tcn_hpl/data/tcn_dataset.py b/tcn_hpl/data/tcn_dataset.py index 5275f80b4..059a416c3 100644 --- a/tcn_hpl/data/tcn_dataset.py +++ b/tcn_hpl/data/tcn_dataset.py @@ -1,3 +1,4 @@ +import click import logging import os from hashlib import sha256 @@ -16,7 +17,7 @@ import kwcoco import numpy as np import numpy.typing as npt -import torch +import torch.multiprocessing from torch.utils.data import Dataset, DataLoader from tqdm import tqdm @@ -415,6 +416,8 @@ def load_data_offline( csum.update(f.read()) csum.update(f"{target_framerate:0.{framerate_round_decimals}f}".encode()) csum.update(f"{self.window_size:d}".encode()) + csum.update(self.vectorizer.__class__.__module__.encode()) + csum.update(self.vectorizer.__class__.__name__.encode()) csum.update(json.dumps(self.vectorizer.hparams()).encode()) # Include vectorization variables in the name of the file. # Note the "z" in the name, expecting to use savez_compressed. @@ -432,31 +435,42 @@ def load_data_offline( # Pre-vectorize data for iteration efficiency during training. # * Creating a mini Dataset/Dataloader situation to efficiently # generate vectors. 
- frame_vectors: List[npt.NDArray[np.float32]] = [] - vectorizer = self.vectorizer - - class VecDset(Dataset): - def __getitem__(self, item): - return vectorizer(frame_data[item]) - - def __len__(self): - return len(frame_data) - - # Using larger batch sizes than 1 did not show any particular - # increase in throughput. This may require increasing - # `ulimit -n`, though. - dloader = DataLoader( - VecDset(), - batch_size=1, - num_workers=pre_vectorize_cores, - ) - for batch in tqdm( - dloader, - desc="Frame data vectorized", - unit="frames", - ): - frame_vectors.extend(batch.numpy()) + # Set the sharing strategy to "file_system" for the duration of + # this operation, and then restore the existing strategy + # after we're done. + current_sharing_strategy = torch.multiprocessing.get_sharing_strategy() + + try: + # This iteration seems to go twice as fast when utilizing + # the file-system strategy. + torch.multiprocessing.set_sharing_strategy("file_system") + + vec_dset = _VectorizationDataset(self.vectorizer, frame_data) + + # Using larger batch sizes than 1 did not show any particular + # increase in throughput. This may require increasing + # `ulimit -n`, though. + dloader = DataLoader( + vec_dset, + batch_size=1, + num_workers=pre_vectorize_cores, + # Required, especially for large dataset sizes, so the + # dataloader multiprocessing iteration does not exhaust + # shared memory. + pin_memory=True, + ) + + frame_vectors: List[npt.NDArray[np.float32]] = [] + for batch in tqdm( + dloader, + desc="Frame data vectorized", + unit="frames", + ): + frame_vectors.extend(batch.numpy()) + finally: + torch.multiprocessing.set_sharing_strategy(current_sharing_strategy) + self._frame_vectors = np.asarray(frame_vectors) if cache_filepath is not None: @@ -564,73 +578,117 @@ def __len__(self): Returns: length: Length of the dataset. 
""" - return len(self._window_data_idx) + return len(self._window_data_idx) if self._window_data_idx is not None else 0 -if __name__ == "__main__": +class _VectorizationDataset(Dataset): + """ + Helper dataset for iterating over individual frames of data and producing + embedding vectors. + """ + + def __init__(self, vectorize: Vectorize, frame_data: Sequence[FrameData]): + self.vectorize = vectorize + self.frame_data = frame_data + + def __len__(self): + return len(self.frame_data) + + def __getitem__(self, item): + return self.vectorize(self.frame_data[item]) + + +@click.command() +@click.help_option("-h", "--help") +@click.argument("activity_coco", type=click.Path(path_type=Path)) +@click.argument("detections_coco", type=click.Path(path_type=Path)) +@click.argument("pose_coco", type=click.Path(path_type=Path)) +@click.option( + "--window-size", + type=int, + default=25, + show_default=True, +) +@click.option( + "--target-framerate", + type=float, + default=15, + show_default=True, +) +@click.option( + "--pre-vectorize", + is_flag=True, + help="Run pre-vectorization or not.", + show_default=True, +) +def test_dataset_for_input( + activity_coco: Path, + detections_coco: Path, + pose_coco: Path, + window_size: int, + target_framerate: float, + pre_vectorize: bool, +): + """ + Test the TCN Dataset iteration over some test data. 
+ """ logging.basicConfig(level=logging.INFO) - # Example usage: - activity_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/activity_truth.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-activity_truth.coco.json" - ) - dets_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_object_detections.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-object_detections.coco.json" - ) - pose_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_poses.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-pose_estimates.coco.json" - ) + activity_coco = kwcoco.CocoDataset(activity_coco) + dets_coco = kwcoco.CocoDataset(detections_coco) + pose_coco = kwcoco.CocoDataset(pose_coco) + # TODO: Some method of configuring which vectorizer to use. from tcn_hpl.data.vectorize.classic import Classic - vectorizer = Classic( feat_version=6, top_k=1, - # M2-specific object detection class indices + # M2/R18 object detection class indices num_classes=7, background_idx=0, hand_left_idx=5, hand_right_idx=6, ) - dataset = TCNDataset(window_size=25, vectorizer=vectorizer) + + dataset = TCNDataset(window_size=window_size, vectorizer=vectorizer) dataset.load_data_offline( activity_coco, dets_coco, pose_coco, - target_framerate=15, - cache_dir="./test_cache", + target_framerate=target_framerate, + pre_vectorize=pre_vectorize, ) - print(f"dataset: {len(dataset)}") + logger.info(f"Number of windows: {len(dataset)}") + + # Get vector dimensionality + window_vecs = dataset[0] + logger.info(f"Feature vector dims: {window_vecs[0].shape[1]}") + + # Test that we can iterate over the dataset using a DataLoader with + # shuffling. 
batch_size = 512 # 16 - data_loader = torch.utils.data.DataLoader( + data_loader = DataLoader( dataset, batch_size=batch_size, shuffle=True, - num_workers=16, + num_workers=os.cpu_count(), pin_memory=True, ) - count = 0 s = time.time() - for idx, batch in tqdm( - enumerate(data_loader), + for batch in tqdm( + data_loader, desc="Iterating batches of features", unit="batches", ): count += 1 duration = time.time() - s - - print( - f"Total batches of size {batch_size}: {count} ({duration:.02f} seconds total)" - ) + logger.info(f"Iterated over the full TCN Dataset in {duration:.2f} s.") # Test creating online mode with subset of data from above. - dset_online = TCNDataset(window_size=25, vectorizer=vectorizer) - dset_online.load_data_online(dataset._frame_data[:25]) # noqa + dset_online = TCNDataset(window_size=window_size, vectorizer=vectorizer) + dset_online.load_data_online(dataset._frame_data[:window_size]) # noqa assert len(dset_online) == 1, "Online dataset should be size 1" _ = dset_online[0] failed_index_error = True @@ -643,3 +701,7 @@ def __len__(self): assert ( (dataset[0][0] == dset_online[0][0]).all() # noqa ), "Online should have produced same window matrix as offline version." + + +if __name__ == "__main__": + test_dataset_for_input()