From bf271499d9a8bfa7a132d6c51744c7b7555e0af1 Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 13:34:38 -0400
Subject: [PATCH 1/4] Move old versions of experiment configs into an OLD directory

---
 configs/experiment/m2/{ => OLD}/feat_v6_no_pose.yaml            | 0
 configs/experiment/m2/{ => OLD}/feat_v6_only_hands_joints.yaml  | 0
 configs/experiment/m2/{ => OLD}/feat_v6_only_object_joints.yaml | 0
 configs/experiment/r18/{ => OLD}/feat_v5.yaml                   | 0
 configs/experiment/r18/{ => OLD}/feat_v6.yaml                   | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename configs/experiment/m2/{ => OLD}/feat_v6_no_pose.yaml (100%)
 rename configs/experiment/m2/{ => OLD}/feat_v6_only_hands_joints.yaml (100%)
 rename configs/experiment/m2/{ => OLD}/feat_v6_only_object_joints.yaml (100%)
 rename configs/experiment/r18/{ => OLD}/feat_v5.yaml (100%)
 rename configs/experiment/r18/{ => OLD}/feat_v6.yaml (100%)

diff --git a/configs/experiment/m2/feat_v6_no_pose.yaml b/configs/experiment/m2/OLD/feat_v6_no_pose.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_no_pose.yaml
rename to configs/experiment/m2/OLD/feat_v6_no_pose.yaml
diff --git a/configs/experiment/m2/feat_v6_only_hands_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_only_hands_joints.yaml
rename to configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml
diff --git a/configs/experiment/m2/feat_v6_only_object_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_only_object_joints.yaml
rename to configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml
diff --git a/configs/experiment/r18/feat_v5.yaml b/configs/experiment/r18/OLD/feat_v5.yaml
similarity index 100%
rename from configs/experiment/r18/feat_v5.yaml
rename to configs/experiment/r18/OLD/feat_v5.yaml
diff --git a/configs/experiment/r18/feat_v6.yaml b/configs/experiment/r18/OLD/feat_v6.yaml
similarity index 100%
rename from configs/experiment/r18/feat_v6.yaml
rename to configs/experiment/r18/OLD/feat_v6.yaml

From 43cf5877cc09d05cceb3e453e0955cf5b6eaa9db Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 14:17:27 -0400
Subject: [PATCH 2/4] Minor black formatting

---
 tcn_hpl/data/ptg_datamodule.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tcn_hpl/data/ptg_datamodule.py b/tcn_hpl/data/ptg_datamodule.py
index 1a4ef0f6f..ad6ab4936 100644
--- a/tcn_hpl/data/ptg_datamodule.py
+++ b/tcn_hpl/data/ptg_datamodule.py
@@ -142,8 +142,7 @@ def __init__(
         # this line allows to access init params with 'self.hparams' attribute
         # also ensures init params will be stored in ckpt
         self.save_hyperparameters(
-            logger=False,
-            ignore=["train_dataset", "val_dataset", "test_dataset"]
+            logger=False, ignore=["train_dataset", "val_dataset", "test_dataset"]
         )
 
         self.data_train: Optional[TCNDataset] = train_dataset

From 1f8147f27da026c0be74c9ffdfc6c187c5c994db Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 17:20:53 -0400
Subject: [PATCH 3/4] Update and add experiment configs

---
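Note: the "vectorizer" blocks in these configs are Hydra-instantiated via
their "_target_" keys. As a minimal sketch of what that amounts to at
runtime (assuming Hydra/OmegaConf as implied by the "python train.py
experiment=..." usage; the values mirror the new r18 config below):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # Equivalent of the data.train_dataset.vectorizer node in the config.
    vec_cfg = OmegaConf.create(
        {
            "_target_": "tcn_hpl.data.vectorize.classic.Classic",
            "feat_version": 6,
            "top_k": 1,
            "num_classes": 7,
            "background_idx": 0,
            "hand_left_idx": 5,
            "hand_right_idx": 6,
        }
    )
    vectorizer = instantiate(vec_cfg)  # behaves like Classic(feat_version=6, ...)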
 configs/experiment/m2/feat_v6.yaml  |  13 ++--
 configs/experiment/r18/feat_v6.yaml | 116 ++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 6 deletions(-)
 create mode 100644 configs/experiment/r18/feat_v6.yaml

diff --git a/configs/experiment/m2/feat_v6.yaml b/configs/experiment/m2/feat_v6.yaml
index d6f1e2f4b..6e1921b33 100644
--- a/configs/experiment/m2/feat_v6.yaml
+++ b/configs/experiment/m2/feat_v6.yaml
@@ -40,12 +40,12 @@ trainer:
   log_every_n_steps: 1
 
 model:
+  num_classes: 9  # number of activity classification classes
   compile: false
   net:
     # Length of feature vector for a single frame.
-    # Currently derived from feature version and other hyperparameters.
+    # Currently derived from the parameterization of the dataset vectorizer.
    dim: 297
-  num_classes: 9
 
 data:
   coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json"
@@ -60,10 +60,12 @@ data:
   coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json"
   coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimates.coco.json"
 
-  batch_size: 512
+  # batch_size: 512
+  # batch_size: 8192
+  batch_size: 16384
   num_workers: 16
   target_framerate: 15  # BBN Hololens2 Framerate
-  epoch_length: 20000
+  epoch_length: 200000
 
   train_dataset:
     window_size: 25
@@ -106,13 +108,12 @@ data:
 
 paths:
   # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/"
-  root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/training_root"
+  root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens"
 
   # Convenience variable to where your train/val/test split COCO file datasets
   # are stored.
   coco_file_root: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens"
 
-#exp_name: "tcn_training_revive"
 #logger:
 #  aim:
 #    experiment: ${task_name}

diff --git a/configs/experiment/r18/feat_v6.yaml b/configs/experiment/r18/feat_v6.yaml
new file mode 100644
index 000000000..3d736b73a
--- /dev/null
+++ b/configs/experiment/r18/feat_v6.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# to execute this experiment run:
+# python train.py experiment=r18/feat_v6
+topic: "medical"
+task: "r18"
+feature_version: 6
+
+defaults:
+  - override /data: ptg
+  - override /model: ptg
+  - override /callbacks: default
+  - override /trainer: gpu
+  - override /paths: default
+  #- override /logger: aim
+  - override /logger: csv
+
+# all parameters below will be merged with parameters from default configurations set above
+# this allows you to overwrite only specified parameters
+
+# Change this name to something descriptive and unique for this experiment.
+# This will differentiate the run logs and output to be separate from other
+# experiments that may have been run under the configured root directory.
+# Setting this value influences:
+#   - the name of the directory under `${paths.root_dir}/logs/` in which
+#     training run files are stored.
+# Default is "train" set in the "configs/train.yaml" file.
+#task_name:
+
+tags: ["r18", "ms_tcn", "debug"]
+
+seed: 12345
+
+trainer:
+  min_epochs: 50
+  max_epochs: 500
+  log_every_n_steps: 1
+
+model:
+  num_classes: 6  # number of activity classification classes
+  compile: false
+  net:
+    # Length of feature vector for a single frame.
+    # Currently derived from the parameterization of the dataset vectorizer.
+    dim: 297
+
+# TRAINING
+data:
+  coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json"
+  coco_train_objects: "${paths.coco_file_root}/TRAIN-object_detections.coco.json"
+  coco_train_poses: "${paths.coco_file_root}/TRAIN-pose_estimations.coco.json"
+
+  coco_validation_activities: "${paths.coco_file_root}/VALIDATION-activity_truth.coco.json"
+  coco_validation_objects: "${paths.coco_file_root}/VALIDATION-object_detections.coco.json"
+  coco_validation_poses: "${paths.coco_file_root}/VALIDATION-pose_estimations.coco.json"
+
+  coco_test_activities: "${paths.coco_file_root}/TEST-activity_truth.coco.json"
+  coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json"
+  coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimations.coco.json"
+
+  batch_size: 16384
+  num_workers: 16
+  target_framerate: 15  # BBN Hololens2 Framerate
+  epoch_length: 80000
+
+  train_dataset:
+    window_size: 25
+    vectorizer:
+      _target_: tcn_hpl.data.vectorize.classic.Classic
+      feat_version: 6
+      top_k: 1
+      num_classes: 7
+      background_idx: 0
+      hand_left_idx: 5
+      hand_right_idx: 6
+    transform:
+      transforms: []  # no transforms
+#        - _target_: tcn_hpl.data.components.augmentations.MoveCenterPts
+#          hand_dist_delta: 0.05
+#          obj_dist_delta: 0.05
+#          joint_dist_delta: 0.025
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+#        - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+  val_dataset:
+    transform:
+      transforms: []  # no transforms
+#        - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+  # Test dataset usually configured the same as val, unless there is some
+  # different set of transforms that should be used during test/prediction.
+
+paths:
+  # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/"
+  root_dir: "/data/paul.tunison/data/darpa-ptg/train-TCN-R18_bbn_hololens-yolo_v7-mmpose"
+
+  # Convenience variable to where your train/val/test split COCO file datasets
+  # are stored.
+  coco_file_root: ${paths.root_dir}
+
+#logger:
+#  aim:
+#    experiment: ${task_name}
+#    capture_terminal_logs: true
From dc111a66330d3872a1589034c1c1f81b5329f82b Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 18:01:03 -0400
Subject: [PATCH 4/4] Fix a couple things with TCN dataset

* Fix pre-vectorization dataloader iteration. With large dataset sizes,
  memory errors could occur; using pinned memory in the dataloader seems
  to fix this.
* Make use of the file-system sharing strategy for internal
  pre-vectorization, restoring the previous sharing strategy afterwards.
* Fix cache checksumming to include the module and class name of the
  vectorizer implementation being utilized.
* Move the internally used pre-vectorization dataset out to a
  module-level class definition instead of a closure.
* Update the "main" function to be parameterized and to print some
  useful stats for configuring a training run (number of windows,
  dimension of the embedding vector).
---
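Note on the checksum fix: folding the vectorizer's module and class name
into the cache checksum keeps two vectorizer implementations that happen
to share identical hyperparameters from colliding on the same cache
entry. A minimal sketch of the idea (vectorizer_cache_key is a
hypothetical helper; the patch folds these same updates into the
dataset's existing checksum):

    import json
    from hashlib import sha256

    def vectorizer_cache_key(vectorizer) -> str:
        csum = sha256()
        # Identity of the implementation, not just its parameters.
        csum.update(vectorizer.__class__.__module__.encode())
        csum.update(vectorizer.__class__.__name__.encode())
        # Hyperparameters, as were already included before this patch.
        csum.update(json.dumps(vectorizer.hparams()).encode())
        return csum.hexdigest()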
 tcn_hpl/data/tcn_dataset.py | 176 ++++++++++++++++++++++++------------
 1 file changed, 119 insertions(+), 57 deletions(-)

diff --git a/tcn_hpl/data/tcn_dataset.py b/tcn_hpl/data/tcn_dataset.py
index 5275f80b4..059a416c3 100644
--- a/tcn_hpl/data/tcn_dataset.py
+++ b/tcn_hpl/data/tcn_dataset.py
@@ -1,3 +1,4 @@
+import click
 import logging
 import os
 from hashlib import sha256
@@ -16,7 +17,7 @@
 import kwcoco
 import numpy as np
 import numpy.typing as npt
-import torch
+import torch.multiprocessing
 from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm
 
@@ -415,6 +416,8 @@ def load_data_offline(
                 csum.update(f.read())
         csum.update(f"{target_framerate:0.{framerate_round_decimals}f}".encode())
         csum.update(f"{self.window_size:d}".encode())
+        csum.update(self.vectorizer.__class__.__module__.encode())
+        csum.update(self.vectorizer.__class__.__name__.encode())
        csum.update(json.dumps(self.vectorizer.hparams()).encode())
         # Include vectorization variables in the name of the file.
         # Note the "z" in the name, expecting to use savez_compressed.
@@ -432,31 +435,42 @@
             # Pre-vectorize data for iteration efficiency during training.
             # * Creating a mini Dataset/Dataloader situation to efficiently
             #   generate vectors.
-            frame_vectors: List[npt.NDArray[np.float32]] = []
-            vectorizer = self.vectorizer
-
-            class VecDset(Dataset):
-                def __getitem__(self, item):
-                    return vectorizer(frame_data[item])
-
-                def __len__(self):
-                    return len(frame_data)
-
-            # Using larger batch sizes than 1 did not show any particular
-            # increase in throughput. This may require increasing
-            # `ulimit -n`, though.
-            dloader = DataLoader(
-                VecDset(),
-                batch_size=1,
-                num_workers=pre_vectorize_cores,
-            )
-            for batch in tqdm(
-                dloader,
-                desc="Frame data vectorized",
-                unit="frames",
-            ):
-                frame_vectors.extend(batch.numpy())
+            # Set the sharing strategy to filesystem for the duration of
+            # this operation, and then restore the existing strategy
+            # after we're done.
+            current_sharing_strategy = torch.multiprocessing.get_sharing_strategy()
+
+            try:
+                # This iteration seems to go twice as fast when utilizing
+                # the file-system strategy.
+                torch.multiprocessing.set_sharing_strategy("file_system")
+
+                vec_dset = _VectorizationDataset(self.vectorizer, frame_data)
+
+                # Using larger batch sizes than 1 did not show any particular
+                # increase in throughput. This may require increasing
+                # `ulimit -n`, though.
+                dloader = DataLoader(
+                    vec_dset,
+                    batch_size=1,
+                    num_workers=pre_vectorize_cores,
+                    # Required, especially for large dataset sizes, so the
+                    # dataloader multiprocessing iteration does not exhaust
+                    # shared memory.
+                    pin_memory=True,
+                )
+
+                frame_vectors: List[npt.NDArray[np.float32]] = []
+                for batch in tqdm(
+                    dloader,
+                    desc="Frame data vectorized",
+                    unit="frames",
+                ):
+                    frame_vectors.extend(batch.numpy())
+            finally:
+                torch.multiprocessing.set_sharing_strategy(current_sharing_strategy)
+
             self._frame_vectors = np.asarray(frame_vectors)
 
             if cache_filepath is not None:
@@ -564,73 +578,117 @@ def __len__(self):
         Returns:
             length: Length of the dataset.
         """
-        return len(self._window_data_idx)
+        return len(self._window_data_idx) if self._window_data_idx is not None else 0
 
 
-if __name__ == "__main__":
+class _VectorizationDataset(Dataset):
+    """
+    Helper dataset for iterating over individual frames of data and producing
+    embedding vectors.
+ """ + + def __init__(self, vectorize: Vectorize, frame_data: Sequence[FrameData]): + self.vectorize = vectorize + self.frame_data = frame_data + + def __len__(self): + return len(self.frame_data) + + def __getitem__(self, item): + return self.vectorize(self.frame_data[item]) + + +@click.command() +@click.help_option("-h", "--help") +@click.argument("activity_coco", type=click.Path(path_type=Path)) +@click.argument("detections_coco", type=click.Path(path_type=Path)) +@click.argument("pose_coco", type=click.Path(path_type=Path)) +@click.option( + "--window-size", + type=int, + default=25, + show_default=True, +) +@click.option( + "--target-framerate", + type=float, + default=15, + show_default=True, +) +@click.option( + "--pre-vectorize", + is_flag=True, + help="Run pre-vectorization or not.", + show_default=True, +) +def test_dataset_for_input( + activity_coco: Path, + detections_coco: Path, + pose_coco: Path, + window_size: int, + target_framerate: float, + pre_vectorize: bool, +): + """ + Test the TCN Dataset iteration over some test data. + """ logging.basicConfig(level=logging.INFO) - # Example usage: - activity_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/activity_truth.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-activity_truth.coco.json" - ) - dets_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_object_detections.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-object_detections.coco.json" - ) - pose_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_poses.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-pose_estimates.coco.json" - ) + activity_coco = kwcoco.CocoDataset(activity_coco) + dets_coco = kwcoco.CocoDataset(detections_coco) + pose_coco = kwcoco.CocoDataset(pose_coco) + # TODO: Some method of configuring which vectorizer to use. from tcn_hpl.data.vectorize.classic import Classic - vectorizer = Classic( feat_version=6, top_k=1, - # M2-specific object detection class indices + # M2/R18 object detection class indices num_classes=7, background_idx=0, hand_left_idx=5, hand_right_idx=6, ) - dataset = TCNDataset(window_size=25, vectorizer=vectorizer) + + dataset = TCNDataset(window_size=window_size, vectorizer=vectorizer) dataset.load_data_offline( activity_coco, dets_coco, pose_coco, - target_framerate=15, - cache_dir="./test_cache", + target_framerate=target_framerate, + pre_vectorize=pre_vectorize, ) - print(f"dataset: {len(dataset)}") + logger.info(f"Number of windows: {len(dataset)}") + + # Get vector dimensionality + window_vecs = dataset[0] + logger.info(f"Feature vector dims: {window_vecs[0].shape[1]}") + + # Test that we can iterate over the dataset using a DataLoader with + # shuffling. 
     batch_size = 512  # 16
-    data_loader = torch.utils.data.DataLoader(
+    data_loader = DataLoader(
         dataset,
         batch_size=batch_size,
         shuffle=True,
-        num_workers=16,
+        num_workers=os.cpu_count(),
         pin_memory=True,
     )
-
     count = 0
     s = time.time()
-    for idx, batch in tqdm(
-        enumerate(data_loader),
+    for batch in tqdm(
+        data_loader,
         desc="Iterating batches of features",
         unit="batches",
     ):
         count += 1
     duration = time.time() - s
-
-    print(
-        f"Total batches of size {batch_size}: {count} ({duration:.02f} seconds total)"
-    )
+    logger.info(f"Iterated over the full TCN Dataset in {duration:.2f} s.")
 
     # Test creating online mode with subset of data from above.
-    dset_online = TCNDataset(window_size=25, vectorizer=vectorizer)
-    dset_online.load_data_online(dataset._frame_data[:25])  # noqa
+    dset_online = TCNDataset(window_size=window_size, vectorizer=vectorizer)
+    dset_online.load_data_online(dataset._frame_data[:window_size])  # noqa
     assert len(dset_online) == 1, "Online dataset should be size 1"
     _ = dset_online[0]
     failed_index_error = True
@@ -643,3 +701,7 @@
     assert (
         (dataset[0][0] == dset_online[0][0]).all()  # noqa
     ), "Online should have produced same window matrix as offline version."
+
+
+if __name__ == "__main__":
+    test_dataset_for_input()
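Note: the new click entry point can be exercised directly against a
train/val/test split, e.g. (a sketch; the COCO file paths are
placeholders for your own split files, and the script is assumed to be
invoked from the repository root):

    python tcn_hpl/data/tcn_dataset.py \
        TEST-activity_truth.coco.json \
        TEST-object_detections.coco.json \
        TEST-pose_estimates.coco.json \
        --window-size 25 --target-framerate 15 --pre-vectorize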