From bf271499d9a8bfa7a132d6c51744c7b7555e0af1 Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 13:34:38 -0400
Subject: [PATCH 1/4] Move old versions of experiment configs into an OLD directory

---
 configs/experiment/m2/{ => OLD}/feat_v6_no_pose.yaml            | 0
 configs/experiment/m2/{ => OLD}/feat_v6_only_hands_joints.yaml  | 0
 configs/experiment/m2/{ => OLD}/feat_v6_only_object_joints.yaml | 0
 configs/experiment/r18/{ => OLD}/feat_v5.yaml                   | 0
 configs/experiment/r18/{ => OLD}/feat_v6.yaml                   | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename configs/experiment/m2/{ => OLD}/feat_v6_no_pose.yaml (100%)
 rename configs/experiment/m2/{ => OLD}/feat_v6_only_hands_joints.yaml (100%)
 rename configs/experiment/m2/{ => OLD}/feat_v6_only_object_joints.yaml (100%)
 rename configs/experiment/r18/{ => OLD}/feat_v5.yaml (100%)
 rename configs/experiment/r18/{ => OLD}/feat_v6.yaml (100%)

diff --git a/configs/experiment/m2/feat_v6_no_pose.yaml b/configs/experiment/m2/OLD/feat_v6_no_pose.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_no_pose.yaml
rename to configs/experiment/m2/OLD/feat_v6_no_pose.yaml
diff --git a/configs/experiment/m2/feat_v6_only_hands_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_only_hands_joints.yaml
rename to configs/experiment/m2/OLD/feat_v6_only_hands_joints.yaml
diff --git a/configs/experiment/m2/feat_v6_only_object_joints.yaml b/configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml
similarity index 100%
rename from configs/experiment/m2/feat_v6_only_object_joints.yaml
rename to configs/experiment/m2/OLD/feat_v6_only_object_joints.yaml
diff --git a/configs/experiment/r18/feat_v5.yaml b/configs/experiment/r18/OLD/feat_v5.yaml
similarity index 100%
rename from configs/experiment/r18/feat_v5.yaml
rename to configs/experiment/r18/OLD/feat_v5.yaml
diff --git a/configs/experiment/r18/feat_v6.yaml b/configs/experiment/r18/OLD/feat_v6.yaml
similarity index 100%
rename from configs/experiment/r18/feat_v6.yaml
rename to configs/experiment/r18/OLD/feat_v6.yaml

From 43cf5877cc09d05cceb3e453e0955cf5b6eaa9db Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 14:17:27 -0400
Subject: [PATCH 2/4] Minor black formatting

---
 tcn_hpl/data/ptg_datamodule.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tcn_hpl/data/ptg_datamodule.py b/tcn_hpl/data/ptg_datamodule.py
index 1a4ef0f6f..ad6ab4936 100644
--- a/tcn_hpl/data/ptg_datamodule.py
+++ b/tcn_hpl/data/ptg_datamodule.py
@@ -142,8 +142,7 @@ def __init__(
         # this line allows to access init params with 'self.hparams' attribute
         # also ensures init params will be stored in ckpt
         self.save_hyperparameters(
-            logger=False,
-            ignore=["train_dataset", "val_dataset", "test_dataset"]
+            logger=False, ignore=["train_dataset", "val_dataset", "test_dataset"]
         )
 
         self.data_train: Optional[TCNDataset] = train_dataset

From 1f8147f27da026c0be74c9ffdfc6c187c5c994db Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 17:20:53 -0400
Subject: [PATCH 3/4] Update and add experiment configs

---
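Note: the "vectorizer" blocks in these configs are Hydra-instantiated via
their "_target_" keys. As a minimal sketch of what that amounts to at
runtime (assuming Hydra/OmegaConf as implied by the "python train.py
experiment=..." usage; the values mirror the new r18 config below):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # Equivalent of the data.train_dataset.vectorizer node in the config.
    vec_cfg = OmegaConf.create(
        {
            "_target_": "tcn_hpl.data.vectorize.classic.Classic",
            "feat_version": 6,
            "top_k": 1,
            "num_classes": 7,
            "background_idx": 0,
            "hand_left_idx": 5,
            "hand_right_idx": 6,
        }
    )
    vectorizer = instantiate(vec_cfg)  # behaves like Classic(feat_version=6, ...)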
 configs/experiment/m2/feat_v6.yaml  |  13 ++--
 configs/experiment/r18/feat_v6.yaml | 116 ++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 6 deletions(-)
 create mode 100644 configs/experiment/r18/feat_v6.yaml

diff --git a/configs/experiment/m2/feat_v6.yaml b/configs/experiment/m2/feat_v6.yaml
index d6f1e2f4b..6e1921b33 100644
--- a/configs/experiment/m2/feat_v6.yaml
+++ b/configs/experiment/m2/feat_v6.yaml
@@ -40,12 +40,12 @@ trainer:
   log_every_n_steps: 1
 
 model:
+  num_classes: 9  # number of activity classification classes
   compile: false
   net:
     # Length of feature vector for a single frame.
-    # Currently derived from feature version and other hyperparameters.
+    # Currently derived from the parameterization of the dataset vectorizer.
    dim: 297
-  num_classes: 9
 
 data:
   coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json"
@@ -60,10 +60,12 @@ data:
   coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json"
   coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimates.coco.json"
 
-  batch_size: 512
+  # batch_size: 512
+  # batch_size: 8192
+  batch_size: 16384
   num_workers: 16
   target_framerate: 15  # BBN Hololens2 Framerate
-  epoch_length: 20000
+  epoch_length: 200000
 
   train_dataset:
     window_size: 25
@@ -106,13 +108,12 @@ data:
 
 paths:
   # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/"
-  root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/training_root"
+  root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens"
 
   # Convenience variable to where your train/val/test split COCO file datasets
   # are stored.
   coco_file_root: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens"
 
-#exp_name: "tcn_training_revive"
 #logger:
 #  aim:
 #    experiment: ${task_name}

diff --git a/configs/experiment/r18/feat_v6.yaml b/configs/experiment/r18/feat_v6.yaml
new file mode 100644
index 000000000..3d736b73a
--- /dev/null
+++ b/configs/experiment/r18/feat_v6.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# to execute this experiment run:
+# python train.py experiment=r18/feat_v6
+topic: "medical"
+task: "r18"
+feature_version: 6
+
+defaults:
+  - override /data: ptg
+  - override /model: ptg
+  - override /callbacks: default
+  - override /trainer: gpu
+  - override /paths: default
+  #- override /logger: aim
+  - override /logger: csv
+
+# all parameters below will be merged with parameters from default configurations set above
+# this allows you to overwrite only specified parameters
+
+# Change this name to something descriptive and unique for this experiment.
+# This will differentiate the run logs and output to be separate from other
+# experiments that may have been run under the configured root directory.
+# Setting this value influences:
+#   - the name of the directory under `${paths.root_dir}/logs/` in which
+#     training run files are stored.
+# Default is "train" set in the "configs/train.yaml" file.
+#task_name:
+
+tags: ["r18", "ms_tcn", "debug"]
+
+seed: 12345
+
+trainer:
+  min_epochs: 50
+  max_epochs: 500
+  log_every_n_steps: 1
+
+model:
+  num_classes: 6  # number of activity classification classes
+  compile: false
+  net:
+    # Length of feature vector for a single frame.
+    # Currently derived from the parameterization of the dataset vectorizer.
+    dim: 297
+
+# TRAINING
+data:
+  coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json"
+  coco_train_objects: "${paths.coco_file_root}/TRAIN-object_detections.coco.json"
+  coco_train_poses: "${paths.coco_file_root}/TRAIN-pose_estimations.coco.json"
+
+  coco_validation_activities: "${paths.coco_file_root}/VALIDATION-activity_truth.coco.json"
+  coco_validation_objects: "${paths.coco_file_root}/VALIDATION-object_detections.coco.json"
+  coco_validation_poses: "${paths.coco_file_root}/VALIDATION-pose_estimations.coco.json"
+
+  coco_test_activities: "${paths.coco_file_root}/TEST-activity_truth.coco.json"
+  coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json"
+  coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimations.coco.json"
+
+  batch_size: 16384
+  num_workers: 16
+  target_framerate: 15  # BBN Hololens2 Framerate
+  epoch_length: 80000
+
+  train_dataset:
+    window_size: 25
+    vectorizer:
+      _target_: tcn_hpl.data.vectorize.classic.Classic
+      feat_version: 6
+      top_k: 1
+      num_classes: 7
+      background_idx: 0
+      hand_left_idx: 5
+      hand_right_idx: 6
+    transform:
+      transforms: []  # no transforms
+#        - _target_: tcn_hpl.data.components.augmentations.MoveCenterPts
+#          hand_dist_delta: 0.05
+#          obj_dist_delta: 0.05
+#          joint_dist_delta: 0.025
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+#        - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+  val_dataset:
+    transform:
+      transforms: []  # no transforms
+#        - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
+#          im_w: 1280
+#          im_h: 720
+#          num_obj_classes: 42
+#          feat_version: 2
+#          top_k_objects: 1
+  # Test dataset usually configured the same as val, unless there is some
+  # different set of transforms that should be used during test/prediction.
+
+paths:
+  # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/"
+  root_dir: "/data/paul.tunison/data/darpa-ptg/train-TCN-R18_bbn_hololens-yolo_v7-mmpose"
+
+  # Convenience variable to where your train/val/test split COCO file datasets
+  # are stored.
+  coco_file_root: ${paths.root_dir}
+
+#logger:
+#  aim:
+#    experiment: ${task_name}
+#    capture_terminal_logs: true
From dc111a66330d3872a1589034c1c1f81b5329f82b Mon Sep 17 00:00:00 2001
From: Paul Tunison
Date: Thu, 31 Oct 2024 18:01:03 -0400
Subject: [PATCH 4/4] Fix a couple things with TCN dataset

* Fix pre-vectorization dataloader iteration. With large dataset sizes,
  memory errors could occur; using pinned memory in the dataloader seems
  to fix this.
* Make use of the file-system sharing strategy for internal
  pre-vectorization, restoring the previous sharing strategy afterwards.
* Fix cache checksumming to include the module and class name of the
  vectorizer implementation being utilized.
* Move the internally used pre-vectorization dataset out to a
  module-level class definition instead of a closure.
* Update the "main" function to be parameterized and to print some
  useful stats for configuring a training run (number of windows,
  dimension of the embedding vector).
---
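Note on the checksum fix: folding the vectorizer's module and class name
into the cache checksum keeps two vectorizer implementations that happen
to share identical hyperparameters from colliding on the same cache
entry. A minimal sketch of the idea (vectorizer_cache_key is a
hypothetical helper; the patch folds these same updates into the
dataset's existing checksum):

    import json
    from hashlib import sha256

    def vectorizer_cache_key(vectorizer) -> str:
        csum = sha256()
        # Identity of the implementation, not just its parameters.
        csum.update(vectorizer.__class__.__module__.encode())
        csum.update(vectorizer.__class__.__name__.encode())
        # Hyperparameters, as were already included before this patch.
        csum.update(json.dumps(vectorizer.hparams()).encode())
        return csum.hexdigest()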
 tcn_hpl/data/tcn_dataset.py | 176 ++++++++++++++++++++++++------------
 1 file changed, 119 insertions(+), 57 deletions(-)

diff --git a/tcn_hpl/data/tcn_dataset.py b/tcn_hpl/data/tcn_dataset.py
index 5275f80b4..059a416c3 100644
--- a/tcn_hpl/data/tcn_dataset.py
+++ b/tcn_hpl/data/tcn_dataset.py
@@ -1,3 +1,4 @@
+import click
 import logging
 import os
 from hashlib import sha256
@@ -16,7 +17,7 @@
 import kwcoco
 import numpy as np
 import numpy.typing as npt
-import torch
+import torch.multiprocessing
 from torch.utils.data import Dataset, DataLoader
 from tqdm import tqdm
 
@@ -415,6 +416,8 @@ def load_data_offline(
                 csum.update(f.read())
         csum.update(f"{target_framerate:0.{framerate_round_decimals}f}".encode())
         csum.update(f"{self.window_size:d}".encode())
+        csum.update(self.vectorizer.__class__.__module__.encode())
+        csum.update(self.vectorizer.__class__.__name__.encode())
        csum.update(json.dumps(self.vectorizer.hparams()).encode())
         # Include vectorization variables in the name of the file.
         # Note the "z" in the name, expecting to use savez_compressed.
@@ -432,31 +435,42 @@
             # Pre-vectorize data for iteration efficiency during training.
             # * Creating a mini Dataset/Dataloader situation to efficiently
             #   generate vectors.
-            frame_vectors: List[npt.NDArray[np.float32]] = []
-            vectorizer = self.vectorizer
-
-            class VecDset(Dataset):
-                def __getitem__(self, item):
-                    return vectorizer(frame_data[item])
-
-                def __len__(self):
-                    return len(frame_data)
-
-            # Using larger batch sizes than 1 did not show any particular
-            # increase in throughput. This may require increasing
-            # `ulimit -n`, though.
-            dloader = DataLoader(
-                VecDset(),
-                batch_size=1,
-                num_workers=pre_vectorize_cores,
-            )
-            for batch in tqdm(
-                dloader,
-                desc="Frame data vectorized",
-                unit="frames",
-            ):
-                frame_vectors.extend(batch.numpy())
+            # Set the sharing strategy to filesystem for the duration of
+            # this operation, and then restore the existing strategy
+            # after we're done.
+            current_sharing_strategy = torch.multiprocessing.get_sharing_strategy()
+
+            try:
+                # This iteration seems to go twice as fast when utilizing
+                # the file-system strategy.
+                torch.multiprocessing.set_sharing_strategy("file_system")
+
+                vec_dset = _VectorizationDataset(self.vectorizer, frame_data)
+
+                # Using larger batch sizes than 1 did not show any particular
+                # increase in throughput. This may require increasing
+                # `ulimit -n`, though.
+                dloader = DataLoader(
+                    vec_dset,
+                    batch_size=1,
+                    num_workers=pre_vectorize_cores,
+                    # Required, especially for large dataset sizes, so the
+                    # dataloader multiprocessing iteration does not exhaust
+                    # shared memory.
+                    pin_memory=True,
+                )
+
+                frame_vectors: List[npt.NDArray[np.float32]] = []
+                for batch in tqdm(
+                    dloader,
+                    desc="Frame data vectorized",
+                    unit="frames",
+                ):
+                    frame_vectors.extend(batch.numpy())
+            finally:
+                torch.multiprocessing.set_sharing_strategy(current_sharing_strategy)
+
             self._frame_vectors = np.asarray(frame_vectors)
 
             if cache_filepath is not None:
@@ -564,73 +578,117 @@ def __len__(self):
         Returns:
             length: Length of the dataset.
         """
-        return len(self._window_data_idx)
+        return len(self._window_data_idx) if self._window_data_idx is not None else 0
 
 
-if __name__ == "__main__":
+class _VectorizationDataset(Dataset):
+    """
+    Helper dataset for iterating over individual frames of data and producing
+    embedding vectors.
+ """ + + def __init__(self, vectorize: Vectorize, frame_data: Sequence[FrameData]): + self.vectorize = vectorize + self.frame_data = frame_data + + def __len__(self): + return len(self.frame_data) + + def __getitem__(self, item): + return self.vectorize(self.frame_data[item]) + + +@click.command() +@click.help_option("-h", "--help") +@click.argument("activity_coco", type=click.Path(path_type=Path)) +@click.argument("detections_coco", type=click.Path(path_type=Path)) +@click.argument("pose_coco", type=click.Path(path_type=Path)) +@click.option( + "--window-size", + type=int, + default=25, + show_default=True, +) +@click.option( + "--target-framerate", + type=float, + default=15, + show_default=True, +) +@click.option( + "--pre-vectorize", + is_flag=True, + help="Run pre-vectorization or not.", + show_default=True, +) +def test_dataset_for_input( + activity_coco: Path, + detections_coco: Path, + pose_coco: Path, + window_size: int, + target_framerate: float, + pre_vectorize: bool, +): + """ + Test the TCN Dataset iteration over some test data. + """ logging.basicConfig(level=logging.INFO) - # Example usage: - activity_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/activity_truth.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-activity_truth.coco.json" - ) - dets_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_object_detections.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-object_detections.coco.json" - ) - pose_coco = kwcoco.CocoDataset( - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_poses.coco.json" - "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-pose_estimates.coco.json" - ) + activity_coco = kwcoco.CocoDataset(activity_coco) + dets_coco = kwcoco.CocoDataset(detections_coco) + pose_coco = kwcoco.CocoDataset(pose_coco) + # TODO: Some method of configuring which vectorizer to use. from tcn_hpl.data.vectorize.classic import Classic - vectorizer = Classic( feat_version=6, top_k=1, - # M2-specific object detection class indices + # M2/R18 object detection class indices num_classes=7, background_idx=0, hand_left_idx=5, hand_right_idx=6, ) - dataset = TCNDataset(window_size=25, vectorizer=vectorizer) + + dataset = TCNDataset(window_size=window_size, vectorizer=vectorizer) dataset.load_data_offline( activity_coco, dets_coco, pose_coco, - target_framerate=15, - cache_dir="./test_cache", + target_framerate=target_framerate, + pre_vectorize=pre_vectorize, ) - print(f"dataset: {len(dataset)}") + logger.info(f"Number of windows: {len(dataset)}") + + # Get vector dimensionality + window_vecs = dataset[0] + logger.info(f"Feature vector dims: {window_vecs[0].shape[1]}") + + # Test that we can iterate over the dataset using a DataLoader with + # shuffling. 
     batch_size = 512  # 16
-    data_loader = torch.utils.data.DataLoader(
+    data_loader = DataLoader(
         dataset,
         batch_size=batch_size,
         shuffle=True,
-        num_workers=16,
+        num_workers=os.cpu_count(),
         pin_memory=True,
     )
-
     count = 0
     s = time.time()
-    for idx, batch in tqdm(
-        enumerate(data_loader),
+    for batch in tqdm(
+        data_loader,
         desc="Iterating batches of features",
         unit="batches",
     ):
         count += 1
     duration = time.time() - s
-
-    print(
-        f"Total batches of size {batch_size}: {count} ({duration:.02f} seconds total)"
-    )
+    logger.info(f"Iterated over the full TCN Dataset in {duration:.2f} s.")
 
     # Test creating online mode with subset of data from above.
-    dset_online = TCNDataset(window_size=25, vectorizer=vectorizer)
-    dset_online.load_data_online(dataset._frame_data[:25])  # noqa
+    dset_online = TCNDataset(window_size=window_size, vectorizer=vectorizer)
+    dset_online.load_data_online(dataset._frame_data[:window_size])  # noqa
     assert len(dset_online) == 1, "Online dataset should be size 1"
     _ = dset_online[0]
     failed_index_error = True
@@ -643,3 +701,7 @@
     assert (
         (dataset[0][0] == dset_online[0][0]).all()  # noqa
     ), "Online should have produced same window matrix as offline version."
+
+
+if __name__ == "__main__":
+    test_dataset_for_input()
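Note: the new click entry point can be exercised directly against a
train/val/test split, e.g. (a sketch; the COCO file paths are
placeholders for your own split files, and the script is assumed to be
invoked from the repository root):

    python tcn_hpl/data/tcn_dataset.py \
        TEST-activity_truth.coco.json \
        TEST-object_detections.coco.json \
        TEST-pose_estimates.coco.json \
        --window-size 25 --target-framerate 15 --pre-vectorize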