diff --git a/setup.py b/setup.py
index e8554ee04..ee95fb422 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@
             "train_command = tcn_hpl.train:main",
             "eval_command = tcn_hpl.eval:main",
             "bbn_create_truth_coco = tcn_hpl.data.utils.bbn:create_truth_coco",
+            "kwcoco_guided_subset = tcn_hpl.data.utils.kwcoco_guided_subset:main",
         ]
     },
 )
diff --git a/tcn_hpl/data/tcn_dataset.py b/tcn_hpl/data/tcn_dataset.py
index 26cef5c4b..5275f80b4 100644
--- a/tcn_hpl/data/tcn_dataset.py
+++ b/tcn_hpl/data/tcn_dataset.py
@@ -75,9 +75,14 @@ def __init__(
         # size for easy batching.
         # For online mode, expect only one window to be set at a time via the
         # `load_data_online` method.
-        # Content to be indexed into during __getitem__.
-        # This cannot be stored as a ndarray due to its variable nature.
-        self._window_data: List[List[FrameData]] = []
+
+        # FrameData for the total set of frames.
+        # This is not being stored as a ndarray due to its variable nature.
+        self._frame_data: List[FrameData] = []
+        # Content to be indexed into during __getitem__ that refers to which
+        # indices of self._frame_data compose that window index.
+        # Shape: (n_windows, window_size)
+        self._window_data_idx: Optional[npt.NDArray[int]] = None
         # The truth labels per-frame per-window.
         # Shape: (n_windows, window_size)
         self._window_truth: Optional[npt.NDArray[int]] = None
@@ -90,10 +95,13 @@ def __init__(
         self._window_frames: Optional[npt.NDArray[int]] = None
         # Optionally calculated weight to apply to a window. This is to support
         # weighted random sampling during training. This should only be
-        # available when there is truth avaialble, i.e. during offline mode.
+        # available when there is truth available, i.e. during offline mode.
         self._window_weights: Optional[npt.NDArray[float]] = None
-        # Optionally defined set of pre-computed window vectors.
-        self._window_vectors: Optional[npt.NDArray[float]] = None
+        # Optionally defined set of pre-computed vectors for each frame.
+        # Congruent index association with self._frame_data, so
+        # self._window_data_idx values may be used here.
+        # Shape: (n_frames, feat_dim)  # see self._frame_data
+        self._frame_vectors: Optional[npt.NDArray[np.float32]] = None
         # Constant 1's mask value to re-use during get-item.
         self._ones_mask: npt.NDArray[int] = np.ones(window_size, dtype=int)

@@ -110,23 +118,6 @@ def window_weights(self) -> npt.NDArray[float]:
             raise RuntimeError("No class weights calculated for this dataset.")
         return self._window_weights

-    def _vectorize_window(
-        self, window_data: Sequence[FrameData]
-    ) -> npt.NDArray[np.float32]:
-        """
-        Vectorize a single window of data.
-
-        Args:
-            window_data: Window of data to vectorize. Must be window-size
-                in length.
-
-        Returns:
-            Transformed vector.
-        """
-        assert len(window_data) == self.window_size
-        v = self.vectorizer
-        return np.asarray([v(d) for d in window_data])
-
     def load_data_offline(
         self,
         activity_coco: kwcoco.CocoDataset,
@@ -252,17 +243,26 @@ def load_data_offline(

         # Collect per-frame data first per-video, then slice into windows.
         #
-        # Windows of per-frame data that would go into producing a vector.
-        window_data: List[List[FrameData]] = []
+        # FrameData instances for each frame of each video. Each entry here
+        # would ostensibly be transformed into a vector.
+        frame_data: List[FrameData] = []
+
+        # Windows specifying which frames are a part of that window via index
+        # reference into frame_data.
+        # Shape: (n_windows, window_size)
+        window_data_idx: List[List[int]] = []
         # Activity classification truth labels per-frame per-window.
+        # Shape: (n_windows, window_size)
         window_truth: List[List[int]] = []
         # Video ID represented per window. Only one video should be represented
         # in any one window.
+        # Shape: (n_windows,)
         window_vid: List[int] = []
         # Image ID per-frame per-window.
+        # Shape: (n_windows, window_size)
         window_frames: List[List[int]] = []

         # cache frequently called module functions
@@ -273,7 +273,8 @@ def load_data_offline(
             vid_images = activity_coco.images(video_id=vid_id)
             vid_img_ids: List[int] = list(vid_images)
             vid_frames_all: List[int] = vid_images.lookup("frame_index")  # noqa
-            # Iterate over sub-videos if applicable. See comment earlier in func.
+            # Iterate over sub-videos if applicable. This should only turn out
+            # to be some integer >= 1. See comment earlier in func.
             vid_fr_multiple = vid_id_to_fr_multiple[vid_id]
             for starting_idx in range(vid_fr_multiple):  # may just be a single [0]
                 # video-local storage to keep things separate, will extend main
@@ -336,26 +337,42 @@ def load_data_offline(
                         frame_poses = empty_pose
                     vid_frame_data.append(FrameData(frame_dets, frame_poses))

+                # Compose a list of indices into frame_data where this video's
+                # worth of content resides.
+                vid_frame_data_idx: List[int] = list(
+                    range(
+                        len(frame_data),
+                        len(frame_data) + len(vid_frame_data),
+                    )
+                )
+                frame_data.extend(vid_frame_data)
+
                 # Slide this video's worth of frame data into windows such that
                 # each window is window_size long.
                 # If this video has fewer frames than window_size, this video
                 # effectively be skipped.
-                vid_window_truth = []
-                vid_window_data = []
-                vid_window_vid = []  # just a single ID per window referencing video
-                vid_window_frames = []  # Video frame numbers for frames of this window
+                vid_window_truth: List[List[int]] = []
+                vid_window_data_idx: List[List[int]] = []
+                # Just a single ID per window referencing the video that the
+                # window pertains to.
+                vid_window_vid: List[int] = []
+                # Video frame numbers for frames in windows.
+                vid_window_frames = []
                 for i in range(len(vid_frame_data) - self.window_size):
                     vid_window_truth.append(vid_frame_truth[i : i + self.window_size])
-                    vid_window_data.append(vid_frame_data[i : i + self.window_size])
+                    vid_window_data_idx.append(
+                        vid_frame_data_idx[i : i + self.window_size]
+                    )
                     vid_window_vid.append(vid_id)
                     vid_window_frames.append(vid_frames[i : i + self.window_size])

                 window_truth.extend(vid_window_truth)
-                window_data.extend(vid_window_data)
+                window_data_idx.extend(vid_window_data_idx)
                 window_vid.extend(vid_window_vid)
                 window_frames.extend(vid_window_frames)

-        self._window_data = window_data
+        self._frame_data = frame_data
+        self._window_data_idx = np.asarray(window_data_idx)
         self._window_truth = np.asarray(window_truth)
         self._window_vid = np.asarray(window_vid)
         self._window_frames = np.asarray(window_frames)
@@ -407,23 +424,23 @@ def load_data_offline(

         if pre_vectorize:
             if has_vector_cache:
-                logger.info("Loading window vectors from cache...")
+                logger.info("Loading frame vectors from cache...")
                 with np.load(cache_filepath) as data:
-                    self._window_vectors = data["window_vectors"]
-                logger.info("Loading window vectors from cache... Done")
+                    self._frame_vectors = data["frame_vectors"]
+                logger.info("Loading frame vectors from cache... Done")
             else:
                 # Pre-vectorize data for iteration efficiency during training.
                 # * Creating a mini Dataset/Dataloader situation to efficiently
                 #   generate vectors.
-                vectorize_window = self._vectorize_window
-                window_vectors: List[npt.NDArray[np.float32]] = []
+                frame_vectors: List[npt.NDArray[np.float32]] = []
+                vectorizer = self.vectorizer

                 class VecDset(Dataset):
                     def __getitem__(self, item):
-                        return vectorize_window(window_data[item])
+                        return vectorizer(frame_data[item])

                     def __len__(self):
-                        return len(window_data)
+                        return len(frame_data)

                 # Using larger batch sizes than 1 did not show any particular
                 # increase in throughput. This may require increasing
@@ -436,18 +453,18 @@ def __len__(self):

                 for batch in tqdm(
                     dloader,
-                    desc="Windows vectorized",
-                    unit="windows",
+                    desc="Frame data vectorized",
+                    unit="frames",
                 ):
-                    window_vectors.extend(batch.numpy())
-                self._window_vectors = window_vectors
+                    frame_vectors.extend(batch.numpy())
+                self._frame_vectors = np.asarray(frame_vectors)

                 if cache_filepath is not None:
                     logger.info("Saving window vectors to cache...")
                     cache_filepath.parent.mkdir(parents=True, exist_ok=True)
                     np.savez_compressed(
                         cache_filepath,
-                        window_vectors=window_vectors,
+                        frame_vectors=frame_vectors,
                     )
                     logger.info("Saving window vectors to cache... Done")
@@ -471,14 +488,17 @@ def load_data_online(
                 f"Input sequences did not match the configured window size "
                 f"({len(window_data)} != {self.window_size})."
             )
+        window_size = self.window_size
         # Assign a single window of frame data.
-        self._window_data = [list(window_data)]
+        self._frame_data = list(window_data)
+        # Make sure it has shape of (1, window_size) with the reshape.
+        self._window_data_idx = np.arange(window_size, dtype=int).reshape(1, -1)
         # The following are undefined for online mode, so we're just filling in
         # 0's enough to match size/shape requirements.
-        self._window_truth = np.zeros(shape=(1, self.window_size), dtype=int)
+        self._window_truth = np.zeros(shape=(1, window_size), dtype=int)
         self._window_vid = np.asarray([0])
-        self._window_frames = np.asarray([list(range(self.window_size))])
+        self._window_frames = self._window_data_idx

     def __getitem__(
         self, index: int
@@ -503,16 +523,20 @@ def __getitem__(
             * per-frame video ID, shape: (window_size,)
             * per-frame image ID, shape: (window_size,)
         """
-        window_data = self._window_data[index]
+        frame_data = self._frame_data
+        window_data_idx = self._window_data_idx[index]
         window_truth = self._window_truth[index]
         window_vid = self._window_vid[index]
         window_frames = self._window_frames[index]

-        window_vectors = self._window_vectors
-        if window_vectors is not None:
-            tcn_vector = window_vectors[index]
+        frame_vectors = self._frame_vectors
+        if frame_vectors is not None:
+            window_mat = frame_vectors[window_data_idx]
         else:
-            tcn_vector = self._vectorize_window(window_data)
+            vectorizer = self.vectorizer
+            window_mat = np.asarray(
+                [vectorizer(frame_data[idx]) for idx in window_data_idx]
+            )

         # Augmentation has to happen on the fly and cannot be pre-computed due
         # to random aspects that augmentation can be configured to have during
@@ -521,10 +545,10 @@ def __getitem__(
         # TODO: Augment using a helper on the vectorizer? I'm imaging that
         #       augmentations might be specific to which vectorizer is
         #       used.
-        tcn_vector = self.transform(tcn_vector)
+        window_mat = self.transform(window_mat)

         return (
-            tcn_vector,
+            window_mat,
             window_truth,
             # Under the current operation of this dataset, the mask should always
             # consist of 1's. This may be removed in the future.
@@ -540,7 +564,7 @@ def __len__(self):

         Returns:
             length: Length of the dataset.
""" - return len(self._window_data) + return len(self._window_data_idx) if __name__ == "__main__": @@ -550,17 +574,14 @@ def __len__(self): activity_coco = kwcoco.CocoDataset( # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/activity_truth.coco.json" "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-activity_truth.coco.json" - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TRAIN-activity_truth-vid_1.coco.json" ) dets_coco = kwcoco.CocoDataset( # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_object_detections.coco.json" "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-object_detections.coco.json" - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TRAIN-object_detections-vid_1.coco.json" ) pose_coco = kwcoco.CocoDataset( # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/all_poses.coco.json" "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TEST-pose_estimates.coco.json" - # "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/TRAIN-pose_estimates-vid_1.coco.json" ) from tcn_hpl.data.vectorize.classic import Classic @@ -568,14 +589,19 @@ def __len__(self): vectorizer = Classic( feat_version=6, top_k=1, - num_classes=7, # M2 object detection classes + # M2-specific object detection class indices + num_classes=7, background_idx=0, hand_left_idx=5, hand_right_idx=6, ) dataset = TCNDataset(window_size=25, vectorizer=vectorizer) dataset.load_data_offline( - activity_coco, dets_coco, pose_coco, target_framerate=15, cache_dir=None + activity_coco, + dets_coco, + pose_coco, + target_framerate=15, + cache_dir="./test_cache", ) print(f"dataset: {len(dataset)}") @@ -601,3 +627,19 @@ def __len__(self): print( f"Total batches of size {batch_size}: {count} ({duration:.02f} seconds total)" ) + + # Test creating online mode with subset of data from above. + dset_online = TCNDataset(window_size=25, vectorizer=vectorizer) + dset_online.load_data_online(dataset._frame_data[:25]) # noqa + assert len(dset_online) == 1, "Online dataset should be size 1" + _ = dset_online[0] + failed_index_error = True + try: + # Should index error + dset_online[1] + except IndexError: + failed_index_error = False + assert not failed_index_error, "Should have had an index error at [1]" + assert ( + (dataset[0][0] == dset_online[0][0]).all() # noqa + ), "Online should have produced same window matrix as offline version." 
diff --git a/tcn_hpl/data/utils/bbn.py b/tcn_hpl/data/utils/bbn.py
index 72a132305..421fc69be 100755
--- a/tcn_hpl/data/utils/bbn.py
+++ b/tcn_hpl/data/utils/bbn.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-
+from concurrent.futures import ThreadPoolExecutor
 import dataclasses
 import os
 from pathlib import Path
@@ -30,11 +30,22 @@

 @dataclasses.dataclass
 class VideoInfo:
+    # Path to the BBN Truth file associated with this video
     truth_file: Path
+
+    # Path to the source MP4 video file
     mp4_file: Path
+
+    # Directory path into which frame image files have been / are to be written
     frames_dir: Path = dataclasses.field(init=False)
+
+    # Total number of frames extracted/extractable from this video
     num_frames: int = dataclasses.field(init=False)
+
+    # Frames per second (Hz) of this video
     fps: float = dataclasses.field(init=False)
+
+    # Frame size in (h, w) format
     frame_size: typing.Tuple[int, int] = dataclasses.field(init=False)


@@ -66,20 +77,27 @@ def extract_bbn_video_frames(
     # quantity of frames in the video, we assume that this is already done.
     # Otherwise, progress for each frame, writing out the frame file if it does
     # not already exist in the directory.
+    # * Including parent directory name in description to be more contextually
+    #   informative
     if (
         not output_directory.is_dir()
         or len(list(output_directory.iterdir())) != num_frames
     ):
         output_directory.mkdir(exist_ok=True)
+        report_name = f"{video_path.parent.name}/{video_path.name}"
         for i in tqdm(
             range(int(num_frames)),
-            desc=f"Extracting frames from {video_path.name}",
+            # desc=f"Extracting frames from {video_path.name}",
+            desc=f"Extracting frames from {report_name}",
             unit="frame",
         ):
             ret, frame = video.read()
             frame_filepath = output_directory / f"{i:05d}.png"
             if not frame_filepath.is_file():
-                cv2.imwrite(frame_filepath.as_posix(), frame)
+                # Safely write to a temp and then atomically (on unix) rename.
+                tmp_filepath = output_directory / f".{i:05d}-PENDING.png"
+                cv2.imwrite(tmp_filepath.as_posix(), frame)
+                tmp_filepath.rename(frame_filepath)

     return num_frames, fps, (frame_h, frame_w)

@@ -145,7 +163,8 @@ def convert_truth_to_array(
     return activity_gt


-@click.command(context_settings={"help_option_names": ["-h", "--help"]})
+@click.command()
+@click.help_option("-h", "--help")
 @click.argument(
     "bbn_truth_root",
     type=click.Path(
@@ -176,12 +195,23 @@ def convert_truth_to_array(
         "truth root directory and working directory, respecively."
     ),
 )
+@click.option(
+    "--cores-extraction",
+    type=int,
+    default=8,
+    show_default=True,
+    help=(
+        "Level of parallelization to extract videos with. We will extract "
+        "frames for this many videos at the same time."
+    ),
+)
 def create_truth_coco(
     bbn_truth_root: Path,
     working_directory: Path,
     activity_label_config: Path,
     output_coco_filepath: Path,
     relative: bool,
+    cores_extraction: int,
 ) -> None:
     """
     Extract the component frames aof a directory of MP4 videos that have an
@@ -242,21 +272,35 @@ def create_truth_coco(

     ordered_vi_keys = sorted(video_info)

+    #
     # Pre-process video files into directories of frames.
-    # TODO: Could use thread-pool and submit a job per video.
-    for vi_key in tqdm(
-        ordered_vi_keys,
-        desc="Extracting frames from videos",
-        unit="videos",
-    ):
-        vi = video_info[vi_key]
-        frames_output_directory = working_directory / vi.mp4_file.relative_to(
-            bbn_truth_root
-        ).with_suffix(".frames")
-        vi.frames_dir = frames_output_directory
-        vi.num_frames, vi.fps, vi.frame_size = extract_bbn_video_frames(
-            vi.mp4_file, frames_output_directory
+    #
+    # ordered mp4 filenames
+    ordered_vi_mp4_filenames = [video_info[vi_key].mp4_file for vi_key in ordered_vi_keys]
+    # output directories per video
+    ordered_vi_output_dirs = [
+        working_directory / vi_mp4_path.relative_to(bbn_truth_root).with_suffix(".frames")
+        for vi_mp4_path in ordered_vi_mp4_filenames
+    ]
+    # Specifically using ThreadPoolExecutor so tqdm shows progress bars
+    # separately for each task.
+    with ThreadPoolExecutor(max_workers=cores_extraction) as pool:
+        results = pool.map(
+            extract_bbn_video_frames,
+            ordered_vi_mp4_filenames,
+            ordered_vi_output_dirs
         )
+        for vi_key, outdir_path, (num_frames, fps, frame_hw) in tqdm(
+            zip(ordered_vi_keys, ordered_vi_output_dirs, results),
+            desc="Integrating video frame extraction results",
+            total=len(ordered_vi_keys),
+            unit="videos",
+        ):
+            vi = video_info[vi_key]
+            vi.frames_dir = outdir_path
+            vi.num_frames = num_frames
+            vi.fps = fps
+            vi.frame_size = frame_hw

     # Home for our video, image and per-frame truth annotations.
     truth_ds = kwcoco.CocoDataset(img_root=working_directory.as_posix())
diff --git a/tcn_hpl/data/utils/kwcoco_guided_subset.py b/tcn_hpl/data/utils/kwcoco_guided_subset.py
new file mode 100644
index 000000000..7e131f562
--- /dev/null
+++ b/tcn_hpl/data/utils/kwcoco_guided_subset.py
@@ -0,0 +1,85 @@
+from os.path import exists
+from pathlib import Path
+
+import click
+import kwcoco
+
+
+@click.command()
+@click.help_option("-h", "--help")
+@click.argument(
+    "INPUT_COCO_FILEPATH",
+    type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
+)
+@click.argument(
+    "GUIDE_COCO_FILEPATH",
+    type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
+)
+@click.argument(
+    "OUTPUT_COCO_FILEPATH",
+    type=click.Path(dir_okay=False, writable=True, path_type=Path),
+)
+def main(
+    input_coco_filepath: Path,
+    guide_coco_filepath: Path,
+    output_coco_filepath: Path,
+):
+    """
+    CLI Utility to create a subset of a CocoDataset based on the image/video
+    content of some other dataset.
+
+    This tool will assert that the video and image content of the guide dataset
+    matches content present in the input dataset, as this filtering only makes
+    sense if this is true.
+
+    \b
+    Positional Arguments:
+        INPUT_COCO_FILEPATH:
+            Path to the COCO JSON file to be filtered into a subset.
+        GUIDE_COCO_FILEPATH:
+            Path to the COCO JSON file to provide the video/image content to
+            guide the filtering.
+        OUTPUT_COCO_FILEPATH:
+            Path to where we should save the output COCO JSON file.
+    """
+    dset_input = kwcoco.CocoDataset(input_coco_filepath)
+    dset_guide = kwcoco.CocoDataset(guide_coco_filepath)
+
+    # Assert that guide dataset video and image ID content is present in the
+    # input dataset
+    assert bool(dset_input.videos()) == bool(
+        dset_guide.videos()
+    ), "Input or guide has videos, but the other doesn't!"
+    if dset_input.videos():
+        # ensure video content in guide is present in input and matches exactly
+        guide_vid_diff = set(dset_guide.videos()).difference(dset_input.videos())
+        assert (
+            not guide_vid_diff
+        ), f"Guide dataset has video IDs not present in the input dataset: {guide_vid_diff}"
+        unmatched_guide_vid = [
+            vid
+            for vid in dset_guide.videos()
+            if dset_guide.index.videos[vid] != dset_input.index.videos[vid]
+        ]
+        assert (
+            not unmatched_guide_vid
+        ), f"Some guide videos are not present exactly in input dset: {unmatched_guide_vid}"
+    guide_gid_diff = set(dset_guide.images()).difference(dset_input.images())
+    assert (
+        not guide_gid_diff
+    ), f"Guide dataset has image IDs not present in the input dataset: {guide_gid_diff}"
+    unmatched_guide_gid = [
+        gid
+        for gid in dset_guide.images()
+        if dset_guide.index.imgs[gid] != dset_input.index.imgs[gid]
+    ]
+    assert (
+        not unmatched_guide_gid
+    ), f"Some guide images are not present exactly in the input dset: {unmatched_guide_gid}"
+
+    dset_subset: kwcoco.CocoDataset = dset_input.subset(dset_guide.images().gids)
+    output_coco_filepath.parent.mkdir(parents=True, exist_ok=True)
+    dset_subset.dump(
+        output_coco_filepath,
+        newlines=True,
+    )
diff --git a/tcn_hpl/data/vectorize/_data.py b/tcn_hpl/data/vectorize/_data.py
index f8ec55dea..95714cc0d 100644
--- a/tcn_hpl/data/vectorize/_data.py
+++ b/tcn_hpl/data/vectorize/_data.py
@@ -70,8 +70,39 @@ def __bool__(self):

 @dataclass
 class FrameData:
+    """
+    Structure composing information and correlated analytic results for a
+    single image frame.
+    """
+
+    # Object detection results for this frame.
+    # This may contain an instance with empty (zero-sized) component vectors,
+    # which implies that an object detection analytic was run on this frame but
+    # did not predict any objects.
+    # This may be None, which implies that an object detection analytic was not
+    # run for this frame.
     object_detections: tg.Optional[FrameObjectDetections]
+
+    # Pose estimation results for this frame.
+    # This may contain an instance with empty (zero-sized) component vectors,
+    # which implies that a pose estimation analytic was run on this frame but
+    # did not predict any poses.
+    # This may be None, which implies that a pose estimation analytic was not
+    # run for this frame.
     poses: tg.Optional[FramePoses]

     def __bool__(self):
+        """
+        Get if this frame contains analytic results or not.
+
+        This frame data instance is "False" if there are no analytic results
+        for this frame, and "True" if there are any analytic results (but
+        possibly not both).
+        Having "no analytic results" is defined as either the slot being `None`
+        valued or the component result itself evaluating as False (see the
+        bool logic for the component dataclass).
+
+        :return: True if this frame has any analytic results, False if it does
+            not.
+        """
         return bool(self.object_detections) or bool(self.poses)
diff --git a/tcn_hpl/data/vectorize_classic.py b/tcn_hpl/data/vectorize_classic.py
index 2b7de4d13..773d67931 100644
--- a/tcn_hpl/data/vectorize_classic.py
+++ b/tcn_hpl/data/vectorize_classic.py
@@ -1,3 +1,7 @@
+"""
+This file is a straight port of what previously was in angel_system/tcn_hpl/...
+The content here is being left as is to maintain previous behaviors.
+"""
 import os
 from typing import Dict, Tuple, List
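Note (editorial, not part of the patch): the new kwcoco_guided_subset module is exposed as a console script in setup.py, and its core behavior is the subset-by-guide-images call shown above. A hedged Python sketch of equivalent usage, with hypothetical placeholder file names:

import kwcoco

# Take the subset of a larger dataset covering only the images present in a
# smaller "guide" dataset, then write it out (paths are examples only).
dset_input = kwcoco.CocoDataset("all_object_detections.coco.json")
dset_guide = kwcoco.CocoDataset("TEST-activity_truth.coco.json")
dset_subset = dset_input.subset(dset_guide.images().gids)
dset_subset.dump("TEST-object_detections.coco.json", newlines=True)

The CLI wrapper adds the up-front assertions that every guide video and image ID exists, and matches exactly, in the input dataset before the subset is taken.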