From fdd059c698b6212f9191d3338e670f235026cb3e Mon Sep 17 00:00:00 2001 From: Feny Patel Date: Fri, 31 Jan 2025 08:20:05 -0800 Subject: [PATCH] fix kernel launch delay (#214) Summary: Pull Request resolved: https://github.com/facebookresearch/HolisticTraceAnalysis/pull/214 This diff fixes the kernel launch statistics for HTA. When calculating the launch delay (assuming the delay is the time from the end of the kernel launch to the start of the kernel execution), the equation should be `(gpu_kernel_start_ts) - (cpu_kernel_start_ts + cpu_duration)`. The sign for `cpu_duration` was flipped here (https://fburl.com/code/tckgei9p), which is incorrect; this diff corrects the sign. Moreover, we clip the kernel launch delay to 0 if the value is negative, i.e., when the kernel execution starts before the launch has finished. Reviewed By: fengxizhou Differential Revision: D68877268 fbshipit-source-id: 7eea89e270c2a7a6422af059aecb8e7604cf1ff1 --- hta/analyzers/cuda_kernel_analysis.py | 5 ++++- tests/test_trace_analysis.py | 11 +++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/hta/analyzers/cuda_kernel_analysis.py b/hta/analyzers/cuda_kernel_analysis.py index 0b0ec55..c0e0760 100644 --- a/hta/analyzers/cuda_kernel_analysis.py +++ b/hta/analyzers/cuda_kernel_analysis.py @@ -442,9 +442,12 @@ def cuda_kernel_launch_stats( on="correlation", ) joined_df["launch_delay"] = ( - joined_df["ts_y"] - joined_df["ts_x"] + joined_df["dur_x"] + joined_df["ts_y"] - joined_df["ts_x"] - joined_df["dur_x"] ) + # clip the launch delay to 0 if it is negative + joined_df["launch_delay"] = joined_df["launch_delay"].clip(lower=0) + # rename columns and select the required columns from the final dataframe renamed_df = joined_df.rename( columns={"dur_x": "cpu_duration", "dur_y": "gpu_duration"} diff --git a/tests/test_trace_analysis.py b/tests/test_trace_analysis.py index d39eae2..87c4709 100644 --- a/tests/test_trace_analysis.py +++ b/tests/test_trace_analysis.py @@ -110,10 +110,10 @@ def 
test_get_cuda_kernel_launch_stats_training_multiple_ranks(self): self.assertEqual(row1["cpu_duration"].item(), 16) self.assertEqual(row1["gpu_duration"].item(), 2394) - self.assertEqual(row1["launch_delay"].item(), 16491) + self.assertEqual(row1["launch_delay"].item(), 16459) self.assertEqual(row2["cpu_duration"].item(), 21) self.assertEqual(row2["gpu_duration"].item(), 94) - self.assertEqual(row2["launch_delay"].item(), 41) + self.assertEqual(row2["launch_delay"].item(), 0) def test_get_cuda_kernel_launch_stats_inference_single_rank(self): dataframe_list = self.inference_t.get_cuda_kernel_launch_stats(visualize=False) @@ -122,7 +122,7 @@ def test_get_cuda_kernel_launch_stats_inference_single_rank(self): self.assertEqual(row["cpu_duration"].item(), 9) self.assertEqual(row["gpu_duration"].item(), 3) - self.assertEqual(row["launch_delay"].item(), 20) + self.assertEqual(row["launch_delay"].item(), 2) def test_get_mtia_kernel_launch_stats_inference_single_rank(self): dataframe_list = self.mtia_single_rank_trace_t.get_cuda_kernel_launch_stats( @@ -133,8 +133,7 @@ def test_get_mtia_kernel_launch_stats_inference_single_rank(self): self.assertAlmostEqual(row["cpu_duration"].item(), 435.200, delta=2.0) self.assertAlmostEqual(row["gpu_duration"].item(), 124.768, delta=2.0) - # To debug this test case: value being produced is 774.0 - # self.assertAlmostEqual(row["launch_delay"].item(), 340.291, delta=2.0) + self.assertAlmostEqual(row["launch_delay"].item(), 0, delta=2.0) def test_get_cuda_kernel_launch_stats_for_h100(self): dataframe_dict = self.h100_trace_t.get_cuda_kernel_launch_stats( @@ -146,7 +145,7 @@ def test_get_cuda_kernel_launch_stats_for_h100(self): self.assertEqual(rank_1_df.shape[0], 32835) self.assertEqual(row["cpu_duration"].item(), 20) self.assertEqual(row["gpu_duration"].item(), 31) - self.assertEqual(row["launch_delay"].item(), 41) + self.assertEqual(row["launch_delay"].item(), 1) def test_get_profiler_steps(self): results = 
self.vision_transformer_t.get_profiler_steps()