Skip to content

Commit

Permalink
fix kernel launch delay (#214)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #214

This diff fixes the kernel launch statistics for HTA.

When calculating the launch delay (assuming the delay is the time from the end of the kernel launch to the start of the kernel execution), the equation should be `gpu_kernel_start_ts - (cpu_kernel_start_ts + cpu_duration)`. The sign for `cpu_duration` is flipped here https://fburl.com/code/tckgei9p, which is incorrect. This diff fixes that.

Moreover, we clip the kernel launch delay to 0 if the value is negative, i.e., when kernel execution starts before the launch has finished.

Reviewed By: fengxizhou

Differential Revision: D68877268

fbshipit-source-id: 7eea89e270c2a7a6422af059aecb8e7604cf1ff1
  • Loading branch information
fenypatel99 authored and facebook-github-bot committed Jan 31, 2025
1 parent 235bee7 commit fdd059c
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
5 changes: 4 additions & 1 deletion hta/analyzers/cuda_kernel_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,9 +442,12 @@ def cuda_kernel_launch_stats(
on="correlation",
)
joined_df["launch_delay"] = (
joined_df["ts_y"] - joined_df["ts_x"] + joined_df["dur_x"]
joined_df["ts_y"] - joined_df["ts_x"] - joined_df["dur_x"]
)

# clip the launch delay to 0 if it is negative
joined_df["launch_delay"] = joined_df["launch_delay"].clip(lower=0)

# rename columns and select the required columns from the final dataframe
renamed_df = joined_df.rename(
columns={"dur_x": "cpu_duration", "dur_y": "gpu_duration"}
Expand Down
11 changes: 5 additions & 6 deletions tests/test_trace_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@ def test_get_cuda_kernel_launch_stats_training_multiple_ranks(self):

self.assertEqual(row1["cpu_duration"].item(), 16)
self.assertEqual(row1["gpu_duration"].item(), 2394)
self.assertEqual(row1["launch_delay"].item(), 16491)
self.assertEqual(row1["launch_delay"].item(), 16459)
self.assertEqual(row2["cpu_duration"].item(), 21)
self.assertEqual(row2["gpu_duration"].item(), 94)
self.assertEqual(row2["launch_delay"].item(), 41)
self.assertEqual(row2["launch_delay"].item(), 0)

def test_get_cuda_kernel_launch_stats_inference_single_rank(self):
dataframe_list = self.inference_t.get_cuda_kernel_launch_stats(visualize=False)
Expand All @@ -122,7 +122,7 @@ def test_get_cuda_kernel_launch_stats_inference_single_rank(self):

self.assertEqual(row["cpu_duration"].item(), 9)
self.assertEqual(row["gpu_duration"].item(), 3)
self.assertEqual(row["launch_delay"].item(), 20)
self.assertEqual(row["launch_delay"].item(), 2)

def test_get_mtia_kernel_launch_stats_inference_single_rank(self):
dataframe_list = self.mtia_single_rank_trace_t.get_cuda_kernel_launch_stats(
Expand All @@ -133,8 +133,7 @@ def test_get_mtia_kernel_launch_stats_inference_single_rank(self):

self.assertAlmostEqual(row["cpu_duration"].item(), 435.200, delta=2.0)
self.assertAlmostEqual(row["gpu_duration"].item(), 124.768, delta=2.0)
# To debug this test case: value being produced is 774.0
# self.assertAlmostEqual(row["launch_delay"].item(), 340.291, delta=2.0)
self.assertAlmostEqual(row["launch_delay"].item(), 0, delta=2.0)

def test_get_cuda_kernel_launch_stats_for_h100(self):
dataframe_dict = self.h100_trace_t.get_cuda_kernel_launch_stats(
Expand All @@ -146,7 +145,7 @@ def test_get_cuda_kernel_launch_stats_for_h100(self):
self.assertEqual(rank_1_df.shape[0], 32835)
self.assertEqual(row["cpu_duration"].item(), 20)
self.assertEqual(row["gpu_duration"].item(), 31)
self.assertEqual(row["launch_delay"].item(), 41)
self.assertEqual(row["launch_delay"].item(), 1)

def test_get_profiler_steps(self):
results = self.vision_transformer_t.get_profiler_steps()
Expand Down

0 comments on commit fdd059c

Please sign in to comment.