Add support for minimal all_reduce for Llama shapes #17792

Status: Draft. Wants to merge 25 commits into base: main.

Commits (25):
503ca1c (caixunshiren, Feb 7, 2025): added ccl async llama perf and correctness measurement tests to CI
ccd3f98 (caixunshiren, Feb 10, 2025): updated ccl perf target for llama all gather async
0c9ccd7 (avoraTT, Feb 3, 2025): WIP adding infra for the new all reduce.
c2a11de (avoraTT, Feb 3, 2025): interleaved all gather works with good PCC. Next step: add reduction …
4b67c01 (avoraTT, Feb 3, 2025): wip reduction stuff.
ce9e52d (kpaigwar, Feb 4, 2025): #0: added noc semaphore multicast in writer, seeing a hang on noc_sem…
bf9cb47 (avoraTT, Feb 4, 2025): Add fix for reduction worker hang.
600352e (avoraTT, Feb 4, 2025): Add reduction and output cb. Currently, the reduction kernel does a c…
5d4d879 (avoraTT, Feb 4, 2025): All-reduce for FF1/FF3 works
27414ef (avoraTT, Feb 4, 2025): Add support for reshard. TODO: add support to drop padding from input…
31119ef (avoraTT, Feb 5, 2025): Add support for unpadded shapes.
24e65bb (avoraTT, Feb 5, 2025): Remove dprints.
7251015 (avoraTT, Feb 5, 2025): Fix bug in mcast bbox. Fix QKV output num cores.
b67ea6d (kpaigwar, Feb 6, 2025): #0: multi-link support added for 3 all_reduce. Link=3 fails with kern…
21f1779 (kpaigwar, Feb 6, 2025): #0: multi-link=3 works
e4a7b09 (avoraTT, Feb 6, 2025): Add cleanup for multi-link.
e0ce950 (avoraTT, Feb 6, 2025): Rebase and fix/cleanup stuff.
ea2edb7 (avoraTT, Feb 7, 2025): Clean up pytest and enable trace.
b6277d9 (avoraTT, Feb 7, 2025): Adding gsem fix for multi-iter.
4520362 (kpaigwar, Feb 7, 2025): #0: added api to subtract corerangesets
203e2e8 (kpaigwar, Feb 7, 2025): #0: updated choose_worker_cores function to omit reserved_cores
694ef4d (kpaigwar, Feb 7, 2025): #0: fix placement of link worker cores
a131897 (avoraTT, Feb 8, 2025): Add support for input shard not divisble by output shard.
bcf48a6 (caixunshiren, Feb 10, 2025): added all reduce into llama ccl perf test and added proper measuremen…
b779894 (johanna-rock-tt, Feb 6, 2025): Extend llama sharded all gather for LN
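
Taken together, the commits above assemble the minimal all_reduce as an all-gather of the per-device shards followed by a local reduction (see the "interleaved all gather" and "Add reduction and output cb" commits). A rough host-side sketch of that decomposition, in NumPy and purely for illustration; none of these names come from the PR:

import numpy as np

def all_reduce_via_gather(shards):
    # shards: one tensor per device, all with the same shape.
    gathered = np.stack(shards, axis=0)      # "all-gather": every device ends up with every shard
    reduced = gathered.sum(axis=0)           # local reduction of the gathered shards
    return [reduced.copy() for _ in shards]  # every device now holds the fully reduced tensor

# Example: 4 devices, each holding a 32x32 shard of activations.
outputs = all_reduce_via_gather([np.random.rand(32, 32) for _ in range(4)])
assert all(np.allclose(outputs[0], out) for out in outputs)
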
Changes from all commits:

tests/nightly/tg/ccl/test_ccl_async_TG_llama_nightly.py

@@ -14,6 +14,7 @@
    teardown_fabric_interface,
    create_global_semaphore_with_same_address,
)
from models.perf.benchmarking_utils import BenchmarkProfiler


def report_mismatches(golden, actual, max_printable=None):
@@ -63,7 +64,9 @@ def run_with_trace(
    n_worker=None,
    n_buffer=None,
    num_iter=20,
    warmup_iters=0,
    use_all_gather_async=False,
    profiler=BenchmarkProfiler(),
):
    # Compile Run
    logger.info("Compiling model")
@@ -97,44 +100,68 @@

    # Capture trace
    logger.info("Capturing trace")
    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
    for i in range(num_iter):
        if use_all_gather_async:
            logger.info("Running all-gather async")
            tt_out_tensor = ttnn.experimental.all_gather_async(
                input_tensor,
                dim,
                cluster_axis=cluster_axis,
                mesh_device=mesh_device,
                topology=ttnn.Topology.Linear,
                multi_device_global_semaphore=ccl_semaphore_handles[i]
                if type(ccl_semaphore_handles) == list
                else ccl_semaphore_handles,
                num_links=num_links,
                memory_config=output_mem_config,
                subdevice_id=worker_sub_device_id,
                enable_persistent_fabric_mode=enable_persistent_fabric,
            )
        else:
            tt_out_tensor = ttnn.all_gather(
                input_tensor,
                dim=dim,
                cluster_axis=cluster_axis,
                mesh_device=mesh_device,
                num_links=num_links,
                memory_config=output_mem_config,
                topology=all_gather_topology,
            )
    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)

    def capture_trace(n_iters):
        trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
        for i in range(n_iters):
            if use_all_gather_async:
                logger.info("Running all-gather async")
                tt_out_tensor = ttnn.experimental.all_gather_async(
                    input_tensor,
                    dim,
                    cluster_axis=cluster_axis,
                    mesh_device=mesh_device,
                    topology=ttnn.Topology.Linear,
                    multi_device_global_semaphore=ccl_semaphore_handles[i]
                    if type(ccl_semaphore_handles) == list
                    else ccl_semaphore_handles,
                    num_links=num_links,
                    memory_config=output_mem_config,
                    subdevice_id=worker_sub_device_id,
                    enable_persistent_fabric_mode=enable_persistent_fabric,
                )
            else:
                tt_out_tensor = ttnn.all_gather(
                    input_tensor,
                    dim=dim,
                    cluster_axis=cluster_axis,
                    mesh_device=mesh_device,
                    num_links=num_links,
                    memory_config=output_mem_config,
                    topology=all_gather_topology,
                )
        ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
        for d in mesh_device.get_devices():
            ttnn.synchronize_device(d)
        return trace_id

    if warmup_iters > 0:
        trace_id_warmup = capture_trace(warmup_iters)
    trace_id = capture_trace(num_iter)

    # Run the op
    logger.info("Starting Trace perf test...")
    profiler.start("all-gather-async-trace-warmup")
    if warmup_iters > 0:
        ttnn.execute_trace(mesh_device, trace_id_warmup, blocking=False)
        ttnn.release_trace(mesh_device, trace_id_warmup)
        for d in mesh_device.get_devices():
            ttnn.synchronize_device(d)
    profiler.end("all-gather-async-trace-warmup")

    profiler.start("all-gather-async-trace")
    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
    ttnn.release_trace(mesh_device, trace_id)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)
    profiler.end("all-gather-async-trace")
    time_taken = profiler.get_duration("all-gather-async-trace") - profiler.get_duration(
        "all-gather-async-trace-warmup"
    )
    effective_iter = num_iter - warmup_iters
    logger.info(f"Time taken: {time_taken} s")
    logger.info(f"Time per iter: {time_taken / effective_iter} s")
    logger.info(f"Time per iter: {time_taken / effective_iter * 1e6} us")

    return tt_out_tensor

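The new measurement path above captures two traces, one with warmup_iters iterations and one with num_iter iterations, executes both, and subtracts the warmup duration so the reported per-iteration time reflects steady-state behaviour. A self-contained sketch of that arithmetic using plain Python timers (illustrative only; the test itself relies on BenchmarkProfiler and ttnn traces):

import time

def timed(run, iters):
    # Stand-in for executing a captured trace that contains `iters` iterations.
    start = time.perf_counter()
    run(iters)
    return time.perf_counter() - start

def per_iter_seconds(run, num_iter, warmup_iters):
    # Mirrors run_with_trace: time the warmup trace first, then the full trace,
    # subtract the two, and average over the remaining iterations.
    t_warmup = timed(run, warmup_iters) if warmup_iters > 0 else 0.0
    t_total = timed(run, num_iter)
    return (t_total - t_warmup) / (num_iter - warmup_iters)

# Example with a dummy workload of roughly 1 ms per iteration.
print(per_iter_seconds(lambda n: time.sleep(0.001 * n), num_iter=20, warmup_iters=5))
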
@@ -156,10 +183,12 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
    output_shard_spec: ttnn.ShardSpec = None,
    num_all_gather_instances: int = 1,
    num_iters: int = 1,
    warmup_iters: int = 0,
    cluster_axis: int = 0,
    tile=(32, 32),
    trace_mode=False,
    debug=False,
    profiler=BenchmarkProfiler(),
    # New all-gather-async and persistent fabric params
    use_all_gather_async=False,
    enable_persistent_fabric=False,
@@ -269,7 +298,9 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
            enable_persistent_fabric=enable_persistent_fabric,
            all_gather_topology=ttnn.Topology.Linear,
            num_iter=num_iters,
            warmup_iters=warmup_iters,
            use_all_gather_async=use_all_gather_async,
            profiler=profiler,
        )

    else:
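
The "ccl perf target" commits indicate that the measured latency is ultimately compared against a target in the llama CCL perf test. A hedged sketch of such a check; the function name, threshold, and numbers below are illustrative and not taken from this PR:

def check_perf_target(time_taken_s, num_iter, warmup_iters, target_us):
    # Per-iteration latency in microseconds, using the same formula the test logs.
    effective_iter = num_iter - warmup_iters
    per_iter_us = time_taken_s / effective_iter * 1e6
    assert per_iter_us <= target_us, (
        f"per-iteration latency {per_iter_us:.2f} us exceeds target {target_us} us"
    )

# Example with made-up numbers: 75 iterations, 5 of them warmup, 120 us target.
check_perf_target(time_taken_s=0.0075, num_iter=75, warmup_iters=5, target_us=120)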