feat[dace][next]: Fixing strides in optimization (#1782)

Added functionality to properly handle changes of strides. During the implementation of the scan, we found that the strides were not handled properly. Most importantly, a change on one level was not propagated into the next levels, i.e. they were still using the old strides. This PR solves most of the problems, but some issues remain unsolved: - Views are not adjusted yet (fixed in [PR@1784](#1784)). - It is not properly checked whether the symbols of the propagated strides are safe to introduce into the nested SDFG. The initial functionality of this PR was implemented by Edoardo Paone (@edopao). --------- Co-authored-by: edopao <[email protected]>
GridTools · Dec 20, 2024 · 77cad7c · 77cad7c
1 parent 06b398a
commit 77cad7c
Show file tree

Hide file tree

Showing 6 changed files with 1,238 additions and 26 deletions.
diff --git a/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/__init__.py b/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/__init__.py
@@ -35,7 +35,13 @@
     gt_simplify,
     gt_substitute_compiletime_symbols,
 )
-from .strides import gt_change_transient_strides
+from .strides import (
+    gt_change_transient_strides,
+    gt_map_strides_to_dst_nested_sdfg,
+    gt_map_strides_to_src_nested_sdfg,
+    gt_propagate_strides_from_access_node,
+    gt_propagate_strides_of,
+)
 from .util import gt_find_constant_arguments, gt_make_transients_persistent
 
 
@@ -59,6 +65,10 @@
     "gt_gpu_transformation",
     "gt_inline_nested_sdfg",
     "gt_make_transients_persistent",
+    "gt_map_strides_to_dst_nested_sdfg",
+    "gt_map_strides_to_src_nested_sdfg",
+    "gt_propagate_strides_from_access_node",
+    "gt_propagate_strides_of",
     "gt_reduce_distributed_buffering",
     "gt_set_gpu_blocksize",
     "gt_set_iteration_order",

diff --git a/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/gpu_utils.py b/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/gpu_utils.py
@@ -95,7 +95,7 @@ def gt_gpu_transformation(
 
     if try_removing_trivial_maps:
         # In DaCe a Tasklet, outside of a Map, can not write into an _array_ that is on
-        #  GPU. `sdfg.appyl_gpu_transformations()` will wrap such Tasklets in a Map. So
+        #  GPU. `sdfg.apply_gpu_transformations()` will wrap such Tasklets in a Map. So
         #  we might end up with lots of these trivial Maps, each requiring a separate
         #  kernel launch. To prevent this we will combine these trivial maps, if
         #  possible, with their downstream maps.

diff --git a/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/simplify.py b/src/gt4py/next/program_processors/runners/dace_fieldview/transformations/simplify.py
@@ -950,7 +950,7 @@ def _perform_pointwise_test(
 
     def apply(
         self,
-        graph: dace.SDFGState | dace.SDFG,
+        graph: dace.SDFGState,
         sdfg: dace.SDFG,
     ) -> None:
         # Removal
@@ -971,6 +971,9 @@ def apply(
             tmp_out_subset = dace_subsets.Range.from_array(tmp_desc)
         assert glob_in_subset is not None
 
+        # Recursively visit the nested SDFGs for mapping of strides from inner to outer array
+        gtx_transformations.gt_map_strides_to_src_nested_sdfg(sdfg, graph, map_to_tmp_edge, glob_ac)
+
         # We now remove the `tmp` node, and create a new connection between
         #  the global node and the map exit.
         new_map_to_glob_edge = graph.add_edge(