From 0d639aca9d6dc6ac975f427da13b25c5bd8d4c35 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Sun, 13 Oct 2024 18:10:01 -0400
Subject: [PATCH 01/16] initial implmentation of meomry algorithm

---
 .../compiler/cost_estimator/cost_estimator.h  |  9 +--
 .../compiler/cost_estimator/cost_metric.h     | 28 ++++++++
 .../cost_estimator/cost_metric.struct.toml    | 18 ++++++
 ...easible_machine_mapping_result.struct.toml |  5 +-
 .../get_optimal_machine_mapping.h             | 18 ++++--
 .../machine_mapping_config.struct.toml        | 13 ++++
 .../machine_mapping/machine_mapping_result.h  | 20 ++++--
 .../machine_mapping_state.struct.toml         | 10 +++
 .../machine_memory_constraints.struct.toml    | 13 ++++
 .../compiler/cost_estimator/cost_estimator.cc |  4 +-
 .../compiler/cost_estimator/cost_metric.cc    | 55 ++++++++++++++++
 .../get_optimal_machine_mapping.cc            | 56 +++++++++++-----
 .../machine_mapping/machine_mapping_result.cc | 64 +++++++++++++++----
 13 files changed, 269 insertions(+), 44 deletions(-)
 create mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.h
 create mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml
 create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
 create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml
 create mode 100644 lib/compiler/src/compiler/cost_estimator/cost_metric.cc
diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
index 65bae0c76a..55311af83b 100644
--- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
+++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 
+#include "compiler/cost_estimator/cost_metric.dtg.h"
 #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
 #include "compiler/cost_estimator/tensor_set_movement.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
@@ -11,8 +12,8 @@
 namespace FlexFlow {
 
 struct ICostEstimator {
-  virtual float estimate_cost(OpCostEstimateKey const &) const = 0;
-  virtual float estimate_cost(TensorSetMovement const &) const = 0;
+  virtual CostMetric estimate_cost(OpCostEstimateKey const &) const = 0;
+  virtual CostMetric estimate_cost(TensorSetMovement const &) const = 0;
 
   ICostEstimator() = default;
   ICostEstimator(ICostEstimator const &) = delete;
@@ -23,8 +24,8 @@ struct ICostEstimator {
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator);
 
 struct CostEstimator {
-  float estimate_cost(OpCostEstimateKey const &k) const;
-  float estimate_cost(TensorSetMovement const &m) const;
+  CostMetric estimate_cost(OpCostEstimateKey const &k) const;
+  CostMetric estimate_cost(TensorSetMovement const &m) const;
 
   template <typename T, typename... Args>
   static typename std::enable_if<std::is_base_of<ICostEstimator, T>::value,
diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.h b/lib/compiler/include/compiler/cost_estimator/cost_metric.h
new file mode 100644
index 0000000000..98b0cb228d
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/cost_metric.h
@@ -0,0 +1,28 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H
+
+#include "compiler/cost_estimator/cost_metric.dtg.h"
+#include <vector>
+
+namespace FlexFlow {
+
+CostMetric zero_cost_metric();
+
+CostMetric combine_cost_metrics_inter_device(CostMetric const &c1,
+                                             CostMetric const &c2);
+CostMetric
+    combine_cost_metrics_inter_device(std::vector<CostMetric> const &costs);
+
+CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1,
+                                                        CostMetric const &c2);
+CostMetric combine_cost_metrics_intra_device_sequential(
+    std::vector<CostMetric> const &costs);
+
+CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1,
+                                                      CostMetric const &c2);
+CostMetric combine_cost_metrics_intra_device_parallel(
+    std::vector<CostMetric> const &costs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml b/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml
new file mode 100644
index 0000000000..0666bb9e11
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml
@@ -0,0 +1,18 @@
+namespace = "FlexFlow"
+name = "CostMetric"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+]
+
+includes = [
+]
+
+[[fields]]
+name = "runtime"
+type = "float"
+
+[[fields]]
+name = "memory"
+type = "size_t"
diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
index e71cfc540f..07dc30d2fc 100644
--- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
@@ -8,11 +8,12 @@ features = [
 
 includes = [
   "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h",
+  "compiler/cost_estimator/cost_metric.dtg.h",
 ]
 
 [[fields]]
-name = "runtime"
-type = "float"
+name = "cost"
+type = "::FlexFlow::CostMetric"
 
 [[fields]]
 name = "machine_mapping"
diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
index 62da90bfcb..cd4896e260 100644
--- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
@@ -2,11 +2,13 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_GET_OPTIMAL_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping_cache.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_context.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 
@@ -17,7 +19,9 @@ MachineMappingResult
                                 MachineMappingContext const &context,
                                 MachineMappingProblemTree const &problem_tree,
                                 MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints);
+                                MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
+                                MachineMappingConfig const &config);
 
 MachineMappingResult
     get_optimal_machine_mapping(MachineMappingCache &result_cache,
@@ -25,22 +29,28 @@ MachineMappingResult
                                 MMProblemTreeSeriesSplit const &series_split,
                                 MachineSpecification const &resources,
                                 MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
                                 std::optional<ParallelSplitTransformation> const
-                                    &parallel_split_transformation);
+                                    &parallel_split_transformation,
+                                MachineMappingConfig const &config);
 
 MachineMappingResult get_optimal_machine_mapping(
     MachineMappingCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints);
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config);
 
 MachineMappingResult
     get_optimal_machine_mapping(MachineMappingCache &result_cache,
                                 MachineMappingContext const &,
                                 UnmappedOpCostEstimateKey const &leaf,
                                 MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints);
+                                MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
+                                MachineMappingConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
new file mode 100644
index 0000000000..f4c0b61291
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "MachineMappingConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = []
+
+[[fields]]
+name = "enable_memory_optimization"
+type = "bool"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index b21fea5f24..642d48ec02 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -3,6 +3,8 @@
 
 #include "compiler/machine_mapping/machine_mapping_result.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
+#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 
 namespace FlexFlow {
 
@@ -14,22 +16,32 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     std::unordered_set<MachineMappingResult> const &);
 
 [[nodiscard]] MachineMappingResult
-    series_combine(float comm_cost,
+    series_combine(MachineMappingConfig const &config,
+                   MachineMemoryConstraints const &memory_constraints,
+                   CostMetric const &comm_cost,
                    MachineMappingResult const &pre_result,
                    MachineMappingResult const &post_result,
                    std::optional<ParallelSplitTransformation> const
                        &parallel_split_transformation);
 [[nodiscard]] MachineMappingResult
-    parallel_combine(MachineMappingResult const &lhs_result,
+    parallel_combine(MachineMappingConfig const &config,
+                     MachineMemoryConstraints const &memory_constraints,
+                     MachineMappingResult const &lhs_result,
                      MachineMappingResult const &rhs_result);
 
 [[nodiscard]] MachineMappingResult
     minimize_runtime(MachineMappingResult const &m1,
                      MachineMappingResult const &m2);
 
+[[nodiscard]] MachineMappingResult make_singleton_machine_mapping_result(
+    MachineMappingConfig const &config,
+    MachineMemoryConstraints const &memory_constraints,
+    CostMetric const &cost,
+    MachineView const &machine_view);
+
 [[nodiscard]] MachineMappingResult
-    make_singleton_machine_mapping_result(float runtime,
-                                          MachineView const &machine_view);
+    machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints,
+                                 MachineMappingResult const &result);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
index 1346f6ebe7..b4a6147b5a 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
@@ -9,7 +9,9 @@ features = [
 includes = [
   "pcg/machine_specification.dtg.h",
   "compiler/machine_mapping/machine_mapping_constraints.dtg.h",
+  "compiler/machine_mapping/machine_mapping_config.dtg.h",
   "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h",
+  "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h",
 ]
 
 [[fields]]
@@ -23,3 +25,11 @@ type = "::FlexFlow::MachineSpecification"
 [[fields]]
 name = "constraints"
 type = "::FlexFlow::MachineMappingConstraints"
+
+[[fields]]
+name = "memory_constraints"
+type = "::FlexFlow::MachineMemoryConstraints"
+
+[[fields]]
+name = "config"
+type = "::FlexFlow::MachineMappingConfig"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml
new file mode 100644
index 0000000000..0d2572c783
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "MachineMemoryConstraints"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = []
+
+[[fields]]
+name = "memory_limit"
+type = "size_t"
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
index 051ffcd190..10e999dc1a 100644
--- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
+++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
@@ -5,11 +5,11 @@ namespace FlexFlow {
 CostEstimator::CostEstimator(std::shared_ptr<ICostEstimator> implementation_ptr)
     : implementation_ptr(implementation_ptr) {}
 
-float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+CostMetric CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->implementation_ptr->estimate_cost(k);
 }
 
-float CostEstimator::estimate_cost(TensorSetMovement const &m) const {
+CostMetric CostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->implementation_ptr->estimate_cost(m);
 }
 
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
new file mode 100644
index 0000000000..370afab406
--- /dev/null
+++ b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
@@ -0,0 +1,55 @@
+#include "compiler/cost_estimator/cost_metric.h"
+
+namespace FlexFlow {
+
+CostMetric zero_cost_metric() {
+  return CostMetric{
+    /*runtime=*/0,
+    /*memory=*/0,
+  };
+}
+
+CostMetric combine_cost_metrics_inter_device(CostMetric const &c1,
+                                             CostMetric const &c2) {
+  return CostMetric{c1.runtime + c2.runtime, c1.memory + c2.memory};
+}
+
+CostMetric
+    combine_cost_metrics_inter_device(std::vector<CostMetric> const &costs) {
+  CostMetric result = zero_cost_metric();
+  for (CostMetric const &cost : costs) {
+    result = combine_cost_metrics_inter_device(result, cost);
+  }
+  return result;
+}
+
+CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1,
+                                                        CostMetric const &c2) {
+  return CostMetric{c1.runtime + c2.runtime, std::max(c1.memory, c2.memory)};
+}
+
+CostMetric combine_cost_metrics_intra_device_sequential(
+    std::vector<CostMetric> const &costs) {
+  CostMetric result = zero_cost_metric();
+  for (CostMetric const &cost : costs) {
+    result = combine_cost_metrics_intra_device_sequential(result, cost);
+  }
+  return result;
+}
+
+CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1,
+                                                      CostMetric const &c2) {
+  return CostMetric{std::max(c1.runtime, c2.runtime),
+                    std::max(c1.memory, c2.memory)};
+}
+
+CostMetric combine_cost_metrics_intra_device_parallel(
+    std::vector<CostMetric> const &costs) {
+  CostMetric result = zero_cost_metric();
+  for (CostMetric const &cost : costs) {
+    result = combine_cost_metrics_intra_device_parallel(result, cost);
+  }
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 10abd7ff90..a1a1595d98 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -29,12 +29,16 @@ MachineMappingResult
                                 MachineMappingContext const &context,
                                 MachineMappingProblemTree const &problem_tree,
                                 MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints) {
+                                MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
+                                MachineMappingConfig const &config) {
 
   MachineMappingState state = MachineMappingState{
       problem_tree,
       resources,
       constraints,
+      memory_constraints,
+      config,
   };
 
   {
@@ -54,14 +58,18 @@ MachineMappingResult
                 series_split,
                 resources,
                 constraints,
-                /*parallel_split_transformation=*/std::nullopt);
+                memory_constraints,
+                /*parallel_split_transformation=*/std::nullopt,
+                config);
           },
           [&](auto const &decomp_tree_node) {
             return get_optimal_machine_mapping(result_cache,
                                                context,
                                                decomp_tree_node,
                                                resources,
-                                               constraints);
+                                               constraints,
+                                               memory_constraints,
+                                               config);
           },
       });
 
@@ -75,8 +83,10 @@ MachineMappingResult
                                 MMProblemTreeSeriesSplit const &series_split,
                                 MachineSpecification const &resources,
                                 MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
                                 std::optional<ParallelSplitTransformation> const
-                                    &parallel_split_transformation) {
+                                    &parallel_split_transformation,
+                                MachineMappingConfig const &config) {
 
   auto get_boundary_machine_view_assignments =
       [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
@@ -110,7 +120,9 @@ MachineMappingResult
                                         context,
                                         series_split.get_left_child(),
                                         resources,
-                                        pre_candidate);
+                                        pre_candidate,
+                                        memory_constraints,
+                                        config);
 
         return pre_result;
       };
@@ -126,7 +138,9 @@ MachineMappingResult
                                         context,
                                         series_split.get_right_child(),
                                         resources,
-                                        post_candidate);
+                                        post_candidate,
+                                        memory_constraints,
+                                        config);
 
         return post_result;
       };
@@ -155,11 +169,13 @@ MachineMappingResult
               tensor_movement,
               /*pre_mapping=*/assigned_pre_machine_views,
               /*post_mapping=*/assigned_post_machine_views);
-      float cost_across_split =
+      CostMetric cost_across_split =
           context.cost_estimator.estimate_cost(comm_across_split);
 
       result = minimize_runtime(result,
-                                series_combine(cost_across_split,
+                                series_combine(config,
+                                               memory_constraints,
+                                               cost_across_split,
                                                pre_result,
                                                post_result,
                                                parallel_split_transformation));
@@ -174,7 +190,9 @@ MachineMappingResult get_optimal_machine_mapping(
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints) {
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config) {
 
   MachineMappingProblemTree lhs = parallel_split.get_left_child();
   MachineMappingProblemTree rhs = parallel_split.get_right_child();
@@ -191,7 +209,9 @@ MachineMappingResult get_optimal_machine_mapping(
                                        series_split,
                                        resources,
                                        constraints,
-                                       ParallelSplitTransformation::LthenR);
+                                       memory_constraints,
+                                       ParallelSplitTransformation::LthenR,
+                                       config);
   }();
 
   MachineMappingConstraints left_constraints =
@@ -203,15 +223,17 @@ MachineMappingResult get_optimal_machine_mapping(
       [&](std::pair<MachineSpecification, MachineSpecification> const
               &resource_split) {
         MachineMappingResult left_result = get_optimal_machine_mapping(
-            result_cache, context, lhs, resource_split.first, left_constraints);
+            result_cache, context, lhs, resource_split.first, left_constraints, memory_constraints, config);
         MachineMappingResult right_result =
             get_optimal_machine_mapping(result_cache,
                                         context,
                                         rhs,
                                         resource_split.second,
-                                        right_constraints);
+                                        right_constraints,
+                                        memory_constraints,
+                                        config);
 
-        return parallel_combine(left_result, right_result);
+        return parallel_combine(config, memory_constraints, left_result, right_result);
       };
 
   std::unordered_set<MachineMappingResult> parallel_results = transform(
@@ -226,7 +248,9 @@ MachineMappingResult
                                 MachineMappingContext const &context,
                                 UnmappedOpCostEstimateKey const &leaf,
                                 MachineSpecification const &resource,
-                                MachineMappingConstraints const &constraints) {
+                                MachineMappingConstraints const &constraints,
+                                MachineMemoryConstraints const &memory_constraints,
+                                MachineMappingConfig const &config) {
 
   std::unordered_set<MachineView> candidates = [&] {
     std::optional<MachineView> machine_view = require_only_root(constraints);
@@ -240,9 +264,9 @@ MachineMappingResult
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    float cost = context.cost_estimator.estimate_cost(mapped);
+    CostMetric cost = context.cost_estimator.estimate_cost(mapped);
 
-    return make_singleton_machine_mapping_result(cost, machine_view);
+    return make_singleton_machine_mapping_result(config, memory_constraints, cost, machine_view);
   };
 
   std::unordered_set<MachineMappingResult> candidate_results =
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
index 3409f7f871..18e5049022 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -1,4 +1,5 @@
 #include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/cost_estimator/cost_metric.h"
 #include "compiler/machine_mapping/machine_mapping.h"
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h"
 #include "utils/containers/map_keys.h"
@@ -32,7 +33,9 @@ FeasibleMachineMappingResult
 }
 
 MachineMappingResult
-    series_combine(float comm_cost,
+    series_combine(MachineMappingConfig const &config,
+                   MachineMemoryConstraints const &memory_constraints,
+                   CostMetric const &comm_cost,
                    MachineMappingResult const &maybe_pre_result,
                    MachineMappingResult const &maybe_post_result,
                    std::optional<ParallelSplitTransformation> const
@@ -63,16 +66,25 @@ MachineMappingResult
     }
   }();
 
-  return MachineMappingResult{
+  MachineMappingResult result_without_memory_check = MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*runtime=*/pre_result.runtime + comm_cost + post_result.runtime,
+          /*cost=*/combine_cost_metrics_inter_device(
+              {pre_result.cost, comm_cost, post_result.cost}),
           /*machine_mapping=*/mapping,
       },
   };
+
+  if (config.enable_memory_optimization) {
+    return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+  } else {
+    return result_without_memory_check;
+  }
 }
 
 MachineMappingResult
-    parallel_combine(MachineMappingResult const &maybe_lhs_result,
+    parallel_combine(MachineMappingConfig const &config,
+                     MachineMemoryConstraints const &memory_constraints,
+                     MachineMappingResult const &maybe_lhs_result,
                      MachineMappingResult const &maybe_rhs_result) {
   FeasibleMachineMappingResult lhs_result = ({
     if (is_infeasible(maybe_lhs_result)) {
@@ -88,14 +100,21 @@ MachineMappingResult
     require_feasible(maybe_rhs_result);
   });
 
-  return MachineMappingResult{
+  MachineMappingResult result_without_memory_check = MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*runtime=*/std::max(lhs_result.runtime, rhs_result.runtime),
+          /*cost=*/combine_cost_metrics_intra_device_parallel(lhs_result.cost,
+                                                              rhs_result.cost),
           /*machine_mapping=*/
           binary_combine_mappings(/*lhs=*/lhs_result.machine_mapping,
                                   /*rhs=*/rhs_result.machine_mapping),
       },
   };
+
+  if (config.enable_memory_optimization) {
+    return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+  } else {
+    return result_without_memory_check;
+  }
 }
 
 MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1,
@@ -114,25 +133,46 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1,
     require_feasible(maybe_m2);
   });
 
-  if (m2.runtime < m1.runtime) {
+  if (m2.cost.runtime < m1.cost.runtime) {
     return maybe_m2;
   } else {
     return maybe_m1;
   }
 }
 
-MachineMappingResult
-    make_singleton_machine_mapping_result(float runtime,
-                                          MachineView const &machine_view) {
-  return MachineMappingResult{
+MachineMappingResult make_singleton_machine_mapping_result(
+    MachineMappingConfig const &config,
+    MachineMemoryConstraints const &memory_constraints,
+    CostMetric const &cost,
+    MachineView const &machine_view) {
+  MachineMappingResult result_without_memory_check = MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*runtime=*/runtime,
+          /*cost=*/cost,
           /*machine_mapping=*/
           ParallelLayerGuidObliviousMachineMapping{{
               {binary_tree_root_path(), machine_view},
           }},
       },
   };
+
+  return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+}
+
+MachineMappingResult
+    machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints,
+                                 MachineMappingResult const &result) {
+  FeasibleMachineMappingResult feasible_result = ({
+    if (is_infeasible(result)) {
+      return infeasible_machine_mapping_result();
+    }
+    require_feasible(result);
+  });
+
+  if (feasible_result.cost.memory > memory_constraints.memory_limit) {
+    return infeasible_machine_mapping_result();
+  } else {
+    return result;
+  }
 }
 
 } // namespace FlexFlow

From da857a5e1e2e888773772f09bcc7d003cd2d95d5 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 16 Oct 2024 00:39:56 -0400
Subject: [PATCH 02/16] fmt

---
 .../get_optimal_machine_mapping.h             | 54 +++++++--------
 .../machine_mapping/machine_mapping_result.h  | 10 +--
 .../compiler/cost_estimator/cost_metric.cc    |  4 +-
 .../get_optimal_machine_mapping.cc            | 68 +++++++++++--------
 .../machine_mapping/machine_mapping_result.cc | 15 ++--
 5 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
index cd4896e260..e8b3771430 100644
--- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
@@ -14,25 +14,25 @@
 
 namespace FlexFlow {
 
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &context,
-                                MachineMappingProblemTree const &problem_tree,
-                                MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                MachineMappingConfig const &config);
-
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &context,
-                                MMProblemTreeSeriesSplit const &series_split,
-                                MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                std::optional<ParallelSplitTransformation> const
-                                    &parallel_split_transformation,
-                                MachineMappingConfig const &config);
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &context,
+    MachineMappingProblemTree const &problem_tree,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config);
+
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeSeriesSplit const &series_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    std::optional<ParallelSplitTransformation> const
+        &parallel_split_transformation,
+    MachineMappingConfig const &config);
 
 MachineMappingResult get_optimal_machine_mapping(
     MachineMappingCache &result_cache,
@@ -43,14 +43,14 @@ MachineMappingResult get_optimal_machine_mapping(
     MachineMemoryConstraints const &memory_constraints,
     MachineMappingConfig const &config);
 
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &,
-                                UnmappedOpCostEstimateKey const &leaf,
-                                MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                MachineMappingConfig const &config);
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &,
+    UnmappedOpCostEstimateKey const &leaf,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index 642d48ec02..c240d68f2b 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -1,10 +1,10 @@
 #ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H
 
+#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_result.dtg.h"
-#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 #include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h"
-#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
+#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 
 namespace FlexFlow {
 
@@ -39,9 +39,9 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     CostMetric const &cost,
     MachineView const &machine_view);
 
-[[nodiscard]] MachineMappingResult
-    machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints,
-                                 MachineMappingResult const &result);
+[[nodiscard]] MachineMappingResult machine_mapping_memory_check(
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingResult const &result);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
index 370afab406..dfaf0702c9 100644
--- a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
+++ b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
@@ -4,8 +4,8 @@ namespace FlexFlow {
 
 CostMetric zero_cost_metric() {
   return CostMetric{
-    /*runtime=*/0,
-    /*memory=*/0,
+      /*runtime=*/0,
+      /*memory=*/0,
   };
 }
 
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index a1a1595d98..3321d53e98 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -24,14 +24,14 @@
 
 namespace FlexFlow {
 
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &context,
-                                MachineMappingProblemTree const &problem_tree,
-                                MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                MachineMappingConfig const &config) {
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &context,
+    MachineMappingProblemTree const &problem_tree,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config) {
 
   MachineMappingState state = MachineMappingState{
       problem_tree,
@@ -77,16 +77,16 @@ MachineMappingResult
   return result;
 }
 
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &context,
-                                MMProblemTreeSeriesSplit const &series_split,
-                                MachineSpecification const &resources,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                std::optional<ParallelSplitTransformation> const
-                                    &parallel_split_transformation,
-                                MachineMappingConfig const &config) {
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeSeriesSplit const &series_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    std::optional<ParallelSplitTransformation> const
+        &parallel_split_transformation,
+    MachineMappingConfig const &config) {
 
   auto get_boundary_machine_view_assignments =
       [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
@@ -222,8 +222,14 @@ MachineMappingResult get_optimal_machine_mapping(
   auto evaluate_resource_split =
       [&](std::pair<MachineSpecification, MachineSpecification> const
               &resource_split) {
-        MachineMappingResult left_result = get_optimal_machine_mapping(
-            result_cache, context, lhs, resource_split.first, left_constraints, memory_constraints, config);
+        MachineMappingResult left_result =
+            get_optimal_machine_mapping(result_cache,
+                                        context,
+                                        lhs,
+                                        resource_split.first,
+                                        left_constraints,
+                                        memory_constraints,
+                                        config);
         MachineMappingResult right_result =
             get_optimal_machine_mapping(result_cache,
                                         context,
@@ -233,7 +239,8 @@ MachineMappingResult get_optimal_machine_mapping(
                                         memory_constraints,
                                         config);
 
-        return parallel_combine(config, memory_constraints, left_result, right_result);
+        return parallel_combine(
+            config, memory_constraints, left_result, right_result);
       };
 
   std::unordered_set<MachineMappingResult> parallel_results = transform(
@@ -243,14 +250,14 @@ MachineMappingResult get_optimal_machine_mapping(
                           get_mapping_with_minimal_runtime(parallel_results));
 }
 
-MachineMappingResult
-    get_optimal_machine_mapping(MachineMappingCache &result_cache,
-                                MachineMappingContext const &context,
-                                UnmappedOpCostEstimateKey const &leaf,
-                                MachineSpecification const &resource,
-                                MachineMappingConstraints const &constraints,
-                                MachineMemoryConstraints const &memory_constraints,
-                                MachineMappingConfig const &config) {
+MachineMappingResult get_optimal_machine_mapping(
+    MachineMappingCache &result_cache,
+    MachineMappingContext const &context,
+    UnmappedOpCostEstimateKey const &leaf,
+    MachineSpecification const &resource,
+    MachineMappingConstraints const &constraints,
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingConfig const &config) {
 
   std::unordered_set<MachineView> candidates = [&] {
     std::optional<MachineView> machine_view = require_only_root(constraints);
@@ -266,7 +273,8 @@ MachineMappingResult
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
     CostMetric cost = context.cost_estimator.estimate_cost(mapped);
 
-    return make_singleton_machine_mapping_result(config, memory_constraints, cost, machine_view);
+    return make_singleton_machine_mapping_result(
+        config, memory_constraints, cost, machine_view);
   };
 
   std::unordered_set<MachineMappingResult> candidate_results =
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
index 18e5049022..fc9f747743 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -75,7 +75,8 @@ MachineMappingResult
   };
 
   if (config.enable_memory_optimization) {
-    return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+    return machine_mapping_memory_check(memory_constraints,
+                                        result_without_memory_check);
   } else {
     return result_without_memory_check;
   }
@@ -111,7 +112,8 @@ MachineMappingResult
   };
 
   if (config.enable_memory_optimization) {
-    return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+    return machine_mapping_memory_check(memory_constraints,
+                                        result_without_memory_check);
   } else {
     return result_without_memory_check;
   }
@@ -155,12 +157,13 @@ MachineMappingResult make_singleton_machine_mapping_result(
       },
   };
 
-  return machine_mapping_memory_check(memory_constraints, result_without_memory_check);
+  return machine_mapping_memory_check(memory_constraints,
+                                      result_without_memory_check);
 }
 
-MachineMappingResult
-    machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints,
-                                 MachineMappingResult const &result) {
+MachineMappingResult machine_mapping_memory_check(
+    MachineMemoryConstraints const &memory_constraints,
+    MachineMappingResult const &result) {
   FeasibleMachineMappingResult feasible_result = ({
     if (is_infeasible(result)) {
       return infeasible_machine_mapping_result();

From ef8c5c2f2d6eafec7fc9ec95da417879368457cf Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 16 Oct 2024 18:18:14 -0400
Subject: [PATCH 03/16] pass existing tests

---
 .../cost_estimator_for_test.cc                |  18 +--
 .../machine_mapping/cost_estimator_for_test.h |  18 +--
 .../get_optimal_machine_mapping.cc            |  82 +++++++++---
 .../machine_mapping/machine_mapping_result.cc | 123 ++++++++++++++----
 4 files changed, 179 insertions(+), 62 deletions(-)

diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
index 9ee596af3e..7607132832 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
@@ -5,23 +5,25 @@
 namespace FlexFlow {
 
 TestCostEstimator::TestCostEstimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
-    std::function<float(TensorSetMovement const &)> const
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost,
+    std::function<CostMetric(TensorSetMovement const &)> const
         &get_communication_cost)
     : get_operator_cost(get_operator_cost),
       get_communication_cost(get_communication_cost) {}
 
-float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+CostMetric TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->get_operator_cost(k);
 }
 
-float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
+CostMetric TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->get_communication_cost(m);
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
-    std::function<float(TensorSetMovement const &)> const
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost,
+    std::function<CostMetric(TensorSetMovement const &)> const
         &get_communication_cost) {
 
   return CostEstimator::create<TestCostEstimator>(get_operator_cost,
@@ -29,8 +31,8 @@ CostEstimator make_fake_cost_estimator(
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, float> const &comm_cost_map) {
+    std::unordered_map<OpCostEstimateKey, CostMetric> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, CostMetric> const &comm_cost_map) {
   return make_fake_cost_estimator(
       [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); },
       [comm_cost_map](TensorSetMovement const &m) {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
index 7c1d06207a..1b2cc9e91e 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
@@ -11,27 +11,27 @@
 namespace FlexFlow {
 
 struct TestCostEstimator : public ICostEstimator {
-  std::function<float(OpCostEstimateKey const &)> get_operator_cost;
-  std::function<float(TensorSetMovement const &)> get_communication_cost;
+  std::function<CostMetric(OpCostEstimateKey const &)> get_operator_cost;
+  std::function<CostMetric(TensorSetMovement const &)> get_communication_cost;
 
   TestCostEstimator() = delete;
   TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost,
                     decltype(get_communication_cost)
                         const &get_communication_cost);
 
-  float estimate_cost(OpCostEstimateKey const &) const override;
-
-  float estimate_cost(TensorSetMovement const &) const override;
+  CostMetric estimate_cost(OpCostEstimateKey const &) const override;
+  CostMetric estimate_cost(TensorSetMovement const &) const override;
 };
 
 CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
-    std::function<float(TensorSetMovement const &)> const
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost,
+    std::function<CostMetric(TensorSetMovement const &)> const
         &get_communication_cost);
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
+    std::unordered_map<OpCostEstimateKey, CostMetric> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, CostMetric> const &comm_cost_map);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 0a874948e4..440e8506c4 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -118,22 +118,22 @@ TEST_SUITE(FF_TEST_SUITE) {
         }};
 
     CostEstimator cost_estimator = make_fake_cost_estimator(
-        std::unordered_map<OpCostEstimateKey, float>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5},
+        std::unordered_map<OpCostEstimateKey, CostMetric>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 1}},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 2}},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 3}},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 3}},
         }},
-        std::unordered_map<TensorSetMovement, float>{{
-            {TensorSetMovement{{}}, 0.0},
+        std::unordered_map<TensorSetMovement, CostMetric>{{
+            {TensorSetMovement{{}}, CostMetric{0.0, 0}},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
-             0.1},
+             CostMetric{0.1, 0}},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
-             0.2},
+             CostMetric{0.2, 0}},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
-             0.3},
+             CostMetric{0.3, 0}},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
-             0.4},
+             CostMetric{0.4, 0}},
         }});
 
     MachineMappingContext context = MachineMappingContext{
@@ -150,11 +150,25 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResult result = get_optimal_machine_mapping(
-          cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
+          /*memory_limit=*/10,
+      };
+
+      MachineMappingConfig config = MachineMappingConfig{
+          /*enable_memory_optimization=*/false,
+      };
+
+      MachineMappingResult result =
+          get_optimal_machine_mapping(cache,
+                                      context,
+                                      problem_tree,
+                                      full_machine_spec,
+                                      constraints,
+                                      memory_constraints,
+                                      config);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*runtime=*/1.0,
+              /*cost=*/CostMetric{1.0, 1},
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv1},
@@ -173,11 +187,25 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResult result = get_optimal_machine_mapping(
-          cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
+          /*memory_limit=*/10,
+      };
+
+      MachineMappingConfig config = MachineMappingConfig{
+          /*enable_memory_optimization=*/false,
+      };
+
+      MachineMappingResult result =
+          get_optimal_machine_mapping(cache,
+                                      context,
+                                      problem_tree,
+                                      full_machine_spec,
+                                      constraints,
+                                      memory_constraints,
+                                      config);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*runtime=*/1.0 + 2.0 + 0.1,
+              /*cost=*/CostMetric{1.0 + 2.0 + 0.1, 1 + 2 + 0},
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -207,11 +235,25 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResult result = get_optimal_machine_mapping(
-          cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
+          /*memory_limit=*/10,
+      };
+
+      MachineMappingConfig config = MachineMappingConfig{
+          /*enable_memory_optimization=*/false,
+      };
+
+      MachineMappingResult result =
+          get_optimal_machine_mapping(cache,
+                                      context,
+                                      problem_tree,
+                                      full_machine_spec,
+                                      constraints,
+                                      memory_constraints,
+                                      config);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*runtime=*/2.5,
+              /*cost=*/CostMetric{2.5, 3},
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
index 254d6b2784..7665f929f2 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -6,10 +6,20 @@ using namespace FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("series_combine") {
+    MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
+        /*memory_limit=*/10,
+    };
+    MachineMappingConfig config = MachineMappingConfig{
+        /*enable_memory_optimization=*/false,
+    };
+
     MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
     MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
 
-    float pre_cost = 2.0;
+    CostMetric pre_cost = CostMetric{
+        /*runtime=*/2.0,
+        /*memory=*/2,
+    };
     MachineMappingResult pre = MachineMappingResult{
         FeasibleMachineMappingResult{
             /*runtime=*/pre_cost,
@@ -31,7 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    float post_cost = 4.0;
+    CostMetric post_cost = CostMetric{
+        /*runtime=*/4.0,
+        /*memory=*/1,
+    };
     MachineMappingResult post = MachineMappingResult{
         FeasibleMachineMappingResult{
             /*runtime=*/post_cost,
@@ -47,19 +60,32 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult infeasible = infeasible_machine_mapping_result();
 
-    float comm_cost = 3.0;
+    CostMetric comm_cost = CostMetric{
+        /*runtime=*/3.0,
+        /*memory=*/0,
+    };
 
     SUBCASE("pre is infeasbile") {
-      MachineMappingResult result = series_combine(
-          comm_cost, infeasible, post, ParallelSplitTransformation::LthenR);
+      MachineMappingResult result =
+          series_combine(config,
+                         memory_constraints,
+                         comm_cost,
+                         infeasible,
+                         post,
+                         ParallelSplitTransformation::LthenR);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("post is infeasbile") {
-      MachineMappingResult result = series_combine(
-          comm_cost, pre, infeasible, ParallelSplitTransformation::LthenR);
+      MachineMappingResult result =
+          series_combine(config,
+                         memory_constraints,
+                         comm_cost,
+                         pre,
+                         infeasible,
+                         ParallelSplitTransformation::LthenR);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
@@ -67,7 +93,9 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("both are infeasible") {
       MachineMappingResult result =
-          series_combine(comm_cost,
+          series_combine(config,
+                         memory_constraints,
+                         comm_cost,
                          infeasible,
                          infeasible,
                          ParallelSplitTransformation::LthenR);
@@ -77,9 +105,13 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
 
     SUBCASE("both are feasible") {
+      CostMetric no_parallel_split_transform_cost = CostMetric{
+          /*runtime=*/pre_cost.runtime + post_cost.runtime + comm_cost.runtime,
+          /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory,
+      };
       MachineMappingResult no_parallel_split_transform = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*runtime=*/pre_cost + comm_cost + post_cost,
+              /*cost=*/no_parallel_split_transform_cost,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -107,27 +139,42 @@ TEST_SUITE(FF_TEST_SUITE) {
       };
 
       SUBCASE("parallel_split_transformation = std::nullopt") {
-        MachineMappingResult result =
-            series_combine(comm_cost, pre, post, std::nullopt);
+        MachineMappingResult result = series_combine(
+            config, memory_constraints, comm_cost, pre, post, std::nullopt);
         MachineMappingResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = LthenR") {
-        MachineMappingResult result = series_combine(
-            comm_cost, pre, post, ParallelSplitTransformation::LthenR);
+        MachineMappingResult result =
+            series_combine(config,
+                           memory_constraints,
+                           comm_cost,
+                           pre,
+                           post,
+                           ParallelSplitTransformation::LthenR);
         MachineMappingResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = RthenL") {
-        MachineMappingResult result = series_combine(
-            comm_cost, pre, post, ParallelSplitTransformation::RthenL);
+        MachineMappingResult result =
+            series_combine(config,
+                           memory_constraints,
+                           comm_cost,
+                           pre,
+                           post,
+                           ParallelSplitTransformation::RthenL);
+        CostMetric correct_cost = CostMetric{
+            /*runtime=*/pre_cost.runtime + post_cost.runtime +
+                comm_cost.runtime,
+            /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory,
+        };
         MachineMappingResult correct = MachineMappingResult{
             FeasibleMachineMappingResult{
-                /*runtime=*/pre_cost + comm_cost + post_cost,
+                /*runtime=*/correct_cost,
                 /*machine_mapping=*/
                 ParallelLayerGuidObliviousMachineMapping{{
                     {
@@ -160,12 +207,29 @@ TEST_SUITE(FF_TEST_SUITE) {
   }
 
   TEST_CASE("parallel_combine") {
+    MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
+        /*memory_limit=*/10,
+    };
+    MachineMappingConfig config = MachineMappingConfig{
+        /*enable_memory_optimization=*/false,
+    };
+
     MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
     MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
 
+    CostMetric lhs_cost = CostMetric{
+        /*runtime=*/2.0,
+        /*memory=*/2,
+    };
+
+    CostMetric rhs_cost = CostMetric{
+        /*runtime=*/4.0,
+        /*memory=*/1,
+    };
+
     MachineMappingResult lhs = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*runtime=*/2.0,
+            /*cost=*/lhs_cost,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -186,7 +250,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult rhs = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*runtime=*/4.0,
+            /*cost=*/rhs_cost,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -200,31 +264,40 @@ TEST_SUITE(FF_TEST_SUITE) {
     MachineMappingResult infeasible = infeasible_machine_mapping_result();
 
     SUBCASE("lhs is infeasbile") {
-      MachineMappingResult result = parallel_combine(infeasible, rhs);
+      MachineMappingResult result =
+          parallel_combine(config, memory_constraints, infeasible, rhs);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("rhs is infeasbile") {
-      MachineMappingResult result = parallel_combine(lhs, infeasible);
+      MachineMappingResult result =
+          parallel_combine(config, memory_constraints, lhs, infeasible);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are infeasible") {
-      MachineMappingResult result = parallel_combine(infeasible, infeasible);
+      MachineMappingResult result =
+          parallel_combine(config, memory_constraints, infeasible, infeasible);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are feasible") {
-      MachineMappingResult result = parallel_combine(lhs, rhs);
+      MachineMappingResult result =
+          parallel_combine(config, memory_constraints, lhs, rhs);
+
+      CostMetric correct_cost = CostMetric{
+          /*runtime=*/4.0,
+          /*memory=*/2,
+      };
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*runtime=*/4.0,
+              /*cost=*/correct_cost,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -261,7 +334,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult faster = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*runtime=*/2.0,
+            /*cost=*/CostMetric{2.0, 2},
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -282,7 +355,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult slower = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*runtime=*/4.0,
+            /*cost=*/CostMetric{4.0, 1},
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {

From 982f1f5a711c0d7e6708f2e574257ef02645fea6 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 30 Oct 2024 20:15:31 -0400
Subject: [PATCH 04/16] initialize memory algorithm

---
 .../compiler/cost_estimator/cost_estimator.h  |  11 +-
 ...easible_machine_mapping_result.struct.toml |   5 +-
 .../get_optimal_machine_mapping.h             |  55 ++--
 .../machine_mapping/machine_mapping_result.h  |  22 +-
 .../machine_mapping_state.struct.toml         |  10 -
 .../get_optimal_machine_mapping_with_memory.h |  49 ++++
 .../machine_mapping_cache_with_memory.h       |  19 ++
 ...hine_mapping_cache_with_memory.struct.toml |  22 ++
 .../machine_mapping_result_with_memory.h      |  40 +++
 ...ine_mapping_result_with_memory.struct.toml |  20 ++
 .../machine_memory_constraints.struct.toml    |   0
 .../single_machine_mapping.struct.toml        |  20 ++
 ...lel_layer_guid_oblivious_machine_mapping.h |   1 +
 .../compiler/cost_estimator/cost_estimator.cc |   9 +-
 .../get_optimal_machine_mapping.cc            | 100 +++----
 .../machine_mapping/machine_mapping.cc        |   1 -
 .../machine_mapping/machine_mapping_result.cc |  67 +----
 ...get_optimal_machine_mapping_with_memory.cc | 264 ++++++++++++++++++
 .../machine_mapping_cache_with_memory.cc      |  32 +++
 .../machine_mapping_result_with_memory.cc     | 134 +++++++++
 .../test/src/allowed_machine_views.cc         | 104 +++++++
 .../cost_estimator_for_test.cc                |  18 +-
 .../machine_mapping/cost_estimator_for_test.h |  18 +-
 .../get_optimal_machine_mapping.cc            | 113 ++++----
 .../get_tensor_set_movement_across_split.cc   |  63 ++++-
 .../machine_mapping/machine_mapping.cc        |  84 +++++-
 .../machine_mapping/machine_mapping_result.cc | 216 +++++++-------
 27 files changed, 1102 insertions(+), 395 deletions(-)
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml
 rename lib/compiler/include/compiler/machine_mapping/{machine_memory_constraints => memory_optimization}/machine_memory_constraints.struct.toml (100%)
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
 create mode 100644 lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
 create mode 100644 lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc
 create mode 100644 lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
 create mode 100644 lib/compiler/test/src/allowed_machine_views.cc

diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
index 55311af83b..828200cc6a 100644
--- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
+++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
@@ -12,8 +12,10 @@
 namespace FlexFlow {
 
 struct ICostEstimator {
-  virtual CostMetric estimate_cost(OpCostEstimateKey const &) const = 0;
-  virtual CostMetric estimate_cost(TensorSetMovement const &) const = 0;
+  virtual float estimate_cost(OpCostEstimateKey const &) const = 0;
+  virtual float estimate_cost(TensorSetMovement const &) const = 0;
+  virtual CostMetric
+      estimate_cost_with_memory(OpCostEstimateKey const &) const = 0;
 
   ICostEstimator() = default;
   ICostEstimator(ICostEstimator const &) = delete;
@@ -24,8 +26,9 @@ struct ICostEstimator {
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator);
 
 struct CostEstimator {
-  CostMetric estimate_cost(OpCostEstimateKey const &k) const;
-  CostMetric estimate_cost(TensorSetMovement const &m) const;
+  float estimate_cost(OpCostEstimateKey const &k) const;
+  float estimate_cost(TensorSetMovement const &m) const;
+  CostMetric estimate_cost_with_memory(OpCostEstimateKey const &k) const;
 
   template <typename T, typename... Args>
   static typename std::enable_if<std::is_base_of<ICostEstimator, T>::value,
diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
index 07dc30d2fc..e71cfc540f 100644
--- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml
@@ -8,12 +8,11 @@ features = [
 
 includes = [
   "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h",
-  "compiler/cost_estimator/cost_metric.dtg.h",
 ]
 
 [[fields]]
-name = "cost"
-type = "::FlexFlow::CostMetric"
+name = "runtime"
+type = "float"
 
 [[fields]]
 name = "machine_mapping"
diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
index e8b3771430..f69e6ab91b 100644
--- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
@@ -8,49 +8,40 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
-#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 
 namespace FlexFlow {
 
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &context,
-    MachineMappingProblemTree const &problem_tree,
-    MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config);
-
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &context,
-    MMProblemTreeSeriesSplit const &series_split,
-    MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    std::optional<ParallelSplitTransformation> const
-        &parallel_split_transformation,
-    MachineMappingConfig const &config);
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &context,
+                                MachineMappingProblemTree const &problem_tree,
+                                MachineSpecification const &resources,
+                                MachineMappingConstraints const &constraints);
+
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &context,
+                                MMProblemTreeSeriesSplit const &series_split,
+                                MachineSpecification const &resources,
+                                MachineMappingConstraints const &constraints,
+                                std::optional<ParallelSplitTransformation> const
+                                    &parallel_split_transformation);
 
 MachineMappingResult get_optimal_machine_mapping(
     MachineMappingCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config);
-
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &,
-    UnmappedOpCostEstimateKey const &leaf,
-    MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config);
+    MachineMappingConstraints const &constraints);
+
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &,
+                                UnmappedOpCostEstimateKey const &leaf,
+                                MachineSpecification const &resources,
+                                MachineMappingConstraints const &constraints);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index c240d68f2b..b21fea5f24 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -1,9 +1,7 @@
 #ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H
 
-#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_result.dtg.h"
-#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 
 namespace FlexFlow {
@@ -16,32 +14,22 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     std::unordered_set<MachineMappingResult> const &);
 
 [[nodiscard]] MachineMappingResult
-    series_combine(MachineMappingConfig const &config,
-                   MachineMemoryConstraints const &memory_constraints,
-                   CostMetric const &comm_cost,
+    series_combine(float comm_cost,
                    MachineMappingResult const &pre_result,
                    MachineMappingResult const &post_result,
                    std::optional<ParallelSplitTransformation> const
                        &parallel_split_transformation);
 [[nodiscard]] MachineMappingResult
-    parallel_combine(MachineMappingConfig const &config,
-                     MachineMemoryConstraints const &memory_constraints,
-                     MachineMappingResult const &lhs_result,
+    parallel_combine(MachineMappingResult const &lhs_result,
                      MachineMappingResult const &rhs_result);
 
 [[nodiscard]] MachineMappingResult
     minimize_runtime(MachineMappingResult const &m1,
                      MachineMappingResult const &m2);
 
-[[nodiscard]] MachineMappingResult make_singleton_machine_mapping_result(
-    MachineMappingConfig const &config,
-    MachineMemoryConstraints const &memory_constraints,
-    CostMetric const &cost,
-    MachineView const &machine_view);
-
-[[nodiscard]] MachineMappingResult machine_mapping_memory_check(
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingResult const &result);
+[[nodiscard]] MachineMappingResult
+    make_singleton_machine_mapping_result(float runtime,
+                                          MachineView const &machine_view);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
index b4a6147b5a..1346f6ebe7 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml
@@ -9,9 +9,7 @@ features = [
 includes = [
   "pcg/machine_specification.dtg.h",
   "compiler/machine_mapping/machine_mapping_constraints.dtg.h",
-  "compiler/machine_mapping/machine_mapping_config.dtg.h",
   "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h",
-  "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h",
 ]
 
 [[fields]]
@@ -25,11 +23,3 @@ type = "::FlexFlow::MachineSpecification"
 [[fields]]
 name = "constraints"
 type = "::FlexFlow::MachineMappingConstraints"
-
-[[fields]]
-name = "memory_constraints"
-type = "::FlexFlow::MachineMemoryConstraints"
-
-[[fields]]
-name = "config"
-type = "::FlexFlow::MachineMappingConfig"
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
new file mode 100644
index 0000000000..f8a2e4d75a
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
@@ -0,0 +1,49 @@
+#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H
+#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H
+
+#include "compiler/machine_mapping/machine_mapping_cache.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_context.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h"
+#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+
+namespace FlexFlow {
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MachineMappingProblemTree const &problem_tree,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints);
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeSeriesSplit const &series_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    std::optional<ParallelSplitTransformation> const
+        &parallel_split_transformation);
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeParallelSplit const &parallel_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints);
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &,
+    UnmappedOpCostEstimateKey const &leaf,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h
new file mode 100644
index 0000000000..2c45c04d3d
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H
+
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h"
+
+namespace FlexFlow {
+
+MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory();
+std::optional<MachineMappingResultWithMemory>
+    machine_mapping_cache_with_memory_load(
+        MachineMappingCacheWithMemory const &, MachineMappingState const &);
+void machine_mapping_cache_with_memory_save(
+    MachineMappingCacheWithMemory &,
+    MachineMappingState const &,
+    MachineMappingResultWithMemory const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml
new file mode 100644
index 0000000000..e7afa26bb3
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml
@@ -0,0 +1,22 @@
+namespace = "FlexFlow"
+name = "MachineMappingCacheWithMemory"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "<unordered_map>",
+  "compiler/machine_mapping/machine_mapping_state.dtg.h",
+  "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/unordered_map.h",
+  "utils/hash/unordered_map.h",
+]
+
+[[fields]]
+name = "raw_map"
+type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingResultWithMemory>"
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
new file mode 100644
index 0000000000..6203b99e55
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
@@ -0,0 +1,40 @@
+#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
+#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
+
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h"
+#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
+
+namespace FlexFlow {
+
+[[nodiscard]] MachineMappingResultWithMemory
+    empty_machine_mapping_result_with_memory();
+[[nodiscard]] bool is_empty(MachineMappingResultWithMemory const &);
+
+[[nodiscard]] MachineMappingResultWithMemory get_mapping_with_minimal_runtime(
+    std::unordered_set<MachineMappingResultWithMemory> const &);
+
+[[nodiscard]] MachineMappingResultWithMemory
+    remove_non_dominating_machine_mapping_result(
+        MachineMappingResultWithMemory const &);
+
+[[nodiscard]] MachineMappingResultWithMemory
+    series_combine(float comm_cost,
+                   MachineMappingResultWithMemory const &pre_result,
+                   MachineMappingResultWithMemory const &post_result,
+                   std::optional<ParallelSplitTransformation> const
+                       &parallel_split_transformation);
+[[nodiscard]] MachineMappingResultWithMemory
+    parallel_combine(MachineMappingResultWithMemory const &lhs_result,
+                     MachineMappingResultWithMemory const &rhs_result);
+
+[[nodiscard]] MachineMappingResultWithMemory
+    minimize_runtime(MachineMappingResultWithMemory const &m1,
+                     MachineMappingResultWithMemory const &m2);
+
+[[nodiscard]] MachineMappingResultWithMemory
+    make_singleton_machine_mapping_result_with_memory(
+        CostMetric cost, MachineView const &machine_view);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml
new file mode 100644
index 0000000000..f3b2895b83
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "MachineMappingResultWithMemory"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/machine_mapping/memory_optimization/single_machine_mapping.dtg.h",
+]
+
+src_includes = [
+  "utils/hash/unordered_set.h",
+  "utils/fmt/unordered_set.h",
+]
+
+[[fields]]
+name = "machine_mappings"
+type = "std::unordered_set<::FlexFlow::SingleMachineMapping>"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml
similarity index 100%
rename from lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml
rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
new file mode 100644
index 0000000000..05a23e905a
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "SingleMachineMapping"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h",
+  "compiler/cost_estimator/cost_metric.dtg.h",
+]
+
+[[fields]]
+name = "cost"
+type = "::FlexFlow::CostMetric"
+
+[[fields]]
+name = "machine_mapping"
+type = "::FlexFlow::ParallelLayerGuidObliviousMachineMapping"
diff --git a/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h
index accd96af4c..cb3af9c689 100644
--- a/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_PARALLEL_LAYER_GUID_OBLIVIOUS_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
index 10e999dc1a..40a0f4e2a4 100644
--- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
+++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
@@ -5,12 +5,17 @@ namespace FlexFlow {
 CostEstimator::CostEstimator(std::shared_ptr<ICostEstimator> implementation_ptr)
     : implementation_ptr(implementation_ptr) {}
 
-CostMetric CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->implementation_ptr->estimate_cost(k);
 }
 
-CostMetric CostEstimator::estimate_cost(TensorSetMovement const &m) const {
+float CostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->implementation_ptr->estimate_cost(m);
 }
 
+CostMetric
+    CostEstimator::estimate_cost_with_memory(OpCostEstimateKey const &k) const {
+  return this->implementation_ptr->estimate_cost_with_memory(k);
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 3321d53e98..10abd7ff90 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -24,21 +24,17 @@
 
 namespace FlexFlow {
 
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &context,
-    MachineMappingProblemTree const &problem_tree,
-    MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config) {
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &context,
+                                MachineMappingProblemTree const &problem_tree,
+                                MachineSpecification const &resources,
+                                MachineMappingConstraints const &constraints) {
 
   MachineMappingState state = MachineMappingState{
       problem_tree,
       resources,
       constraints,
-      memory_constraints,
-      config,
   };
 
   {
@@ -58,18 +54,14 @@ MachineMappingResult get_optimal_machine_mapping(
                 series_split,
                 resources,
                 constraints,
-                memory_constraints,
-                /*parallel_split_transformation=*/std::nullopt,
-                config);
+                /*parallel_split_transformation=*/std::nullopt);
           },
           [&](auto const &decomp_tree_node) {
             return get_optimal_machine_mapping(result_cache,
                                                context,
                                                decomp_tree_node,
                                                resources,
-                                               constraints,
-                                               memory_constraints,
-                                               config);
+                                               constraints);
           },
       });
 
@@ -77,16 +69,14 @@ MachineMappingResult get_optimal_machine_mapping(
   return result;
 }
 
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &context,
-    MMProblemTreeSeriesSplit const &series_split,
-    MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    std::optional<ParallelSplitTransformation> const
-        &parallel_split_transformation,
-    MachineMappingConfig const &config) {
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &context,
+                                MMProblemTreeSeriesSplit const &series_split,
+                                MachineSpecification const &resources,
+                                MachineMappingConstraints const &constraints,
+                                std::optional<ParallelSplitTransformation> const
+                                    &parallel_split_transformation) {
 
   auto get_boundary_machine_view_assignments =
       [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
@@ -120,9 +110,7 @@ MachineMappingResult get_optimal_machine_mapping(
                                         context,
                                         series_split.get_left_child(),
                                         resources,
-                                        pre_candidate,
-                                        memory_constraints,
-                                        config);
+                                        pre_candidate);
 
         return pre_result;
       };
@@ -138,9 +126,7 @@ MachineMappingResult get_optimal_machine_mapping(
                                         context,
                                         series_split.get_right_child(),
                                         resources,
-                                        post_candidate,
-                                        memory_constraints,
-                                        config);
+                                        post_candidate);
 
         return post_result;
       };
@@ -169,13 +155,11 @@ MachineMappingResult get_optimal_machine_mapping(
               tensor_movement,
               /*pre_mapping=*/assigned_pre_machine_views,
               /*post_mapping=*/assigned_post_machine_views);
-      CostMetric cost_across_split =
+      float cost_across_split =
           context.cost_estimator.estimate_cost(comm_across_split);
 
       result = minimize_runtime(result,
-                                series_combine(config,
-                                               memory_constraints,
-                                               cost_across_split,
+                                series_combine(cost_across_split,
                                                pre_result,
                                                post_result,
                                                parallel_split_transformation));
@@ -190,9 +174,7 @@ MachineMappingResult get_optimal_machine_mapping(
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config) {
+    MachineMappingConstraints const &constraints) {
 
   MachineMappingProblemTree lhs = parallel_split.get_left_child();
   MachineMappingProblemTree rhs = parallel_split.get_right_child();
@@ -209,9 +191,7 @@ MachineMappingResult get_optimal_machine_mapping(
                                        series_split,
                                        resources,
                                        constraints,
-                                       memory_constraints,
-                                       ParallelSplitTransformation::LthenR,
-                                       config);
+                                       ParallelSplitTransformation::LthenR);
   }();
 
   MachineMappingConstraints left_constraints =
@@ -222,25 +202,16 @@ MachineMappingResult get_optimal_machine_mapping(
   auto evaluate_resource_split =
       [&](std::pair<MachineSpecification, MachineSpecification> const
               &resource_split) {
-        MachineMappingResult left_result =
-            get_optimal_machine_mapping(result_cache,
-                                        context,
-                                        lhs,
-                                        resource_split.first,
-                                        left_constraints,
-                                        memory_constraints,
-                                        config);
+        MachineMappingResult left_result = get_optimal_machine_mapping(
+            result_cache, context, lhs, resource_split.first, left_constraints);
         MachineMappingResult right_result =
             get_optimal_machine_mapping(result_cache,
                                         context,
                                         rhs,
                                         resource_split.second,
-                                        right_constraints,
-                                        memory_constraints,
-                                        config);
+                                        right_constraints);
 
-        return parallel_combine(
-            config, memory_constraints, left_result, right_result);
+        return parallel_combine(left_result, right_result);
       };
 
   std::unordered_set<MachineMappingResult> parallel_results = transform(
@@ -250,14 +221,12 @@ MachineMappingResult get_optimal_machine_mapping(
                           get_mapping_with_minimal_runtime(parallel_results));
 }
 
-MachineMappingResult get_optimal_machine_mapping(
-    MachineMappingCache &result_cache,
-    MachineMappingContext const &context,
-    UnmappedOpCostEstimateKey const &leaf,
-    MachineSpecification const &resource,
-    MachineMappingConstraints const &constraints,
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingConfig const &config) {
+MachineMappingResult
+    get_optimal_machine_mapping(MachineMappingCache &result_cache,
+                                MachineMappingContext const &context,
+                                UnmappedOpCostEstimateKey const &leaf,
+                                MachineSpecification const &resource,
+                                MachineMappingConstraints const &constraints) {
 
   std::unordered_set<MachineView> candidates = [&] {
     std::optional<MachineView> machine_view = require_only_root(constraints);
@@ -271,10 +240,9 @@ MachineMappingResult get_optimal_machine_mapping(
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    CostMetric cost = context.cost_estimator.estimate_cost(mapped);
+    float cost = context.cost_estimator.estimate_cost(mapped);
 
-    return make_singleton_machine_mapping_result(
-        config, memory_constraints, cost, machine_view);
+    return make_singleton_machine_mapping_result(cost, machine_view);
   };
 
   std::unordered_set<MachineMappingResult> candidate_results =
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 6f350d8773..57e82684e9 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,5 +1,4 @@
 #include "compiler/machine_mapping/machine_mapping.h"
-#include "utils/containers.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
 #include "utils/containers/merge_maps.h"
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
index fc9f747743..3409f7f871 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -1,5 +1,4 @@
 #include "compiler/machine_mapping/machine_mapping_result.h"
-#include "compiler/cost_estimator/cost_metric.h"
 #include "compiler/machine_mapping/machine_mapping.h"
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h"
 #include "utils/containers/map_keys.h"
@@ -33,9 +32,7 @@ FeasibleMachineMappingResult
 }
 
 MachineMappingResult
-    series_combine(MachineMappingConfig const &config,
-                   MachineMemoryConstraints const &memory_constraints,
-                   CostMetric const &comm_cost,
+    series_combine(float comm_cost,
                    MachineMappingResult const &maybe_pre_result,
                    MachineMappingResult const &maybe_post_result,
                    std::optional<ParallelSplitTransformation> const
@@ -66,26 +63,16 @@ MachineMappingResult
     }
   }();
 
-  MachineMappingResult result_without_memory_check = MachineMappingResult{
+  return MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*cost=*/combine_cost_metrics_inter_device(
-              {pre_result.cost, comm_cost, post_result.cost}),
+          /*runtime=*/pre_result.runtime + comm_cost + post_result.runtime,
           /*machine_mapping=*/mapping,
       },
   };
-
-  if (config.enable_memory_optimization) {
-    return machine_mapping_memory_check(memory_constraints,
-                                        result_without_memory_check);
-  } else {
-    return result_without_memory_check;
-  }
 }
 
 MachineMappingResult
-    parallel_combine(MachineMappingConfig const &config,
-                     MachineMemoryConstraints const &memory_constraints,
-                     MachineMappingResult const &maybe_lhs_result,
+    parallel_combine(MachineMappingResult const &maybe_lhs_result,
                      MachineMappingResult const &maybe_rhs_result) {
   FeasibleMachineMappingResult lhs_result = ({
     if (is_infeasible(maybe_lhs_result)) {
@@ -101,22 +88,14 @@ MachineMappingResult
     require_feasible(maybe_rhs_result);
   });
 
-  MachineMappingResult result_without_memory_check = MachineMappingResult{
+  return MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*cost=*/combine_cost_metrics_intra_device_parallel(lhs_result.cost,
-                                                              rhs_result.cost),
+          /*runtime=*/std::max(lhs_result.runtime, rhs_result.runtime),
           /*machine_mapping=*/
           binary_combine_mappings(/*lhs=*/lhs_result.machine_mapping,
                                   /*rhs=*/rhs_result.machine_mapping),
       },
   };
-
-  if (config.enable_memory_optimization) {
-    return machine_mapping_memory_check(memory_constraints,
-                                        result_without_memory_check);
-  } else {
-    return result_without_memory_check;
-  }
 }
 
 MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1,
@@ -135,47 +114,25 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1,
     require_feasible(maybe_m2);
   });
 
-  if (m2.cost.runtime < m1.cost.runtime) {
+  if (m2.runtime < m1.runtime) {
     return maybe_m2;
   } else {
     return maybe_m1;
   }
 }
 
-MachineMappingResult make_singleton_machine_mapping_result(
-    MachineMappingConfig const &config,
-    MachineMemoryConstraints const &memory_constraints,
-    CostMetric const &cost,
-    MachineView const &machine_view) {
-  MachineMappingResult result_without_memory_check = MachineMappingResult{
+MachineMappingResult
+    make_singleton_machine_mapping_result(float runtime,
+                                          MachineView const &machine_view) {
+  return MachineMappingResult{
       FeasibleMachineMappingResult{
-          /*cost=*/cost,
+          /*runtime=*/runtime,
           /*machine_mapping=*/
           ParallelLayerGuidObliviousMachineMapping{{
               {binary_tree_root_path(), machine_view},
           }},
       },
   };
-
-  return machine_mapping_memory_check(memory_constraints,
-                                      result_without_memory_check);
-}
-
-MachineMappingResult machine_mapping_memory_check(
-    MachineMemoryConstraints const &memory_constraints,
-    MachineMappingResult const &result) {
-  FeasibleMachineMappingResult feasible_result = ({
-    if (is_infeasible(result)) {
-      return infeasible_machine_mapping_result();
-    }
-    require_feasible(result);
-  });
-
-  if (feasible_result.cost.memory > memory_constraints.memory_limit) {
-    return infeasible_machine_mapping_result();
-  } else {
-    return result;
-  }
 }
 
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
new file mode 100644
index 0000000000..676f3a6c8e
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -0,0 +1,264 @@
+#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h"
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
+#include "compiler/machine_mapping/get_machine_resource_splits.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "compiler/machine_mapping/transitive_reduced_pcg.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/machine_view.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/contains.h"
+#include "utils/containers/flatmap.h"
+#include "utils/containers/generate_map.h"
+#include "utils/containers/get_all_assignments.h"
+#include "utils/containers/unordered_set_of.h"
+#include "utils/exception.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MachineMappingProblemTree const &problem_tree,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints) {
+
+  MachineMappingState state = MachineMappingState{
+      problem_tree,
+      resources,
+      constraints,
+  };
+
+  {
+    std::optional<MachineMappingResultWithMemory> cached_result =
+        machine_mapping_cache_with_memory_load(result_cache, state);
+    if (cached_result) {
+      return cached_result.value();
+    }
+  }
+
+  MachineMappingResultWithMemory result =
+      problem_tree.visit<MachineMappingResultWithMemory>(overload{
+          [&](MMProblemTreeSeriesSplit const &series_split) {
+            return get_optimal_machine_mapping_with_memory(
+                result_cache,
+                context,
+                series_split,
+                resources,
+                constraints,
+                /*parallel_split_transformation=*/std::nullopt);
+          },
+          [&](auto const &decomp_tree_node) {
+            return get_optimal_machine_mapping_with_memory(result_cache,
+                                                           context,
+                                                           decomp_tree_node,
+                                                           resources,
+                                                           constraints);
+          },
+      });
+
+  machine_mapping_cache_with_memory_save(result_cache, state, result);
+  return result;
+}
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeSeriesSplit const &series_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints,
+    std::optional<ParallelSplitTransformation> const
+        &parallel_split_transformation) {
+
+  auto get_boundary_machine_view_assignments =
+      [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
+      -> std::unordered_set<ParallelLayerGuidObliviousMachineMapping> {
+    std::unordered_map<BinaryTreePath, std::unordered_set<MachineView>>
+        allowed = generate_map(
+            boundary_layers,
+            [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> {
+              UnmappedOpCostEstimateKey leaf =
+                  mm_problem_tree_get_subtree_at_path(
+                      MachineMappingProblemTree{series_split}, l)
+                      .value()
+                      .get<UnmappedOpCostEstimateKey>();
+              return context.allowed_machine_views(leaf, resources);
+            });
+    return transform(
+        get_all_assignments(allowed),
+        [](std::unordered_map<BinaryTreePath, MachineView> const &m) {
+          return ParallelLayerGuidObliviousMachineMapping{m};
+        });
+  };
+
+  auto eval_pre_boundary_mapping =
+      [&](ParallelLayerGuidObliviousMachineMapping const
+              &assigned_pre_machine_views) {
+        MachineMappingConstraints pre_candidate = with_additional_constraints(
+            restrict_to_left_child(constraints), assigned_pre_machine_views);
+
+        MachineMappingResultWithMemory pre_result =
+            get_optimal_machine_mapping_with_memory(
+                result_cache,
+                context,
+                series_split.get_left_child(),
+                resources,
+                pre_candidate);
+
+        return pre_result;
+      };
+
+  auto eval_post_boundary_mapping =
+      [&](ParallelLayerGuidObliviousMachineMapping const
+              &assigned_post_machine_views) {
+        MachineMappingConstraints post_candidate = with_additional_constraints(
+            restrict_to_right_child(constraints), assigned_post_machine_views);
+
+        MachineMappingResultWithMemory post_result =
+            get_optimal_machine_mapping_with_memory(
+                result_cache,
+                context,
+                series_split.get_right_child(),
+                resources,
+                post_candidate);
+
+        return post_result;
+      };
+
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+  AbstractedTensorSetMovement tensor_movement =
+      series_split.tensor_set_movement;
+
+  for (ParallelLayerGuidObliviousMachineMapping const
+           &assigned_pre_machine_views :
+       get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) {
+
+    MachineMappingResultWithMemory pre_result =
+        eval_pre_boundary_mapping(assigned_pre_machine_views);
+
+    for (ParallelLayerGuidObliviousMachineMapping const
+             &assigned_post_machine_views :
+         get_boundary_machine_view_assignments(
+             get_dst_layers(tensor_movement))) {
+
+      MachineMappingResultWithMemory post_result =
+          eval_post_boundary_mapping(assigned_post_machine_views);
+
+      TensorSetMovement comm_across_split =
+          concretize_abstracted_tensor_set_movement(
+              tensor_movement,
+              /*pre_mapping=*/assigned_pre_machine_views,
+              /*post_mapping=*/assigned_post_machine_views);
+      float cost_across_split =
+          context.cost_estimator.estimate_cost(comm_across_split);
+
+      result = minimize_runtime(result,
+                                series_combine(cost_across_split,
+                                               pre_result,
+                                               post_result,
+                                               parallel_split_transformation));
+    }
+  }
+
+  return result;
+}
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    MMProblemTreeParallelSplit const &parallel_split,
+    MachineSpecification const &resources,
+    MachineMappingConstraints const &constraints) {
+
+  MachineMappingProblemTree lhs = parallel_split.get_left_child();
+  MachineMappingProblemTree rhs = parallel_split.get_right_child();
+
+  MachineMappingResultWithMemory series_result = [&] {
+    MMProblemTreeSeriesSplit series_split = MMProblemTreeSeriesSplit{
+        /*tensor_set_movement=*/empty_abstracted_tensor_set_movement(),
+        /*left_child=*/lhs,
+        /*right_child=*/rhs,
+    };
+
+    return get_optimal_machine_mapping_with_memory(
+        result_cache,
+        context,
+        series_split,
+        resources,
+        constraints,
+        ParallelSplitTransformation::LthenR);
+  }();
+
+  MachineMappingConstraints left_constraints =
+      restrict_to_left_child(constraints);
+  MachineMappingConstraints right_constraints =
+      restrict_to_right_child(constraints);
+
+  auto evaluate_resource_split =
+      [&](std::pair<MachineSpecification, MachineSpecification> const
+              &resource_split) {
+        MachineMappingResultWithMemory left_result =
+            get_optimal_machine_mapping_with_memory(result_cache,
+                                                    context,
+                                                    lhs,
+                                                    resource_split.first,
+                                                    left_constraints);
+        MachineMappingResultWithMemory right_result =
+            get_optimal_machine_mapping_with_memory(result_cache,
+                                                    context,
+                                                    rhs,
+                                                    resource_split.second,
+                                                    right_constraints);
+
+        return parallel_combine(left_result, right_result);
+      };
+
+  std::unordered_set<MachineMappingResultWithMemory> parallel_results =
+      transform(get_machine_resource_splits(resources),
+                evaluate_resource_split);
+
+  return minimize_runtime(series_result,
+                          get_mapping_with_minimal_runtime(parallel_results));
+}
+
+MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
+    MachineMappingCacheWithMemory &result_cache,
+    MachineMappingContext const &context,
+    UnmappedOpCostEstimateKey const &leaf,
+    MachineSpecification const &resource,
+    MachineMappingConstraints const &constraints) {
+
+  std::unordered_set<MachineView> candidates = [&] {
+    std::optional<MachineView> machine_view = require_only_root(constraints);
+    if (machine_view.has_value()) {
+      return std::unordered_set{machine_view.value()};
+    } else {
+      return context.allowed_machine_views(leaf, resource);
+    }
+  }();
+
+  auto get_mapping_result = [&](MachineView const &machine_view) {
+    OpCostEstimateKey mapped =
+        map_unmapped_op_cost_estimate_key(leaf, machine_view);
+    CostMetric cost = context.cost_estimator.estimate_cost_with_memory(mapped);
+
+    return make_singleton_machine_mapping_result_with_memory(cost,
+                                                             machine_view);
+  };
+
+  std::unordered_set<MachineMappingResultWithMemory> candidate_results =
+      transform(candidates, get_mapping_result);
+
+  return get_mapping_with_minimal_runtime(candidate_results);
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc
new file mode 100644
index 0000000000..e74612250e
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc
@@ -0,0 +1,32 @@
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/try_at.h"
+
+namespace FlexFlow {
+
+MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory() {
+  return MachineMappingCacheWithMemory{{}};
+}
+
+std::optional<MachineMappingResultWithMemory>
+    machine_mapping_cache_with_memory_load(
+        MachineMappingCacheWithMemory const &cache,
+        MachineMappingState const &k) {
+  return try_at(cache.raw_map, k);
+}
+
+void machine_mapping_cache_with_memory_save(
+    MachineMappingCacheWithMemory &cache,
+    MachineMappingState const &k,
+    MachineMappingResultWithMemory const &v) {
+  if (contains_key(cache.raw_map, k)) {
+    throw mk_runtime_error(fmt::format(
+        "machine_mapping_cache_with_memory_save expected key to not already "
+        "exist, but received existing key {}",
+        k));
+  }
+
+  cache.raw_map.emplace(k, v);
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
new file mode 100644
index 0000000000..1c4f8e1142
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -0,0 +1,134 @@
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h"
+#include "utils/containers/set_union.h"
+#include "utils/full_binary_tree/binary_tree_path.h"
+
+namespace FlexFlow {
+
+MachineMappingResultWithMemory empty_machine_mapping_result_with_memory() {
+  return MachineMappingResultWithMemory{
+      {},
+  };
+}
+
+MachineMappingResultWithMemory get_mapping_with_minimal_runtime(
+    std::unordered_set<MachineMappingResultWithMemory> const &candidates) {
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+
+  for (MachineMappingResultWithMemory const &candidate : candidates) {
+    result = minimize_runtime(result, candidate);
+  }
+
+  return result;
+}
+
+MachineMappingResultWithMemory remove_non_dominating_machine_mapping_result(
+    MachineMappingResultWithMemory const &result) {
+  std::unordered_set<SingleMachineMapping> non_dominating_mappings;
+  for (SingleMachineMapping const &mapping : result.machine_mappings) {
+    bool is_dominating = true;
+    for (SingleMachineMapping const &other_mapping : result.machine_mappings) {
+      if (mapping.cost.runtime >= other_mapping.cost.runtime &&
+          mapping.cost.memory >= other_mapping.cost.memory &&
+          mapping != other_mapping) {
+        is_dominating = false;
+        break;
+      }
+    }
+    if (is_dominating) {
+      non_dominating_mappings.insert(mapping);
+    }
+  }
+  return MachineMappingResultWithMemory{std::move(non_dominating_mappings)};
+}
+
+MachineMappingResultWithMemory
+    series_combine(float comm_cost,
+                   MachineMappingResultWithMemory const &pre_result,
+                   MachineMappingResultWithMemory const &post_result,
+                   std::optional<ParallelSplitTransformation> const
+                       &parallel_split_transformation) {
+  auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm,
+                                     SingleMachineMapping const &post_mm) {
+    CostMetric cost = CostMetric{
+        pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
+        pre_mm.cost.memory + post_mm.cost.memory,
+    };
+
+    ParallelLayerGuidObliviousMachineMapping mapping = [&] {
+      if (parallel_split_transformation.has_value() &&
+          parallel_split_transformation.value() ==
+              ParallelSplitTransformation::RthenL) {
+        return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping,
+                                       /*rhs=*/pre_mm.machine_mapping);
+      } else {
+        return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping,
+                                       /*rhs=*/post_mm.machine_mapping);
+      }
+    }();
+
+    return SingleMachineMapping{cost, mapping};
+  };
+
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+  for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) {
+    for (SingleMachineMapping const &post_mm : post_result.machine_mappings) {
+      result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm));
+    }
+  }
+
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+MachineMappingResultWithMemory
+    parallel_combine(MachineMappingResultWithMemory const &lhs_result,
+                     MachineMappingResultWithMemory const &rhs_result) {
+  auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm,
+                                     SingleMachineMapping const &rhs_mm) {
+    CostMetric cost = CostMetric{
+        std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
+        std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
+    };
+
+    ParallelLayerGuidObliviousMachineMapping mapping =
+        binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping);
+
+    return SingleMachineMapping{cost, mapping};
+  };
+
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+  for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) {
+    for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) {
+      result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm));
+    }
+  }
+
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+MachineMappingResultWithMemory
+    minimize_runtime(MachineMappingResultWithMemory const &m1,
+                     MachineMappingResultWithMemory const &m2) {
+  MachineMappingResultWithMemory result = MachineMappingResultWithMemory{
+      set_union(m1.machine_mappings, m2.machine_mappings),
+  };
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+MachineMappingResultWithMemory
+    make_singleton_machine_mapping_result_with_memory(
+        CostMetric cost, MachineView const &machine_view) {
+  return MachineMappingResultWithMemory{{
+      SingleMachineMapping{
+          cost,
+          ParallelLayerGuidObliviousMachineMapping{{
+              {binary_tree_root_path(), machine_view},
+          }},
+      },
+  }};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc
new file mode 100644
index 0000000000..936894ad2d
--- /dev/null
+++ b/lib/compiler/test/src/allowed_machine_views.cc
@@ -0,0 +1,104 @@
+#include "compiler/allowed_machine_views.h"
+#include "doctest/doctest.h"
+#include "utils/containers/extend.h"
+#include "utils/containers/range.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/unordered_set_of.h"
+#include "utils/containers/zip.h"
+#include "utils/fmt/unordered_set.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("get_allowed_machine_views") {
+
+    SUBCASE("1 degree of parallelism") {
+      MachineSpecification ms = MachineSpecification{
+          /*num_nodes=*/1,
+          /*num_cpus_per_node=*/5,
+          /*num_gpus_per_node=*/5,
+          /*inter_node_bandwidth=*/0,
+          /*intra_node_bandwidth=*/0,
+      };
+
+      OperatorTaskSpace task = OperatorTaskSpace{{3}};
+
+      std::unordered_set<MachineView> correct = {
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1},
+                                    MachineSpecificationDimension::INTRA_NODE}},
+          },
+
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1},
+                                    MachineSpecificationDimension::INTRA_NODE}},
+          },
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU},
+              {MachineViewDimension{stride_t{1},
+                                    MachineSpecificationDimension::INTRA_NODE}},
+          },
+          MachineView{
+              MachineSpaceCoordinate{
+                  /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU},
+              {MachineViewDimension{stride_t{2},
+                                    MachineSpecificationDimension::INTRA_NODE}},
+          },
+      };
+
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, task, DeviceType::GPU);
+
+      CHECK(correct == result);
+    }
+
+    SUBCASE("2 degrees of parallelism") {
+
+      MachineSpecification ms = MachineSpecification{
+          /*num_nodes=*/3,
+          /*num_cpus_per_node=*/3,
+          /*num_gpus_per_node=*/3,
+          /*inter_node_bandwidth=*/0,
+          /*intra_node_bandwidth=*/0,
+      };
+      OperatorTaskSpace task = OperatorTaskSpace{{2, 3}};
+
+      auto make_2d_view = [&](int start_node_idx,
+                              int start_device_idx,
+                              int stride1,
+                              int stride2,
+                              MachineSpecificationDimension m1,
+                              MachineSpecificationDimension m2) {
+        return MachineView{
+            MachineSpaceCoordinate{
+                start_node_idx, start_device_idx, DeviceType::GPU},
+            {MachineViewDimension{stride_t{stride1}, m1},
+             MachineViewDimension{stride_t{stride2}, m2}},
+        };
+      };
+
+      auto intra = MachineSpecificationDimension::INTRA_NODE;
+      auto inter = MachineSpecificationDimension::INTER_NODE;
+      std::unordered_set<MachineView> correct = {
+          make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra),
+          make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra),
+          make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra),
+
+          make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter),
+          make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter),
+          make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter),
+      };
+
+      std::unordered_set<MachineView> result =
+          get_allowed_machine_views(ms, task, DeviceType::GPU);
+
+      CHECK(correct == result);
+    }
+  }
+}
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
index 7607132832..9ee596af3e 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
@@ -5,25 +5,23 @@
 namespace FlexFlow {
 
 TestCostEstimator::TestCostEstimator(
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost,
-    std::function<CostMetric(TensorSetMovement const &)> const
+    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<float(TensorSetMovement const &)> const
         &get_communication_cost)
     : get_operator_cost(get_operator_cost),
       get_communication_cost(get_communication_cost) {}
 
-CostMetric TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->get_operator_cost(k);
 }
 
-CostMetric TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
+float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->get_communication_cost(m);
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost,
-    std::function<CostMetric(TensorSetMovement const &)> const
+    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<float(TensorSetMovement const &)> const
         &get_communication_cost) {
 
   return CostEstimator::create<TestCostEstimator>(get_operator_cost,
@@ -31,8 +29,8 @@ CostEstimator make_fake_cost_estimator(
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, CostMetric> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, CostMetric> const &comm_cost_map) {
+    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, float> const &comm_cost_map) {
   return make_fake_cost_estimator(
       [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); },
       [comm_cost_map](TensorSetMovement const &m) {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
index 1b2cc9e91e..7c1d06207a 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
@@ -11,27 +11,27 @@
 namespace FlexFlow {
 
 struct TestCostEstimator : public ICostEstimator {
-  std::function<CostMetric(OpCostEstimateKey const &)> get_operator_cost;
-  std::function<CostMetric(TensorSetMovement const &)> get_communication_cost;
+  std::function<float(OpCostEstimateKey const &)> get_operator_cost;
+  std::function<float(TensorSetMovement const &)> get_communication_cost;
 
   TestCostEstimator() = delete;
   TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost,
                     decltype(get_communication_cost)
                         const &get_communication_cost);
 
-  CostMetric estimate_cost(OpCostEstimateKey const &) const override;
-  CostMetric estimate_cost(TensorSetMovement const &) const override;
+  float estimate_cost(OpCostEstimateKey const &) const override;
+
+  float estimate_cost(TensorSetMovement const &) const override;
 };
 
 CostEstimator make_fake_cost_estimator(
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost,
-    std::function<CostMetric(TensorSetMovement const &)> const
+    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<float(TensorSetMovement const &)> const
         &get_communication_cost);
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, CostMetric> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, CostMetric> const &comm_cost_map);
+    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 440e8506c4..a0d06fe930 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -42,8 +42,35 @@ TEST_SUITE(FF_TEST_SUITE) {
       };
     };
 
-    MachineView mv1 = make_1d_machine_view(gpu_id_t(1), gpu_id_t(2));
-    MachineView mv2 = make_1d_machine_view(gpu_id_t(1), gpu_id_t(3));
+    MachineView mv1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView mv2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
 
     MachineSpecification full_machine_spec = MachineSpecification{
         /*num_nodes=*/2,
@@ -118,22 +145,22 @@ TEST_SUITE(FF_TEST_SUITE) {
         }};
 
     CostEstimator cost_estimator = make_fake_cost_estimator(
-        std::unordered_map<OpCostEstimateKey, CostMetric>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 1}},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 2}},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 3}},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 3}},
+        std::unordered_map<OpCostEstimateKey, float>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5},
         }},
-        std::unordered_map<TensorSetMovement, CostMetric>{{
-            {TensorSetMovement{{}}, CostMetric{0.0, 0}},
+        std::unordered_map<TensorSetMovement, float>{{
+            {TensorSetMovement{{}}, 0.0},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
-             CostMetric{0.1, 0}},
+             0.1},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
-             CostMetric{0.2, 0}},
+             0.2},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
-             CostMetric{0.3, 0}},
+             0.3},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
-             CostMetric{0.4, 0}},
+             0.4},
         }});
 
     MachineMappingContext context = MachineMappingContext{
@@ -150,25 +177,11 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
-          /*memory_limit=*/10,
-      };
-
-      MachineMappingConfig config = MachineMappingConfig{
-          /*enable_memory_optimization=*/false,
-      };
-
-      MachineMappingResult result =
-          get_optimal_machine_mapping(cache,
-                                      context,
-                                      problem_tree,
-                                      full_machine_spec,
-                                      constraints,
-                                      memory_constraints,
-                                      config);
+      MachineMappingResult result = get_optimal_machine_mapping(
+          cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*cost=*/CostMetric{1.0, 1},
+              /*runtime=*/1.0,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv1},
@@ -187,25 +200,11 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
-          /*memory_limit=*/10,
-      };
-
-      MachineMappingConfig config = MachineMappingConfig{
-          /*enable_memory_optimization=*/false,
-      };
-
-      MachineMappingResult result =
-          get_optimal_machine_mapping(cache,
-                                      context,
-                                      problem_tree,
-                                      full_machine_spec,
-                                      constraints,
-                                      memory_constraints,
-                                      config);
+      MachineMappingResult result = get_optimal_machine_mapping(
+          cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*cost=*/CostMetric{1.0 + 2.0 + 0.1, 1 + 2 + 0},
+              /*runtime=*/1.0 + 2.0 + 0.1,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -235,25 +234,11 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
-          /*memory_limit=*/10,
-      };
-
-      MachineMappingConfig config = MachineMappingConfig{
-          /*enable_memory_optimization=*/false,
-      };
-
-      MachineMappingResult result =
-          get_optimal_machine_mapping(cache,
-                                      context,
-                                      problem_tree,
-                                      full_machine_spec,
-                                      constraints,
-                                      memory_constraints,
-                                      config);
+      MachineMappingResult result = get_optimal_machine_mapping(
+          cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*cost=*/CostMetric{2.5, 3},
+              /*runtime=*/2.5,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
index c66d533d0f..e22f715d82 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
@@ -64,10 +64,65 @@ TEST_SUITE(FF_TEST_SUITE) {
     ParallelLayerAddedResult relu_2 = add_parallel_layer(
         pcg, relu_attrs, {get_only(relu_1.outputs)}, {relu_output_attrs});
 
-    MachineView pre_mv1 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1});
-    MachineView pre_mv2 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{2});
-    MachineView post_mv1 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{3});
-    MachineView post_mv2 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{4});
+    MachineView pre_mv1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView pre_mv2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView post_mv1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{3},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView post_mv2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{4},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
 
     SUBCASE("single edge across split") {
       PCGBinarySeriesSplit split = PCGBinarySeriesSplit{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
index 6b16a54c1f..221cca3ae1 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
@@ -8,33 +8,89 @@ using namespace FlexFlow;
 TEST_SUITE(FF_TEST_SUITE) {
 
   TEST_CASE("combine_disjoint_mappings(MachineMapping, MachineMappping)") {
-    MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
-    MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
     MachineMapping machine_mapping_0 = MachineMapping({
-        {parallel_layer_guid_t(Node(0)), machine_view_0},
+        {parallel_layer_guid_t{Node{0}}, machine_view_0},
     });
     MachineMapping machine_mapping_1 = MachineMapping({
-        {parallel_layer_guid_t(Node(1)), machine_view_1},
-    });
-    MachineMapping correct = MachineMapping({
-        {parallel_layer_guid_t(Node(0)), machine_view_0},
-        {parallel_layer_guid_t(Node(1)), machine_view_1},
+        {parallel_layer_guid_t{Node{1}}, machine_view_1},
     });
+    MachineMapping correct = MachineMapping{{
+        {parallel_layer_guid_t{Node{0}}, machine_view_0},
+        {parallel_layer_guid_t{Node{1}}, machine_view_1},
+    }};
     MachineMapping result =
         combine_disjoint_mappings(machine_mapping_0, machine_mapping_1);
     CHECK(result == correct);
   }
 
   TEST_CASE("nodes_are_disjoint(MachineMapping, MachineMappping)") {
-    MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
-    MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
     MachineMapping machine_mapping_0 = MachineMapping({
-        {parallel_layer_guid_t(Node(0)), machine_view_0},
+        {parallel_layer_guid_t{Node{0}}, machine_view_0},
     });
 
     SUBCASE("nodes are disjoint") {
       MachineMapping machine_mapping_1 = MachineMapping({
-          {parallel_layer_guid_t(Node(1)), machine_view_1},
+          {parallel_layer_guid_t{Node{1}}, machine_view_1},
       });
 
       bool correct = true;
@@ -44,8 +100,8 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("nodes are not disjoint") {
       MachineMapping machine_mapping_1 = MachineMapping({
-          {parallel_layer_guid_t(Node(0)), machine_view_0},
-          {parallel_layer_guid_t(Node(1)), machine_view_1},
+          {parallel_layer_guid_t{Node{0}}, machine_view_0},
+          {parallel_layer_guid_t{Node{1}}, machine_view_1},
       });
       bool correct = false;
       bool result = nodes_are_disjoint(machine_mapping_0, machine_mapping_1);
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
index 7665f929f2..73b921fc98 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -6,20 +6,37 @@ using namespace FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("series_combine") {
-    MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
-        /*memory_limit=*/10,
-    };
-    MachineMappingConfig config = MachineMappingConfig{
-        /*enable_memory_optimization=*/false,
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
     };
 
-    MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
-    MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
-
-    CostMetric pre_cost = CostMetric{
-        /*runtime=*/2.0,
-        /*memory=*/2,
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
     };
+
+    float pre_cost = 2.0;
     MachineMappingResult pre = MachineMappingResult{
         FeasibleMachineMappingResult{
             /*runtime=*/pre_cost,
@@ -41,10 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostMetric post_cost = CostMetric{
-        /*runtime=*/4.0,
-        /*memory=*/1,
-    };
+    float post_cost = 4.0;
     MachineMappingResult post = MachineMappingResult{
         FeasibleMachineMappingResult{
             /*runtime=*/post_cost,
@@ -60,32 +74,19 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult infeasible = infeasible_machine_mapping_result();
 
-    CostMetric comm_cost = CostMetric{
-        /*runtime=*/3.0,
-        /*memory=*/0,
-    };
+    float comm_cost = 3.0;
 
-    SUBCASE("pre is infeasbile") {
-      MachineMappingResult result =
-          series_combine(config,
-                         memory_constraints,
-                         comm_cost,
-                         infeasible,
-                         post,
-                         ParallelSplitTransformation::LthenR);
+    SUBCASE("pre is infeasible") {
+      MachineMappingResult result = series_combine(
+          comm_cost, infeasible, post, ParallelSplitTransformation::LthenR);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
-    SUBCASE("post is infeasbile") {
-      MachineMappingResult result =
-          series_combine(config,
-                         memory_constraints,
-                         comm_cost,
-                         pre,
-                         infeasible,
-                         ParallelSplitTransformation::LthenR);
+    SUBCASE("post is infeasible") {
+      MachineMappingResult result = series_combine(
+          comm_cost, pre, infeasible, ParallelSplitTransformation::LthenR);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
@@ -93,9 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     SUBCASE("both are infeasible") {
       MachineMappingResult result =
-          series_combine(config,
-                         memory_constraints,
-                         comm_cost,
+          series_combine(comm_cost,
                          infeasible,
                          infeasible,
                          ParallelSplitTransformation::LthenR);
@@ -105,13 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
 
     SUBCASE("both are feasible") {
-      CostMetric no_parallel_split_transform_cost = CostMetric{
-          /*runtime=*/pre_cost.runtime + post_cost.runtime + comm_cost.runtime,
-          /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory,
-      };
       MachineMappingResult no_parallel_split_transform = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*cost=*/no_parallel_split_transform_cost,
+              /*runtime=*/pre_cost + comm_cost + post_cost,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -139,42 +134,27 @@ TEST_SUITE(FF_TEST_SUITE) {
       };
 
       SUBCASE("parallel_split_transformation = std::nullopt") {
-        MachineMappingResult result = series_combine(
-            config, memory_constraints, comm_cost, pre, post, std::nullopt);
+        MachineMappingResult result =
+            series_combine(comm_cost, pre, post, std::nullopt);
         MachineMappingResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = LthenR") {
-        MachineMappingResult result =
-            series_combine(config,
-                           memory_constraints,
-                           comm_cost,
-                           pre,
-                           post,
-                           ParallelSplitTransformation::LthenR);
+        MachineMappingResult result = series_combine(
+            comm_cost, pre, post, ParallelSplitTransformation::LthenR);
         MachineMappingResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = RthenL") {
-        MachineMappingResult result =
-            series_combine(config,
-                           memory_constraints,
-                           comm_cost,
-                           pre,
-                           post,
-                           ParallelSplitTransformation::RthenL);
-        CostMetric correct_cost = CostMetric{
-            /*runtime=*/pre_cost.runtime + post_cost.runtime +
-                comm_cost.runtime,
-            /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory,
-        };
+        MachineMappingResult result = series_combine(
+            comm_cost, pre, post, ParallelSplitTransformation::RthenL);
         MachineMappingResult correct = MachineMappingResult{
             FeasibleMachineMappingResult{
-                /*runtime=*/correct_cost,
+                /*runtime=*/pre_cost + comm_cost + post_cost,
                 /*machine_mapping=*/
                 ParallelLayerGuidObliviousMachineMapping{{
                     {
@@ -207,29 +187,39 @@ TEST_SUITE(FF_TEST_SUITE) {
   }
 
   TEST_CASE("parallel_combine") {
-    MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{
-        /*memory_limit=*/10,
-    };
-    MachineMappingConfig config = MachineMappingConfig{
-        /*enable_memory_optimization=*/false,
-    };
-
-    MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
-    MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
-
-    CostMetric lhs_cost = CostMetric{
-        /*runtime=*/2.0,
-        /*memory=*/2,
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
     };
 
-    CostMetric rhs_cost = CostMetric{
-        /*runtime=*/4.0,
-        /*memory=*/1,
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
     };
 
     MachineMappingResult lhs = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*cost=*/lhs_cost,
+            /*runtime=*/2.0,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -250,7 +240,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult rhs = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*cost=*/rhs_cost,
+            /*runtime=*/4.0,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -263,41 +253,32 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult infeasible = infeasible_machine_mapping_result();
 
-    SUBCASE("lhs is infeasbile") {
-      MachineMappingResult result =
-          parallel_combine(config, memory_constraints, infeasible, rhs);
+    SUBCASE("lhs is infeasible") {
+      MachineMappingResult result = parallel_combine(infeasible, rhs);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
-    SUBCASE("rhs is infeasbile") {
-      MachineMappingResult result =
-          parallel_combine(config, memory_constraints, lhs, infeasible);
+    SUBCASE("rhs is infeasible") {
+      MachineMappingResult result = parallel_combine(lhs, infeasible);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are infeasible") {
-      MachineMappingResult result =
-          parallel_combine(config, memory_constraints, infeasible, infeasible);
+      MachineMappingResult result = parallel_combine(infeasible, infeasible);
       MachineMappingResult correct = infeasible;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are feasible") {
-      MachineMappingResult result =
-          parallel_combine(config, memory_constraints, lhs, rhs);
-
-      CostMetric correct_cost = CostMetric{
-          /*runtime=*/4.0,
-          /*memory=*/2,
-      };
+      MachineMappingResult result = parallel_combine(lhs, rhs);
       MachineMappingResult correct = MachineMappingResult{
           FeasibleMachineMappingResult{
-              /*cost=*/correct_cost,
+              /*runtime=*/4.0,
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -329,12 +310,39 @@ TEST_SUITE(FF_TEST_SUITE) {
   }
 
   TEST_CASE("minimize_runtime") {
-    MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1));
-    MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2));
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
 
     MachineMappingResult faster = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*cost=*/CostMetric{2.0, 2},
+            /*runtime=*/2.0,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -355,7 +363,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult slower = MachineMappingResult{
         FeasibleMachineMappingResult{
-            /*cost=*/CostMetric{4.0, 1},
+            /*runtime=*/4.0,
             /*machine_mapping=*/
             ParallelLayerGuidObliviousMachineMapping{{
                 {
@@ -368,7 +376,7 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     MachineMappingResult infeasible = infeasible_machine_mapping_result();
 
-    SUBCASE("lhs is infeasbile") {
+    SUBCASE("lhs is infeasible") {
       MachineMappingResult result = minimize_runtime(infeasible, slower);
       MachineMappingResult correct = slower;
 

From 964c885c5bf667ec1285eca6c0f7746b7c2e6edc Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 30 Oct 2024 20:35:08 -0400
Subject: [PATCH 05/16] fix tests & format

---
 .../machine_mapping_result_with_memory.h      |  1 +
 .../cost_estimator_for_test.cc                | 47 +++++++++++++++++--
 .../machine_mapping/cost_estimator_for_test.h | 22 ++++++++-
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
index 6203b99e55..d56d33f7ec 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
@@ -3,6 +3,7 @@
 
 #include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
index 9ee596af3e..b55b4d283c 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
@@ -7,9 +7,12 @@ namespace FlexFlow {
 TestCostEstimator::TestCostEstimator(
     std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
-        &get_communication_cost)
+        &get_communication_cost,
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost_with_memory)
     : get_operator_cost(get_operator_cost),
-      get_communication_cost(get_communication_cost) {}
+      get_communication_cost(get_communication_cost),
+      get_operator_cost_with_memory(get_operator_cost_with_memory) {}
 
 float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->get_operator_cost(k);
@@ -19,13 +22,24 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->get_communication_cost(m);
 }
 
+CostMetric TestCostEstimator::estimate_cost_with_memory(
+    OpCostEstimateKey const &k) const {
+  return this->get_operator_cost_with_memory(k);
+}
+
 CostEstimator make_fake_cost_estimator(
     std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost) {
+  auto get_operator_cost_with_memory = [=](OpCostEstimateKey const &k) {
+    return CostMetric{
+        get_operator_cost(k),
+        0,
+    };
+  };
 
-  return CostEstimator::create<TestCostEstimator>(get_operator_cost,
-                                                  get_communication_cost);
+  return make_fake_cost_estimator(
+      get_operator_cost, get_communication_cost, get_operator_cost_with_memory);
 }
 
 CostEstimator make_fake_cost_estimator(
@@ -38,4 +52,29 @@ CostEstimator make_fake_cost_estimator(
       });
 }
 
+CostEstimator make_fake_cost_estimator(
+    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<float(TensorSetMovement const &)> const
+        &get_communication_cost,
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost_with_memory) {
+  return CostEstimator::create<TestCostEstimator>(
+      get_operator_cost, get_communication_cost, get_operator_cost_with_memory);
+}
+
+CostEstimator make_fake_cost_estimator(
+    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, float> const &comm_cost_map,
+    std::unordered_map<OpCostEstimateKey, CostMetric> const
+        &op_cost_with_memory_map) {
+  return make_fake_cost_estimator(
+      [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); },
+      [comm_cost_map](TensorSetMovement const &m) {
+        return comm_cost_map.at(m);
+      },
+      [op_cost_with_memory_map](OpCostEstimateKey const &k) {
+        return op_cost_with_memory_map.at(k);
+      });
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
index 7c1d06207a..302421f873 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
@@ -13,15 +13,22 @@ namespace FlexFlow {
 struct TestCostEstimator : public ICostEstimator {
   std::function<float(OpCostEstimateKey const &)> get_operator_cost;
   std::function<float(TensorSetMovement const &)> get_communication_cost;
+  std::function<CostMetric(OpCostEstimateKey const &)>
+      get_operator_cost_with_memory;
 
   TestCostEstimator() = delete;
   TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost,
                     decltype(get_communication_cost)
-                        const &get_communication_cost);
+                        const &get_communication_cost,
+                    decltype(get_operator_cost_with_memory)
+                        const &get_operator_cost_with_memory);
 
   float estimate_cost(OpCostEstimateKey const &) const override;
 
   float estimate_cost(TensorSetMovement const &) const override;
+
+  CostMetric
+      estimate_cost_with_memory(OpCostEstimateKey const &) const override;
 };
 
 CostEstimator make_fake_cost_estimator(
@@ -33,6 +40,19 @@ CostEstimator make_fake_cost_estimator(
     std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
     std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
 
+CostEstimator make_fake_cost_estimator(
+    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<float(TensorSetMovement const &)> const
+        &get_communication_cost,
+    std::function<CostMetric(OpCostEstimateKey const &)> const
+        &get_operator_cost_with_memory);
+
+CostEstimator make_fake_cost_estimator(
+    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, float> const &comm_cost_map,
+    std::unordered_map<OpCostEstimateKey, CostMetric> const
+        &op_cost_with_memory_map);
+
 } // namespace FlexFlow
 
 #endif

From 0c0e7b042fe9bee725e4592259b62256dac17882 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 6 Nov 2024 20:56:36 -0500
Subject: [PATCH 06/16] minimum tests for memory algorithm

---
 ...get_optimal_machine_mapping_with_memory.cc | 293 +++++++++
 .../machine_mapping_result_with_memory.cc     | 585 ++++++++++++++++++
 2 files changed, 878 insertions(+)
 create mode 100644 lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
 create mode 100644 lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc

diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
new file mode 100644
index 0000000000..566af800ea
--- /dev/null
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -0,0 +1,293 @@
+#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h"
+#include "../cost_estimator_for_test.h"
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
+#include "pcg/machine_view.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "utils/containers/get_only.h"
+#include "utils/full_binary_tree/binary_tree_path.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("get_optimal_machine_mapping_with_memory") {
+    auto make_leaf = [](UnmappedOpCostEstimateKey const &k) {
+      return MachineMappingProblemTree{k};
+    };
+
+    auto make_series_split =
+        [](AbstractedTensorSetMovement const &tensor_set_movement,
+           MachineMappingProblemTree const &lhs,
+           MachineMappingProblemTree const &rhs) {
+          return MachineMappingProblemTree{
+              MMProblemTreeSeriesSplit{
+                  /*tensor_set_movement=*/tensor_set_movement,
+                  /*left_child=*/lhs,
+                  /*right_child=*/rhs,
+              },
+          };
+        };
+
+    auto make_parallel_split = [](MachineMappingProblemTree const &lhs,
+                                  MachineMappingProblemTree const &rhs) {
+      return MachineMappingProblemTree{
+          MMProblemTreeParallelSplit{
+              /*left_child=*/lhs,
+              /*right_child=*/rhs,
+          },
+      };
+    };
+
+    MachineView mv1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView mv2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineSpecification full_machine_spec = MachineSpecification{
+        /*num_nodes=*/2,
+        /*num_cpus_per_node=*/1,
+        /*num_gpus_per_node=*/1,
+        /*inter_node_bandwidth=*/1,
+        /*intra_node_bandwidth=*/1,
+    };
+
+    MachineSpecification split_machine_spec = MachineSpecification{
+        /*num_nodes=*/1,
+        /*num_cpus_per_node=*/1,
+        /*num_gpus_per_node=*/1,
+        /*inter_node_bandwidth=*/1,
+        /*intra_node_bandwidth=*/1,
+    };
+
+    auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &,
+                                      MachineSpecification const &resources) {
+      if (resources == full_machine_spec) {
+        return std::unordered_set<MachineView>{mv1, mv2};
+      } else {
+        return std::unordered_set<MachineView>{mv2};
+      }
+    };
+
+    UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{
+        /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}},
+        /*input_shapes=*/{},
+        /*weight_shapes=*/{},
+        /*output_shapes=*/{},
+    };
+
+    UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{
+        /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{
+            /*type=*/OperatorType::EW_ADD,
+            /*compute_type=*/DataType::FLOAT,
+            /*should_broadcast_lhs=*/false,
+            /*should_broadcast_rhs=*/false,
+        }},
+        /*input_shapes=*/{},
+        /*weight_shapes=*/{},
+        /*output_shapes=*/{},
+    };
+
+    ParallelTensorShape tensor_shape1 = ParallelTensorShape{
+        ParallelTensorDims{
+            FFOrdered<ShardParallelDim>{},
+            ReplicaParallelDimSet{
+                SumDegree{1},
+                DiscardCopyDegree{1},
+            },
+        },
+        DataType::FLOAT,
+    };
+
+    AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{
+        AbstractedSingleTensorMovement{
+            /*parallel_tensor_shape=*/tensor_shape1,
+            /*src_machine_views=*/{},
+            /*dst_machine_views=*/{},
+        },
+    }};
+
+    ParallelLayerGuidObliviousMachineMapping mm1 =
+        ParallelLayerGuidObliviousMachineMapping{{
+            {binary_tree_root_path(), mv1},
+        }};
+    ParallelLayerGuidObliviousMachineMapping mm2 =
+        ParallelLayerGuidObliviousMachineMapping{{
+            {binary_tree_root_path(), mv2},
+        }};
+
+    CostEstimator cost_estimator = make_fake_cost_estimator(
+        std::unordered_map<OpCostEstimateKey, float>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5},
+        }},
+        std::unordered_map<TensorSetMovement, float>{{
+            {TensorSetMovement{{}}, 0.0},
+            {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
+             0.1},
+            {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
+             0.2},
+            {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
+             0.3},
+            {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
+             0.4},
+        }},
+        std::unordered_map<OpCostEstimateKey, CostMetric>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 2}},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 3}},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 1}},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 2}},
+        }});
+
+    MachineMappingContext context = MachineMappingContext{
+        cost_estimator,
+        allowed_machine_views1,
+    };
+
+    MachineMappingCacheWithMemory cache =
+        empty_machine_mapping_cache_with_memory();
+
+    SUBCASE("single layer") {
+      MachineMappingProblemTree problem_tree = make_leaf(k1);
+
+      MachineMappingConstraints constraints =
+          get_unconstrained_solution_for_layers(
+              get_all_leaf_paths(problem_tree));
+
+      MachineMappingResultWithMemory result =
+          get_optimal_machine_mapping_with_memory(
+              cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+          SingleMachineMapping{
+              CostMetric{1.0, 2},
+              ParallelLayerGuidObliviousMachineMapping{{
+                  {binary_tree_root_path(), mv1},
+              }},
+          },
+          SingleMachineMapping{
+              CostMetric{1.5, 1},
+              ParallelLayerGuidObliviousMachineMapping{{
+                  {binary_tree_root_path(), mv2},
+              }},
+          },
+      }};
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("pair of layers in sequence") {
+      MachineMappingProblemTree problem_tree =
+          make_series_split(movement1, make_leaf(k1), make_leaf(k2));
+
+      MachineMappingConstraints constraints =
+          get_unconstrained_solution_for_layers(
+              get_all_leaf_paths(problem_tree));
+
+      MachineMappingResultWithMemory result =
+          get_optimal_machine_mapping_with_memory(
+              cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+          SingleMachineMapping{
+              CostMetric{1.0 + 2.0 + 0.1, 2 + 3},
+              ParallelLayerGuidObliviousMachineMapping{{
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::LEFT_CHILD,
+                      }},
+                      mv1,
+                  },
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::RIGHT_CHILD,
+                      }},
+                      mv1,
+                  },
+              }},
+          },
+          SingleMachineMapping{
+              CostMetric{1.5 + 2.5 + 0.1, 1 + 2},
+              ParallelLayerGuidObliviousMachineMapping{{
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::LEFT_CHILD,
+                      }},
+                      mv2,
+                  },
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::RIGHT_CHILD,
+                      }},
+                      mv2,
+                  },
+              }},
+          },
+      }};
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("pair of layers in parallel") {
+      MachineMappingProblemTree problem_tree =
+          make_parallel_split(make_leaf(k1), make_leaf(k2));
+
+      MachineMappingConstraints constraints =
+          get_unconstrained_solution_for_layers(
+              get_all_leaf_paths(problem_tree));
+
+      MachineMappingResultWithMemory result =
+          get_optimal_machine_mapping_with_memory(
+              cache, context, problem_tree, full_machine_spec, constraints);
+      MachineMappingResultWithMemory correct =
+          MachineMappingResultWithMemory{{SingleMachineMapping{
+              CostMetric{2.5, 2},
+              ParallelLayerGuidObliviousMachineMapping{{
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::LEFT_CHILD,
+                      }},
+                      mv2,
+                  },
+                  {
+                      BinaryTreePath{{
+                          BinaryTreePathEntry::RIGHT_CHILD,
+                      }},
+                      mv2,
+                  },
+              }},
+
+          }}};
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
new file mode 100644
index 0000000000..6ca551c436
--- /dev/null
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -0,0 +1,585 @@
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "pcg/machine_view.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("remove_non_dominating_machine_mapping_result") {
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{4},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    CostMetric cost1 = CostMetric{
+        2.0,
+        2,
+    };
+    CostMetric cost2 = CostMetric{
+        4.0,
+        1,
+    };
+    CostMetric cost3 = CostMetric{
+        2.0,
+        3,
+    };
+
+    SingleMachineMapping mm1 = SingleMachineMapping{
+        cost1,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_0,
+                },
+            },
+        },
+    };
+
+    SingleMachineMapping mm2 = SingleMachineMapping{
+        cost2,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_1,
+                },
+            },
+        },
+    };
+
+    SingleMachineMapping mm3 = SingleMachineMapping{
+        cost3,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_2,
+                },
+            },
+        },
+    };
+
+    SUBCASE("empty") {
+      MachineMappingResultWithMemory to_remove =
+          empty_machine_mapping_result_with_memory();
+      MachineMappingResultWithMemory result =
+          remove_non_dominating_machine_mapping_result(to_remove);
+      MachineMappingResultWithMemory correct =
+          empty_machine_mapping_result_with_memory();
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("no non-dominating") {
+      MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{
+          {
+              mm1,
+              mm2,
+          },
+      };
+      MachineMappingResultWithMemory result =
+          remove_non_dominating_machine_mapping_result(to_remove);
+      MachineMappingResultWithMemory correct = to_remove;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("non-dominating") {
+      MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{
+          {
+              mm1,
+              mm2,
+              mm3,
+          },
+      };
+      MachineMappingResultWithMemory result =
+          remove_non_dominating_machine_mapping_result(to_remove);
+      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+          {
+              mm1,
+              mm2,
+          },
+      };
+
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("series_combine(memory)") {
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    CostMetric pre_cost = CostMetric{
+        2.0,
+        2,
+    };
+    MachineMappingResultWithMemory pre = MachineMappingResultWithMemory{{
+        SingleMachineMapping{
+            pre_cost,
+            ParallelLayerGuidObliviousMachineMapping{
+                {
+                    {
+                        BinaryTreePath{
+                            {BinaryTreePathEntry::LEFT_CHILD},
+                        },
+                        machine_view_0,
+                    },
+                    {
+                        BinaryTreePath{
+                            {BinaryTreePathEntry::RIGHT_CHILD},
+                        },
+                        machine_view_1,
+                    },
+                },
+            },
+        },
+    }};
+
+    CostMetric post_cost = CostMetric{
+        4.0,
+        1,
+    };
+
+    MachineMappingResultWithMemory post = MachineMappingResultWithMemory{{
+        SingleMachineMapping{
+            post_cost,
+            ParallelLayerGuidObliviousMachineMapping{
+                {
+                    {
+                        BinaryTreePath{{}},
+                        machine_view_1,
+                    },
+                },
+            },
+        },
+    }};
+
+    MachineMappingResultWithMemory empty =
+        empty_machine_mapping_result_with_memory();
+
+    float comm_cost = 3.0;
+
+    SUBCASE("pre is empty") {
+      MachineMappingResultWithMemory result = series_combine(
+          comm_cost, empty, post, ParallelSplitTransformation::LthenR);
+      MachineMappingResultWithMemory correct = empty;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("post is empty") {
+      MachineMappingResultWithMemory result = series_combine(
+          comm_cost, pre, empty, ParallelSplitTransformation::LthenR);
+      MachineMappingResultWithMemory correct = empty;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("both are nonempty") {
+      MachineMappingResultWithMemory no_parallel_split_transform =
+          MachineMappingResultWithMemory{
+              {
+                  SingleMachineMapping{
+                      /*cost=*/CostMetric{
+                          pre_cost.runtime + comm_cost + post_cost.runtime,
+                          pre_cost.memory + post_cost.memory,
+                      },
+                      /*machine_mapping=*/
+                      ParallelLayerGuidObliviousMachineMapping{{
+                          {
+                              BinaryTreePath{{
+                                  BinaryTreePathEntry::LEFT_CHILD,
+                                  BinaryTreePathEntry::LEFT_CHILD,
+                              }},
+                              machine_view_0,
+                          },
+                          {
+                              BinaryTreePath{{
+                                  BinaryTreePathEntry::LEFT_CHILD,
+                                  BinaryTreePathEntry::RIGHT_CHILD,
+                              }},
+                              machine_view_1,
+                          },
+                          {
+                              BinaryTreePath{{
+                                  BinaryTreePathEntry::RIGHT_CHILD,
+                              }},
+                              machine_view_1,
+                          },
+                      }},
+                  },
+              },
+          };
+
+      SUBCASE("parallel_split_transformation = std::nullopt") {
+        MachineMappingResultWithMemory result =
+            series_combine(comm_cost, pre, post, std::nullopt);
+        MachineMappingResultWithMemory correct = no_parallel_split_transform;
+
+        CHECK(result == correct);
+      }
+
+      SUBCASE("parallel_split_transformation = LthenR") {
+        MachineMappingResultWithMemory result = series_combine(
+            comm_cost, pre, post, ParallelSplitTransformation::LthenR);
+        MachineMappingResultWithMemory correct = no_parallel_split_transform;
+
+        CHECK(result == correct);
+      }
+
+      SUBCASE("parallel_split_transformation = RthenL") {
+        MachineMappingResultWithMemory result = series_combine(
+            comm_cost, pre, post, ParallelSplitTransformation::RthenL);
+        MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+            {
+                SingleMachineMapping{
+                    /*cost=*/CostMetric{
+                        pre_cost.runtime + comm_cost + post_cost.runtime,
+                        pre_cost.memory + post_cost.memory,
+                    },
+                    /*machine_mapping=*/
+                    ParallelLayerGuidObliviousMachineMapping{{
+                        {
+                            BinaryTreePath{{
+                                BinaryTreePathEntry::RIGHT_CHILD,
+                                BinaryTreePathEntry::LEFT_CHILD,
+                            }},
+                            machine_view_0,
+                        },
+                        {
+                            BinaryTreePath{{
+                                BinaryTreePathEntry::RIGHT_CHILD,
+                                BinaryTreePathEntry::RIGHT_CHILD,
+                            }},
+                            machine_view_1,
+                        },
+                        {
+                            BinaryTreePath{{
+                                BinaryTreePathEntry::LEFT_CHILD,
+                            }},
+                            machine_view_1,
+                        },
+                    }},
+                },
+            },
+        };
+
+        CHECK(result == correct);
+      }
+    }
+  }
+
+  TEST_CASE("parallel_combine(memory)") {
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    CostMetric lhs_cost = CostMetric{
+        2.0,
+        2,
+    };
+    MachineMappingResultWithMemory lhs = MachineMappingResultWithMemory{{
+        SingleMachineMapping{
+            lhs_cost,
+            ParallelLayerGuidObliviousMachineMapping{
+                {
+                    {
+                        BinaryTreePath{
+                            {BinaryTreePathEntry::LEFT_CHILD},
+                        },
+                        machine_view_0,
+                    },
+                    {
+                        BinaryTreePath{
+                            {BinaryTreePathEntry::RIGHT_CHILD},
+                        },
+                        machine_view_1,
+                    },
+                },
+            },
+        },
+    }};
+
+    CostMetric rhs_cost = CostMetric{
+        4.0,
+        1,
+    };
+    MachineMappingResultWithMemory rhs = MachineMappingResultWithMemory{{
+        SingleMachineMapping{
+            rhs_cost,
+            ParallelLayerGuidObliviousMachineMapping{
+                {
+                    {
+                        BinaryTreePath{{}},
+                        machine_view_1,
+                    },
+                },
+            },
+        },
+    }};
+
+    MachineMappingResultWithMemory empty =
+        empty_machine_mapping_result_with_memory();
+
+    SUBCASE("lhs is empty") {
+      MachineMappingResultWithMemory result = parallel_combine(empty, rhs);
+      MachineMappingResultWithMemory correct = empty;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("rhs is empty") {
+      MachineMappingResultWithMemory result = parallel_combine(lhs, empty);
+      MachineMappingResultWithMemory correct = empty;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("both are nonempty") {
+      MachineMappingResultWithMemory result = parallel_combine(lhs, rhs);
+      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+          SingleMachineMapping{
+              /*cost=*/CostMetric{
+                  std::max(lhs_cost.runtime, rhs_cost.runtime),
+                  std::max(lhs_cost.memory, rhs_cost.memory),
+              },
+              /*machine_mapping=*/
+              ParallelLayerGuidObliviousMachineMapping{
+                  {
+                      {
+                          BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD,
+                                          BinaryTreePathEntry::LEFT_CHILD}},
+                          machine_view_0,
+                      },
+                      {
+                          BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD,
+                                          BinaryTreePathEntry::RIGHT_CHILD}},
+                          machine_view_1,
+                      },
+                      {
+                          BinaryTreePath{{BinaryTreePathEntry::RIGHT_CHILD}},
+                          machine_view_1,
+                      },
+                  },
+              },
+          },
+      }};
+
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("minimize_runtime(memory)") {
+    MachineView machine_view_0 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{1},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_1 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{2},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    MachineView machine_view_2 = MachineView{
+        /*start=*/MachineSpaceCoordinate{
+            /*node_idx=*/0,
+            /*device_idx=*/0,
+            /*device_type=*/DeviceType::GPU,
+        },
+        /*dimensions=*/
+        {
+            MachineViewDimension{
+                stride_t{4},
+                MachineSpecificationDimension::INTRA_NODE,
+            },
+        },
+    };
+
+    CostMetric cost1 = CostMetric{
+        2.0,
+        2,
+    };
+    CostMetric cost2 = CostMetric{
+        4.0,
+        1,
+    };
+    CostMetric cost3 = CostMetric{
+        2.0,
+        3,
+    };
+
+    SingleMachineMapping mm1 = SingleMachineMapping{
+        cost1,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_0,
+                },
+            },
+        },
+    };
+
+    SingleMachineMapping mm2 = SingleMachineMapping{
+        cost2,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_1,
+                },
+            },
+        },
+    };
+
+    SingleMachineMapping mm3 = SingleMachineMapping{
+        cost3,
+        ParallelLayerGuidObliviousMachineMapping{
+            {
+                {
+                    BinaryTreePath{{}},
+                    machine_view_2,
+                },
+            },
+        },
+    };
+
+    MachineMappingResultWithMemory result1 = MachineMappingResultWithMemory{
+        {
+            mm1,
+            mm2,
+        },
+    };
+
+    MachineMappingResultWithMemory result2 = MachineMappingResultWithMemory{
+        {
+            mm2,
+            mm3,
+        },
+    };
+
+    MachineMappingResultWithMemory result = minimize_runtime(result1, result2);
+    MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+        {
+            mm1,
+            mm2,
+        },
+    };
+
+    CHECK(result == correct);
+  }
+}

From 77783771b873ba2f2abd0fa45d32a03db548138b Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 18 Dec 2024 00:38:17 -0500
Subject: [PATCH 07/16] renaming

---
 .../compiler/cost_estimator/cost_estimator.h  |   9 +-
 .../compiler/cost_estimator/cost_metric.h     |  28 ----
 ...truct.toml => op_cost_metrics.struct.toml} |   2 +-
 .../get_optimal_machine_mapping.h             |   1 -
 .../machine_mapping_config.struct.toml        |  13 --
 .../get_optimal_machine_mapping_with_memory.h |  19 ++-
 .../machine_mapping_result_with_memory.h      |  41 ------
 ....h => machine_mapping_with_memory_cache.h} |  16 +--
 ...ine_mapping_with_memory_cache.struct.toml} |   6 +-
 .../machine_mapping_with_memory_result.h      |  41 ++++++
 ...ne_mapping_with_memory_result.struct.toml} |   2 +-
 .../single_machine_mapping.struct.toml        |   4 +-
 .../compiler/cost_estimator/cost_estimator.cc |   7 +-
 .../compiler/cost_estimator/cost_metric.cc    |  55 --------
 .../get_optimal_machine_mapping.cc            |   2 +-
 ...get_optimal_machine_mapping_with_memory.cc |  56 ++++----
 ...c => machine_mapping_with_memory_cache.cc} |  20 +--
 ... => machine_mapping_with_memory_result.cc} |  78 +++++------
 .../cost_estimator_for_test.cc                |  56 ++------
 .../machine_mapping/cost_estimator_for_test.h |  30 +----
 .../get_optimal_machine_mapping.cc            |  41 +++++-
 ...get_optimal_machine_mapping_with_memory.cc |  46 +++----
 .../machine_mapping_result_with_memory.cc     | 122 +++++++++---------
 23 files changed, 276 insertions(+), 419 deletions(-)
 delete mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.h
 rename lib/compiler/include/compiler/cost_estimator/{cost_metric.struct.toml => op_cost_metrics.struct.toml} (88%)
 delete mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
 delete mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
 rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.h => machine_mapping_with_memory_cache.h} (51%)
 rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.struct.toml => machine_mapping_with_memory_cache.struct.toml} (78%)
 create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h
 rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_result_with_memory.struct.toml => machine_mapping_with_memory_result.struct.toml} (89%)
 delete mode 100644 lib/compiler/src/compiler/cost_estimator/cost_metric.cc
 rename lib/compiler/src/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.cc => machine_mapping_with_memory_cache.cc} (50%)
 rename lib/compiler/src/compiler/machine_mapping/memory_optimization/{machine_mapping_result_with_memory.cc => machine_mapping_with_memory_result.cc} (59%)

diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
index 828200cc6a..9b006f178a 100644
--- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
+++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 
-#include "compiler/cost_estimator/cost_metric.dtg.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
 #include "compiler/cost_estimator/tensor_set_movement.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
@@ -12,10 +12,8 @@
 namespace FlexFlow {
 
 struct ICostEstimator {
-  virtual float estimate_cost(OpCostEstimateKey const &) const = 0;
+  virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0;
   virtual float estimate_cost(TensorSetMovement const &) const = 0;
-  virtual CostMetric
-      estimate_cost_with_memory(OpCostEstimateKey const &) const = 0;
 
   ICostEstimator() = default;
   ICostEstimator(ICostEstimator const &) = delete;
@@ -26,9 +24,8 @@ struct ICostEstimator {
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator);
 
 struct CostEstimator {
-  float estimate_cost(OpCostEstimateKey const &k) const;
+  OpCostMetrics estimate_cost(OpCostEstimateKey const &) const;
   float estimate_cost(TensorSetMovement const &m) const;
-  CostMetric estimate_cost_with_memory(OpCostEstimateKey const &k) const;
 
   template <typename T, typename... Args>
   static typename std::enable_if<std::is_base_of<ICostEstimator, T>::value,
diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.h b/lib/compiler/include/compiler/cost_estimator/cost_metric.h
deleted file mode 100644
index 98b0cb228d..0000000000
--- a/lib/compiler/include/compiler/cost_estimator/cost_metric.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H
-#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H
-
-#include "compiler/cost_estimator/cost_metric.dtg.h"
-#include <vector>
-
-namespace FlexFlow {
-
-CostMetric zero_cost_metric();
-
-CostMetric combine_cost_metrics_inter_device(CostMetric const &c1,
-                                             CostMetric const &c2);
-CostMetric
-    combine_cost_metrics_inter_device(std::vector<CostMetric> const &costs);
-
-CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1,
-                                                        CostMetric const &c2);
-CostMetric combine_cost_metrics_intra_device_sequential(
-    std::vector<CostMetric> const &costs);
-
-CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1,
-                                                      CostMetric const &c2);
-CostMetric combine_cost_metrics_intra_device_parallel(
-    std::vector<CostMetric> const &costs);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
similarity index 88%
rename from lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml
rename to lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
index 0666bb9e11..f137935a4d 100644
--- a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "CostMetric"
+name = "OpCostMetrics"
 features = [
   "eq",
   "fmt",
diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
index f69e6ab91b..62da90bfcb 100644
--- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h
@@ -2,7 +2,6 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_GET_OPTIMAL_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping_cache.dtg.h"
-#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_context.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
deleted file mode 100644
index f4c0b61291..0000000000
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-namespace = "FlexFlow"
-name = "MachineMappingConfig"
-features = [
-  "eq",
-  "hash",
-  "fmt",
-]
-
-includes = []
-
-[[fields]]
-name = "enable_memory_optimization"
-type = "bool"
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
index f8a2e4d75a..d176d298db 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h
@@ -2,27 +2,26 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H
 
 #include "compiler/machine_mapping/machine_mapping_cache.dtg.h"
-#include "compiler/machine_mapping/machine_mapping_config.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_context.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h"
 #include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 
 namespace FlexFlow {
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MachineMappingProblemTree const &problem_tree,
     MachineSpecification const &resources,
     MachineMappingConstraints const &constraints);
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeSeriesSplit const &series_split,
     MachineSpecification const &resources,
@@ -30,15 +29,15 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
     std::optional<ParallelSplitTransformation> const
         &parallel_split_transformation);
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
     MachineMappingConstraints const &constraints);
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &,
     UnmappedOpCostEstimateKey const &leaf,
     MachineSpecification const &resources,
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
deleted file mode 100644
index d56d33f7ec..0000000000
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
-#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
-
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h"
-#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
-#include <optional>
-
-namespace FlexFlow {
-
-[[nodiscard]] MachineMappingResultWithMemory
-    empty_machine_mapping_result_with_memory();
-[[nodiscard]] bool is_empty(MachineMappingResultWithMemory const &);
-
-[[nodiscard]] MachineMappingResultWithMemory get_mapping_with_minimal_runtime(
-    std::unordered_set<MachineMappingResultWithMemory> const &);
-
-[[nodiscard]] MachineMappingResultWithMemory
-    remove_non_dominating_machine_mapping_result(
-        MachineMappingResultWithMemory const &);
-
-[[nodiscard]] MachineMappingResultWithMemory
-    series_combine(float comm_cost,
-                   MachineMappingResultWithMemory const &pre_result,
-                   MachineMappingResultWithMemory const &post_result,
-                   std::optional<ParallelSplitTransformation> const
-                       &parallel_split_transformation);
-[[nodiscard]] MachineMappingResultWithMemory
-    parallel_combine(MachineMappingResultWithMemory const &lhs_result,
-                     MachineMappingResultWithMemory const &rhs_result);
-
-[[nodiscard]] MachineMappingResultWithMemory
-    minimize_runtime(MachineMappingResultWithMemory const &m1,
-                     MachineMappingResultWithMemory const &m2);
-
-[[nodiscard]] MachineMappingResultWithMemory
-    make_singleton_machine_mapping_result_with_memory(
-        CostMetric cost, MachineView const &machine_view);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h
similarity index 51%
rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h
rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h
index 2c45c04d3d..b749235c89 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h
@@ -1,18 +1,18 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H
 
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h"
 
 namespace FlexFlow {
 
-MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory();
-std::optional<MachineMappingResultWithMemory>
-    machine_mapping_cache_with_memory_load(
-        MachineMappingCacheWithMemory const &, MachineMappingState const &);
-void machine_mapping_cache_with_memory_save(
-    MachineMappingCacheWithMemory &,
+MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache();
+std::optional<MachineMappingWithMemoryResult>
+    machine_mapping_with_memory_cache_load(
+        MachineMappingWithMemoryCache const &, MachineMappingState const &);
+void machine_mapping_with_memory_cache_save(
+    MachineMappingWithMemoryCache &,
     MachineMappingState const &,
-    MachineMappingResultWithMemory const &);
+    MachineMappingWithMemoryResult const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml
similarity index 78%
rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml
rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml
index e7afa26bb3..c2fe393e99 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "MachineMappingCacheWithMemory"
+name = "MachineMappingWithMemoryCache"
 features = [
   "eq",
   "hash",
@@ -9,7 +9,7 @@ features = [
 includes = [
   "<unordered_map>",
   "compiler/machine_mapping/machine_mapping_state.dtg.h",
-  "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h",
+  "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h",
 ]
 
 src_includes = [
@@ -19,4 +19,4 @@ src_includes = [
 
 [[fields]]
 name = "raw_map"
-type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingResultWithMemory>"
+type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingWithMemoryResult>"
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h
new file mode 100644
index 0000000000..0383376116
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h
@@ -0,0 +1,41 @@
+#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
+#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H
+
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h"
+#include "compiler/machine_mapping/parallel_split_transformation.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+[[nodiscard]] MachineMappingWithMemoryResult
+    empty_machine_mapping_with_memory_result();
+[[nodiscard]] bool is_empty(MachineMappingWithMemoryResult const &);
+
+[[nodiscard]] MachineMappingWithMemoryResult get_mapping_with_minimal_runtime(
+    std::unordered_set<MachineMappingWithMemoryResult> const &);
+
+[[nodiscard]] MachineMappingWithMemoryResult
+    remove_non_pareto_optimal_machine_mapping_result(
+        MachineMappingWithMemoryResult const &);
+
+[[nodiscard]] MachineMappingWithMemoryResult
+    series_combine(float comm_cost,
+                   MachineMappingWithMemoryResult const &pre_result,
+                   MachineMappingWithMemoryResult const &post_result,
+                   std::optional<ParallelSplitTransformation> const
+                       &parallel_split_transformation);
+[[nodiscard]] MachineMappingWithMemoryResult
+    parallel_combine(MachineMappingWithMemoryResult const &lhs_result,
+                     MachineMappingWithMemoryResult const &rhs_result);
+
+[[nodiscard]] MachineMappingWithMemoryResult
+    minimize_runtime(MachineMappingWithMemoryResult const &m1,
+                     MachineMappingWithMemoryResult const &m2);
+
+[[nodiscard]] MachineMappingWithMemoryResult
+    make_singleton_machine_mapping_with_memory_result(
+        OpCostMetrics cost, MachineView const &machine_view);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
similarity index 89%
rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml
rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
index f3b2895b83..50de145b36 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "MachineMappingResultWithMemory"
+name = "MachineMappingWithMemoryResult"
 features = [
   "eq",
   "hash",
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
index 05a23e905a..f33e320e3b 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
@@ -8,12 +8,12 @@ features = [
 
 includes = [
   "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h",
-  "compiler/cost_estimator/cost_metric.dtg.h",
+  "compiler/cost_estimator/op_cost_metrics.dtg.h",
 ]
 
 [[fields]]
 name = "cost"
-type = "::FlexFlow::CostMetric"
+type = "::FlexFlow::OpCostMetrics"
 
 [[fields]]
 name = "machine_mapping"
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
index 40a0f4e2a4..6ac6e3a8d6 100644
--- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
+++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc
@@ -5,7 +5,7 @@ namespace FlexFlow {
 CostEstimator::CostEstimator(std::shared_ptr<ICostEstimator> implementation_ptr)
     : implementation_ptr(implementation_ptr) {}
 
-float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+OpCostMetrics CostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->implementation_ptr->estimate_cost(k);
 }
 
@@ -13,9 +13,4 @@ float CostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->implementation_ptr->estimate_cost(m);
 }
 
-CostMetric
-    CostEstimator::estimate_cost_with_memory(OpCostEstimateKey const &k) const {
-  return this->implementation_ptr->estimate_cost_with_memory(k);
-}
-
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
deleted file mode 100644
index dfaf0702c9..0000000000
--- a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-#include "compiler/cost_estimator/cost_metric.h"
-
-namespace FlexFlow {
-
-CostMetric zero_cost_metric() {
-  return CostMetric{
-      /*runtime=*/0,
-      /*memory=*/0,
-  };
-}
-
-CostMetric combine_cost_metrics_inter_device(CostMetric const &c1,
-                                             CostMetric const &c2) {
-  return CostMetric{c1.runtime + c2.runtime, c1.memory + c2.memory};
-}
-
-CostMetric
-    combine_cost_metrics_inter_device(std::vector<CostMetric> const &costs) {
-  CostMetric result = zero_cost_metric();
-  for (CostMetric const &cost : costs) {
-    result = combine_cost_metrics_inter_device(result, cost);
-  }
-  return result;
-}
-
-CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1,
-                                                        CostMetric const &c2) {
-  return CostMetric{c1.runtime + c2.runtime, std::max(c1.memory, c2.memory)};
-}
-
-CostMetric combine_cost_metrics_intra_device_sequential(
-    std::vector<CostMetric> const &costs) {
-  CostMetric result = zero_cost_metric();
-  for (CostMetric const &cost : costs) {
-    result = combine_cost_metrics_intra_device_sequential(result, cost);
-  }
-  return result;
-}
-
-CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1,
-                                                      CostMetric const &c2) {
-  return CostMetric{std::max(c1.runtime, c2.runtime),
-                    std::max(c1.memory, c2.memory)};
-}
-
-CostMetric combine_cost_metrics_intra_device_parallel(
-    std::vector<CostMetric> const &costs) {
-  CostMetric result = zero_cost_metric();
-  for (CostMetric const &cost : costs) {
-    result = combine_cost_metrics_intra_device_parallel(result, cost);
-  }
-  return result;
-}
-
-} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 10abd7ff90..5bdd8645a5 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -240,7 +240,7 @@ MachineMappingResult
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    float cost = context.cost_estimator.estimate_cost(mapped);
+    float cost = context.cost_estimator.estimate_cost(mapped).runtime;
 
     return make_singleton_machine_mapping_result(cost, machine_view);
   };
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 676f3a6c8e..96a67afaab 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -4,8 +4,8 @@
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h"
 #include "compiler/machine_mapping/transitive_reduced_pcg.h"
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
@@ -24,8 +24,8 @@
 
 namespace FlexFlow {
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MachineMappingProblemTree const &problem_tree,
     MachineSpecification const &resources,
@@ -38,15 +38,15 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
   };
 
   {
-    std::optional<MachineMappingResultWithMemory> cached_result =
-        machine_mapping_cache_with_memory_load(result_cache, state);
+    std::optional<MachineMappingWithMemoryResult> cached_result =
+        machine_mapping_with_memory_cache_load(result_cache, state);
     if (cached_result) {
       return cached_result.value();
     }
   }
 
-  MachineMappingResultWithMemory result =
-      problem_tree.visit<MachineMappingResultWithMemory>(overload{
+  MachineMappingWithMemoryResult result =
+      problem_tree.visit<MachineMappingWithMemoryResult>(overload{
           [&](MMProblemTreeSeriesSplit const &series_split) {
             return get_optimal_machine_mapping_with_memory(
                 result_cache,
@@ -65,12 +65,12 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
           },
       });
 
-  machine_mapping_cache_with_memory_save(result_cache, state, result);
+  machine_mapping_with_memory_cache_save(result_cache, state, result);
   return result;
 }
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeSeriesSplit const &series_split,
     MachineSpecification const &resources,
@@ -105,7 +105,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
         MachineMappingConstraints pre_candidate = with_additional_constraints(
             restrict_to_left_child(constraints), assigned_pre_machine_views);
 
-        MachineMappingResultWithMemory pre_result =
+        MachineMappingWithMemoryResult pre_result =
             get_optimal_machine_mapping_with_memory(
                 result_cache,
                 context,
@@ -122,7 +122,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
         MachineMappingConstraints post_candidate = with_additional_constraints(
             restrict_to_right_child(constraints), assigned_post_machine_views);
 
-        MachineMappingResultWithMemory post_result =
+        MachineMappingWithMemoryResult post_result =
             get_optimal_machine_mapping_with_memory(
                 result_cache,
                 context,
@@ -133,8 +133,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
         return post_result;
       };
 
-  MachineMappingResultWithMemory result =
-      empty_machine_mapping_result_with_memory();
+  MachineMappingWithMemoryResult result =
+      empty_machine_mapping_with_memory_result();
   AbstractedTensorSetMovement tensor_movement =
       series_split.tensor_set_movement;
 
@@ -142,7 +142,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
            &assigned_pre_machine_views :
        get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) {
 
-    MachineMappingResultWithMemory pre_result =
+    MachineMappingWithMemoryResult pre_result =
         eval_pre_boundary_mapping(assigned_pre_machine_views);
 
     for (ParallelLayerGuidObliviousMachineMapping const
@@ -150,7 +150,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
          get_boundary_machine_view_assignments(
              get_dst_layers(tensor_movement))) {
 
-      MachineMappingResultWithMemory post_result =
+      MachineMappingWithMemoryResult post_result =
           eval_post_boundary_mapping(assigned_post_machine_views);
 
       TensorSetMovement comm_across_split =
@@ -172,8 +172,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
   return result;
 }
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     MMProblemTreeParallelSplit const &parallel_split,
     MachineSpecification const &resources,
@@ -182,7 +182,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
   MachineMappingProblemTree lhs = parallel_split.get_left_child();
   MachineMappingProblemTree rhs = parallel_split.get_right_child();
 
-  MachineMappingResultWithMemory series_result = [&] {
+  MachineMappingWithMemoryResult series_result = [&] {
     MMProblemTreeSeriesSplit series_split = MMProblemTreeSeriesSplit{
         /*tensor_set_movement=*/empty_abstracted_tensor_set_movement(),
         /*left_child=*/lhs,
@@ -206,13 +206,13 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
   auto evaluate_resource_split =
       [&](std::pair<MachineSpecification, MachineSpecification> const
               &resource_split) {
-        MachineMappingResultWithMemory left_result =
+        MachineMappingWithMemoryResult left_result =
             get_optimal_machine_mapping_with_memory(result_cache,
                                                     context,
                                                     lhs,
                                                     resource_split.first,
                                                     left_constraints);
-        MachineMappingResultWithMemory right_result =
+        MachineMappingWithMemoryResult right_result =
             get_optimal_machine_mapping_with_memory(result_cache,
                                                     context,
                                                     rhs,
@@ -222,7 +222,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
         return parallel_combine(left_result, right_result);
       };
 
-  std::unordered_set<MachineMappingResultWithMemory> parallel_results =
+  std::unordered_set<MachineMappingWithMemoryResult> parallel_results =
       transform(get_machine_resource_splits(resources),
                 evaluate_resource_split);
 
@@ -230,8 +230,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
                           get_mapping_with_minimal_runtime(parallel_results));
 }
 
-MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
-    MachineMappingCacheWithMemory &result_cache,
+MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory(
+    MachineMappingWithMemoryCache &result_cache,
     MachineMappingContext const &context,
     UnmappedOpCostEstimateKey const &leaf,
     MachineSpecification const &resource,
@@ -249,13 +249,13 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory(
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    CostMetric cost = context.cost_estimator.estimate_cost_with_memory(mapped);
+    OpCostMetrics cost = context.cost_estimator.estimate_cost(mapped);
 
-    return make_singleton_machine_mapping_result_with_memory(cost,
+    return make_singleton_machine_mapping_with_memory_result(cost,
                                                              machine_view);
   };
 
-  std::unordered_set<MachineMappingResultWithMemory> candidate_results =
+  std::unordered_set<MachineMappingWithMemoryResult> candidate_results =
       transform(candidates, get_mapping_result);
 
   return get_mapping_with_minimal_runtime(candidate_results);
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc
similarity index 50%
rename from lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc
rename to lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc
index e74612250e..617ba682be 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc
@@ -1,27 +1,27 @@
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/try_at.h"
 
 namespace FlexFlow {
 
-MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory() {
-  return MachineMappingCacheWithMemory{{}};
+MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache() {
+  return MachineMappingWithMemoryCache{{}};
 }
 
-std::optional<MachineMappingResultWithMemory>
-    machine_mapping_cache_with_memory_load(
-        MachineMappingCacheWithMemory const &cache,
+std::optional<MachineMappingWithMemoryResult>
+    machine_mapping_with_memory_cache_load(
+        MachineMappingWithMemoryCache const &cache,
         MachineMappingState const &k) {
   return try_at(cache.raw_map, k);
 }
 
-void machine_mapping_cache_with_memory_save(
-    MachineMappingCacheWithMemory &cache,
+void machine_mapping_with_memory_cache_save(
+    MachineMappingWithMemoryCache &cache,
     MachineMappingState const &k,
-    MachineMappingResultWithMemory const &v) {
+    MachineMappingWithMemoryResult const &v) {
   if (contains_key(cache.raw_map, k)) {
     throw mk_runtime_error(fmt::format(
-        "machine_mapping_cache_with_memory_save expected key to not already "
+        "machine_mapping_with_memory_cache_save expected key to not already "
         "exist, but received existing key {}",
         k));
   }
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
similarity index 59%
rename from lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
rename to lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
index 1c4f8e1142..d38e4a7b6a 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
@@ -1,57 +1,57 @@
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h"
 #include "utils/containers/set_union.h"
 #include "utils/full_binary_tree/binary_tree_path.h"
 
 namespace FlexFlow {
 
-MachineMappingResultWithMemory empty_machine_mapping_result_with_memory() {
-  return MachineMappingResultWithMemory{
+MachineMappingWithMemoryResult empty_machine_mapping_with_memory_result() {
+  return MachineMappingWithMemoryResult{
       {},
   };
 }
 
-MachineMappingResultWithMemory get_mapping_with_minimal_runtime(
-    std::unordered_set<MachineMappingResultWithMemory> const &candidates) {
-  MachineMappingResultWithMemory result =
-      empty_machine_mapping_result_with_memory();
+MachineMappingWithMemoryResult get_mapping_with_minimal_runtime(
+    std::unordered_set<MachineMappingWithMemoryResult> const &candidates) {
+  MachineMappingWithMemoryResult result =
+      empty_machine_mapping_with_memory_result();
 
-  for (MachineMappingResultWithMemory const &candidate : candidates) {
+  for (MachineMappingWithMemoryResult const &candidate : candidates) {
     result = minimize_runtime(result, candidate);
   }
 
   return result;
 }
 
-MachineMappingResultWithMemory remove_non_dominating_machine_mapping_result(
-    MachineMappingResultWithMemory const &result) {
-  std::unordered_set<SingleMachineMapping> non_dominating_mappings;
+MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result(
+    MachineMappingWithMemoryResult const &result) {
+  std::unordered_set<SingleMachineMapping> non_pareto_optimal_mappings;
   for (SingleMachineMapping const &mapping : result.machine_mappings) {
-    bool is_dominating = true;
+    bool is_pareto_optimal = true;
     for (SingleMachineMapping const &other_mapping : result.machine_mappings) {
       if (mapping.cost.runtime >= other_mapping.cost.runtime &&
           mapping.cost.memory >= other_mapping.cost.memory &&
           mapping != other_mapping) {
-        is_dominating = false;
+        is_pareto_optimal = false;
         break;
       }
     }
-    if (is_dominating) {
-      non_dominating_mappings.insert(mapping);
+    if (is_pareto_optimal) {
+      non_pareto_optimal_mappings.insert(mapping);
     }
   }
-  return MachineMappingResultWithMemory{std::move(non_dominating_mappings)};
+  return MachineMappingWithMemoryResult{std::move(non_pareto_optimal_mappings)};
 }
 
-MachineMappingResultWithMemory
+MachineMappingWithMemoryResult
     series_combine(float comm_cost,
-                   MachineMappingResultWithMemory const &pre_result,
-                   MachineMappingResultWithMemory const &post_result,
+                   MachineMappingWithMemoryResult const &pre_result,
+                   MachineMappingWithMemoryResult const &post_result,
                    std::optional<ParallelSplitTransformation> const
                        &parallel_split_transformation) {
   auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm,
                                      SingleMachineMapping const &post_mm) {
-    CostMetric cost = CostMetric{
+    OpCostMetrics cost = OpCostMetrics{
         pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
         pre_mm.cost.memory + post_mm.cost.memory,
     };
@@ -71,23 +71,23 @@ MachineMappingResultWithMemory
     return SingleMachineMapping{cost, mapping};
   };
 
-  MachineMappingResultWithMemory result =
-      empty_machine_mapping_result_with_memory();
+  MachineMappingWithMemoryResult result =
+      empty_machine_mapping_with_memory_result();
   for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) {
     for (SingleMachineMapping const &post_mm : post_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm));
     }
   }
 
-  return remove_non_dominating_machine_mapping_result(result);
+  return remove_non_pareto_optimal_machine_mapping_result(result);
 }
 
-MachineMappingResultWithMemory
-    parallel_combine(MachineMappingResultWithMemory const &lhs_result,
-                     MachineMappingResultWithMemory const &rhs_result) {
+MachineMappingWithMemoryResult
+    parallel_combine(MachineMappingWithMemoryResult const &lhs_result,
+                     MachineMappingWithMemoryResult const &rhs_result) {
   auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm,
                                      SingleMachineMapping const &rhs_mm) {
-    CostMetric cost = CostMetric{
+    OpCostMetrics cost = OpCostMetrics{
         std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
         std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
     };
@@ -98,30 +98,30 @@ MachineMappingResultWithMemory
     return SingleMachineMapping{cost, mapping};
   };
 
-  MachineMappingResultWithMemory result =
-      empty_machine_mapping_result_with_memory();
+  MachineMappingWithMemoryResult result =
+      empty_machine_mapping_with_memory_result();
   for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) {
     for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm));
     }
   }
 
-  return remove_non_dominating_machine_mapping_result(result);
+  return remove_non_pareto_optimal_machine_mapping_result(result);
 }
 
-MachineMappingResultWithMemory
-    minimize_runtime(MachineMappingResultWithMemory const &m1,
-                     MachineMappingResultWithMemory const &m2) {
-  MachineMappingResultWithMemory result = MachineMappingResultWithMemory{
+MachineMappingWithMemoryResult
+    minimize_runtime(MachineMappingWithMemoryResult const &m1,
+                     MachineMappingWithMemoryResult const &m2) {
+  MachineMappingWithMemoryResult result = MachineMappingWithMemoryResult{
       set_union(m1.machine_mappings, m2.machine_mappings),
   };
-  return remove_non_dominating_machine_mapping_result(result);
+  return remove_non_pareto_optimal_machine_mapping_result(result);
 }
 
-MachineMappingResultWithMemory
-    make_singleton_machine_mapping_result_with_memory(
-        CostMetric cost, MachineView const &machine_view) {
-  return MachineMappingResultWithMemory{{
+MachineMappingWithMemoryResult
+    make_singleton_machine_mapping_with_memory_result(
+        OpCostMetrics cost, MachineView const &machine_view) {
+  return MachineMappingWithMemoryResult{{
       SingleMachineMapping{
           cost,
           ParallelLayerGuidObliviousMachineMapping{{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
index b55b4d283c..6ebfc45a6f 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
@@ -5,16 +5,13 @@
 namespace FlexFlow {
 
 TestCostEstimator::TestCostEstimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
-        &get_communication_cost,
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost_with_memory)
+        &get_communication_cost)
     : get_operator_cost(get_operator_cost),
-      get_communication_cost(get_communication_cost),
-      get_operator_cost_with_memory(get_operator_cost_with_memory) {}
+      get_communication_cost(get_communication_cost) {}
 
-float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+OpCostMetrics TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->get_operator_cost(k);
 }
 
@@ -22,58 +19,21 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
   return this->get_communication_cost(m);
 }
 
-CostMetric TestCostEstimator::estimate_cost_with_memory(
-    OpCostEstimateKey const &k) const {
-  return this->get_operator_cost_with_memory(k);
-}
-
 CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost) {
-  auto get_operator_cost_with_memory = [=](OpCostEstimateKey const &k) {
-    return CostMetric{
-        get_operator_cost(k),
-        0,
-    };
-  };
-
-  return make_fake_cost_estimator(
-      get_operator_cost, get_communication_cost, get_operator_cost_with_memory);
-}
-
-CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, float> const &comm_cost_map) {
-  return make_fake_cost_estimator(
-      [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); },
-      [comm_cost_map](TensorSetMovement const &m) {
-        return comm_cost_map.at(m);
-      });
-}
-
-CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
-    std::function<float(TensorSetMovement const &)> const
-        &get_communication_cost,
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost_with_memory) {
   return CostEstimator::create<TestCostEstimator>(
-      get_operator_cost, get_communication_cost, get_operator_cost_with_memory);
+      get_operator_cost, get_communication_cost);
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, float> const &comm_cost_map,
-    std::unordered_map<OpCostEstimateKey, CostMetric> const
-        &op_cost_with_memory_map) {
+    std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map,
+    std::unordered_map<TensorSetMovement, float> const &comm_cost_map) {
   return make_fake_cost_estimator(
       [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); },
       [comm_cost_map](TensorSetMovement const &m) {
         return comm_cost_map.at(m);
-      },
-      [op_cost_with_memory_map](OpCostEstimateKey const &k) {
-        return op_cost_with_memory_map.at(k);
       });
 }
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
index 302421f873..7fb4bcc6f8 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
@@ -11,48 +11,28 @@
 namespace FlexFlow {
 
 struct TestCostEstimator : public ICostEstimator {
-  std::function<float(OpCostEstimateKey const &)> get_operator_cost;
+  std::function<OpCostMetrics(OpCostEstimateKey const &)> get_operator_cost;
   std::function<float(TensorSetMovement const &)> get_communication_cost;
-  std::function<CostMetric(OpCostEstimateKey const &)>
-      get_operator_cost_with_memory;
 
   TestCostEstimator() = delete;
   TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost,
                     decltype(get_communication_cost)
-                        const &get_communication_cost,
-                    decltype(get_operator_cost_with_memory)
-                        const &get_operator_cost_with_memory);
+                        const &get_communication_cost);
 
-  float estimate_cost(OpCostEstimateKey const &) const override;
+  OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override;
 
   float estimate_cost(TensorSetMovement const &) const override;
-
-  CostMetric
-      estimate_cost_with_memory(OpCostEstimateKey const &) const override;
 };
 
 CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost);
 
 CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
+    std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map,
     std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
 
-CostEstimator make_fake_cost_estimator(
-    std::function<float(OpCostEstimateKey const &)> const &get_operator_cost,
-    std::function<float(TensorSetMovement const &)> const
-        &get_communication_cost,
-    std::function<CostMetric(OpCostEstimateKey const &)> const
-        &get_operator_cost_with_memory);
-
-CostEstimator make_fake_cost_estimator(
-    std::unordered_map<OpCostEstimateKey, float> const &op_cost_map,
-    std::unordered_map<TensorSetMovement, float> const &comm_cost_map,
-    std::unordered_map<OpCostEstimateKey, CostMetric> const
-        &op_cost_with_memory_map);
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index a0d06fe930..14a8b2e014 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -144,13 +144,19 @@ TEST_SUITE(FF_TEST_SUITE) {
             {binary_tree_root_path(), mv2},
         }};
 
+    printf("Before constructing cost_estimator\n");
+
+    auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
+        }};
+
+    printf("After constructing map1\n");
+
     CostEstimator cost_estimator = make_fake_cost_estimator(
-        std::unordered_map<OpCostEstimateKey, float>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5},
-        }},
+        map1,
         std::unordered_map<TensorSetMovement, float>{{
             {TensorSetMovement{{}}, 0.0},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
@@ -163,11 +169,34 @@ TEST_SUITE(FF_TEST_SUITE) {
              0.4},
         }});
 
+    // CostEstimator cost_estimator = make_fake_cost_estimator(
+    //     std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
+    //         {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
+    //         {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
+    //         {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
+    //         {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
+    //     }},
+    //     std::unordered_map<TensorSetMovement, float>{{
+    //         {TensorSetMovement{{}}, 0.0},
+    //         {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
+    //          0.1},
+    //         {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
+    //          0.2},
+    //         {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
+    //          0.3},
+    //         {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
+    //          0.4},
+    //     }});
+
+    printf("After constructing cost_estimator\n");
+
     MachineMappingContext context = MachineMappingContext{
         cost_estimator,
         allowed_machine_views1,
     };
 
+    printf("After constructing context\n");
+
     MachineMappingCache cache = empty_machine_mapping_cache();
 
     SUBCASE("single layer") {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 566af800ea..440ebde343 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -4,7 +4,7 @@
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h"
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 #include "utils/containers/get_only.h"
@@ -145,11 +145,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         }};
 
     CostEstimator cost_estimator = make_fake_cost_estimator(
-        std::unordered_map<OpCostEstimateKey, float>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5},
+        std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
+            {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics{1.0, 2}},
+            {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics{2.0, 3}},
+            {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics{1.5, 1}},
+            {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics{2.5, 2}},
         }},
         std::unordered_map<TensorSetMovement, float>{{
             {TensorSetMovement{{}}, 0.0},
@@ -161,12 +161,6 @@ TEST_SUITE(FF_TEST_SUITE) {
              0.3},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
              0.4},
-        }},
-        std::unordered_map<OpCostEstimateKey, CostMetric>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 2}},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 3}},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 1}},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 2}},
         }});
 
     MachineMappingContext context = MachineMappingContext{
@@ -174,8 +168,8 @@ TEST_SUITE(FF_TEST_SUITE) {
         allowed_machine_views1,
     };
 
-    MachineMappingCacheWithMemory cache =
-        empty_machine_mapping_cache_with_memory();
+    MachineMappingWithMemoryCache cache =
+        empty_machine_mapping_with_memory_cache();
 
     SUBCASE("single layer") {
       MachineMappingProblemTree problem_tree = make_leaf(k1);
@@ -184,18 +178,18 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResultWithMemory result =
+      MachineMappingWithMemoryResult result =
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
-      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+      MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           SingleMachineMapping{
-              CostMetric{1.0, 2},
+              OpCostMetrics{1.0, 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv1},
               }},
           },
           SingleMachineMapping{
-              CostMetric{1.5, 1},
+              OpCostMetrics{1.5, 1},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv2},
               }},
@@ -213,12 +207,12 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResultWithMemory result =
+      MachineMappingWithMemoryResult result =
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
-      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+      MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           SingleMachineMapping{
-              CostMetric{1.0 + 2.0 + 0.1, 2 + 3},
+              OpCostMetrics{1.0 + 2.0 + 0.1, 2 + 3},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
@@ -235,7 +229,7 @@ TEST_SUITE(FF_TEST_SUITE) {
               }},
           },
           SingleMachineMapping{
-              CostMetric{1.5 + 2.5 + 0.1, 1 + 2},
+              OpCostMetrics{1.5 + 2.5 + 0.1, 1 + 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
@@ -264,12 +258,12 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_unconstrained_solution_for_layers(
               get_all_leaf_paths(problem_tree));
 
-      MachineMappingResultWithMemory result =
+      MachineMappingWithMemoryResult result =
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
-      MachineMappingResultWithMemory correct =
-          MachineMappingResultWithMemory{{SingleMachineMapping{
-              CostMetric{2.5, 2},
+      MachineMappingWithMemoryResult correct =
+          MachineMappingWithMemoryResult{{SingleMachineMapping{
+              OpCostMetrics{2.5, 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
index 6ca551c436..bdd58f8717 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -1,11 +1,11 @@
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "pcg/machine_view.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("remove_non_dominating_machine_mapping_result") {
+  TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") {
     MachineView machine_view_0 = MachineView{
         /*start=*/MachineSpaceCoordinate{
             /*node_idx=*/0,
@@ -51,15 +51,15 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostMetric cost1 = CostMetric{
+    OpCostMetrics cost1 = OpCostMetrics{
         2.0,
         2,
     };
-    CostMetric cost2 = CostMetric{
+    OpCostMetrics cost2 = OpCostMetrics{
         4.0,
         1,
     };
-    CostMetric cost3 = CostMetric{
+    OpCostMetrics cost3 = OpCostMetrics{
         2.0,
         3,
     };
@@ -101,41 +101,41 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     SUBCASE("empty") {
-      MachineMappingResultWithMemory to_remove =
-          empty_machine_mapping_result_with_memory();
-      MachineMappingResultWithMemory result =
-          remove_non_dominating_machine_mapping_result(to_remove);
-      MachineMappingResultWithMemory correct =
-          empty_machine_mapping_result_with_memory();
+      MachineMappingWithMemoryResult to_remove =
+          empty_machine_mapping_with_memory_result();
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(to_remove);
+      MachineMappingWithMemoryResult correct =
+          empty_machine_mapping_with_memory_result();
 
       CHECK(result == correct);
     }
 
-    SUBCASE("no non-dominating") {
-      MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{
+    SUBCASE("no non-pareto_optimal") {
+      MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{
           {
               mm1,
               mm2,
           },
       };
-      MachineMappingResultWithMemory result =
-          remove_non_dominating_machine_mapping_result(to_remove);
-      MachineMappingResultWithMemory correct = to_remove;
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(to_remove);
+      MachineMappingWithMemoryResult correct = to_remove;
 
       CHECK(result == correct);
     }
 
-    SUBCASE("non-dominating") {
-      MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{
+    SUBCASE("non-pareto_optimal") {
+      MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{
           {
               mm1,
               mm2,
               mm3,
           },
       };
-      MachineMappingResultWithMemory result =
-          remove_non_dominating_machine_mapping_result(to_remove);
-      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+      MachineMappingWithMemoryResult result =
+          remove_non_pareto_optimal_machine_mapping_result(to_remove);
+      MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{
           {
               mm1,
               mm2,
@@ -177,11 +177,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostMetric pre_cost = CostMetric{
+    OpCostMetrics pre_cost = OpCostMetrics{
         2.0,
         2,
     };
-    MachineMappingResultWithMemory pre = MachineMappingResultWithMemory{{
+    MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
             pre_cost,
             ParallelLayerGuidObliviousMachineMapping{
@@ -203,12 +203,12 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     }};
 
-    CostMetric post_cost = CostMetric{
+    OpCostMetrics post_cost = OpCostMetrics{
         4.0,
         1,
     };
 
-    MachineMappingResultWithMemory post = MachineMappingResultWithMemory{{
+    MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
             post_cost,
             ParallelLayerGuidObliviousMachineMapping{
@@ -222,33 +222,33 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     }};
 
-    MachineMappingResultWithMemory empty =
-        empty_machine_mapping_result_with_memory();
+    MachineMappingWithMemoryResult empty =
+        empty_machine_mapping_with_memory_result();
 
     float comm_cost = 3.0;
 
     SUBCASE("pre is empty") {
-      MachineMappingResultWithMemory result = series_combine(
+      MachineMappingWithMemoryResult result = series_combine(
           comm_cost, empty, post, ParallelSplitTransformation::LthenR);
-      MachineMappingResultWithMemory correct = empty;
+      MachineMappingWithMemoryResult correct = empty;
 
       CHECK(result == correct);
     }
 
     SUBCASE("post is empty") {
-      MachineMappingResultWithMemory result = series_combine(
+      MachineMappingWithMemoryResult result = series_combine(
           comm_cost, pre, empty, ParallelSplitTransformation::LthenR);
-      MachineMappingResultWithMemory correct = empty;
+      MachineMappingWithMemoryResult correct = empty;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are nonempty") {
-      MachineMappingResultWithMemory no_parallel_split_transform =
-          MachineMappingResultWithMemory{
+      MachineMappingWithMemoryResult no_parallel_split_transform =
+          MachineMappingWithMemoryResult{
               {
                   SingleMachineMapping{
-                      /*cost=*/CostMetric{
+                      /*cost=*/OpCostMetrics{
                           pre_cost.runtime + comm_cost + post_cost.runtime,
                           pre_cost.memory + post_cost.memory,
                       },
@@ -280,28 +280,28 @@ TEST_SUITE(FF_TEST_SUITE) {
           };
 
       SUBCASE("parallel_split_transformation = std::nullopt") {
-        MachineMappingResultWithMemory result =
+        MachineMappingWithMemoryResult result =
             series_combine(comm_cost, pre, post, std::nullopt);
-        MachineMappingResultWithMemory correct = no_parallel_split_transform;
+        MachineMappingWithMemoryResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = LthenR") {
-        MachineMappingResultWithMemory result = series_combine(
+        MachineMappingWithMemoryResult result = series_combine(
             comm_cost, pre, post, ParallelSplitTransformation::LthenR);
-        MachineMappingResultWithMemory correct = no_parallel_split_transform;
+        MachineMappingWithMemoryResult correct = no_parallel_split_transform;
 
         CHECK(result == correct);
       }
 
       SUBCASE("parallel_split_transformation = RthenL") {
-        MachineMappingResultWithMemory result = series_combine(
+        MachineMappingWithMemoryResult result = series_combine(
             comm_cost, pre, post, ParallelSplitTransformation::RthenL);
-        MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+        MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{
             {
                 SingleMachineMapping{
-                    /*cost=*/CostMetric{
+                    /*cost=*/OpCostMetrics{
                         pre_cost.runtime + comm_cost + post_cost.runtime,
                         pre_cost.memory + post_cost.memory,
                     },
@@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostMetric lhs_cost = CostMetric{
+    OpCostMetrics lhs_cost = OpCostMetrics{
         2.0,
         2,
     };
-    MachineMappingResultWithMemory lhs = MachineMappingResultWithMemory{{
+    MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
             lhs_cost,
             ParallelLayerGuidObliviousMachineMapping{
@@ -394,11 +394,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     }};
 
-    CostMetric rhs_cost = CostMetric{
+    OpCostMetrics rhs_cost = OpCostMetrics{
         4.0,
         1,
     };
-    MachineMappingResultWithMemory rhs = MachineMappingResultWithMemory{{
+    MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
             rhs_cost,
             ParallelLayerGuidObliviousMachineMapping{
@@ -412,28 +412,28 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     }};
 
-    MachineMappingResultWithMemory empty =
-        empty_machine_mapping_result_with_memory();
+    MachineMappingWithMemoryResult empty =
+        empty_machine_mapping_with_memory_result();
 
     SUBCASE("lhs is empty") {
-      MachineMappingResultWithMemory result = parallel_combine(empty, rhs);
-      MachineMappingResultWithMemory correct = empty;
+      MachineMappingWithMemoryResult result = parallel_combine(empty, rhs);
+      MachineMappingWithMemoryResult correct = empty;
 
       CHECK(result == correct);
     }
 
     SUBCASE("rhs is empty") {
-      MachineMappingResultWithMemory result = parallel_combine(lhs, empty);
-      MachineMappingResultWithMemory correct = empty;
+      MachineMappingWithMemoryResult result = parallel_combine(lhs, empty);
+      MachineMappingWithMemoryResult correct = empty;
 
       CHECK(result == correct);
     }
 
     SUBCASE("both are nonempty") {
-      MachineMappingResultWithMemory result = parallel_combine(lhs, rhs);
-      MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{
+      MachineMappingWithMemoryResult result = parallel_combine(lhs, rhs);
+      MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           SingleMachineMapping{
-              /*cost=*/CostMetric{
+              /*cost=*/OpCostMetrics{
                   std::max(lhs_cost.runtime, rhs_cost.runtime),
                   std::max(lhs_cost.memory, rhs_cost.memory),
               },
@@ -509,15 +509,15 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostMetric cost1 = CostMetric{
+    OpCostMetrics cost1 = OpCostMetrics{
         2.0,
         2,
     };
-    CostMetric cost2 = CostMetric{
+    OpCostMetrics cost2 = OpCostMetrics{
         4.0,
         1,
     };
-    CostMetric cost3 = CostMetric{
+    OpCostMetrics cost3 = OpCostMetrics{
         2.0,
         3,
     };
@@ -558,22 +558,22 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    MachineMappingResultWithMemory result1 = MachineMappingResultWithMemory{
+    MachineMappingWithMemoryResult result1 = MachineMappingWithMemoryResult{
         {
             mm1,
             mm2,
         },
     };
 
-    MachineMappingResultWithMemory result2 = MachineMappingResultWithMemory{
+    MachineMappingWithMemoryResult result2 = MachineMappingWithMemoryResult{
         {
             mm2,
             mm3,
         },
     };
 
-    MachineMappingResultWithMemory result = minimize_runtime(result1, result2);
-    MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{
+    MachineMappingWithMemoryResult result = minimize_runtime(result1, result2);
+    MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{
         {
             mm1,
             mm2,

From 03151607792794f430b730a5c6605c4388f0a204 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 18 Dec 2024 10:13:54 -0500
Subject: [PATCH 08/16] fmt

---
 .../compiler/cost_estimator/cost_estimator.h  |  2 +-
 ...get_optimal_machine_mapping_with_memory.cc |  2 +-
 .../cost_estimator_for_test.cc                | 13 +++++++-----
 .../machine_mapping/cost_estimator_for_test.h |  3 ++-
 .../get_optimal_machine_mapping.cc            | 20 ++++++++++---------
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
index 9b006f178a..ecaffa337b 100644
--- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
+++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H
 
-#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/cost_estimator/tensor_set_movement.dtg.h"
 #include "op-attrs/parallel_tensor_shape.dtg.h"
 #include "op-attrs/pcg_operator_attrs.dtg.h"
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 96a67afaab..b67083e8cd 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -4,8 +4,8 @@
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
-#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h"
+#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "compiler/machine_mapping/transitive_reduced_pcg.h"
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
index 6ebfc45a6f..0431104878 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
@@ -5,13 +5,15 @@
 namespace FlexFlow {
 
 TestCostEstimator::TestCostEstimator(
-    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const
+        &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost)
     : get_operator_cost(get_operator_cost),
       get_communication_cost(get_communication_cost) {}
 
-OpCostMetrics TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
+OpCostMetrics
+    TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const {
   return this->get_operator_cost(k);
 }
 
@@ -20,11 +22,12 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const {
 }
 
 CostEstimator make_fake_cost_estimator(
-    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const
+        &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost) {
-  return CostEstimator::create<TestCostEstimator>(
-      get_operator_cost, get_communication_cost);
+  return CostEstimator::create<TestCostEstimator>(get_operator_cost,
+                                                  get_communication_cost);
 }
 
 CostEstimator make_fake_cost_estimator(
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
index 7fb4bcc6f8..16ea3a85bc 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
@@ -25,7 +25,8 @@ struct TestCostEstimator : public ICostEstimator {
 };
 
 CostEstimator make_fake_cost_estimator(
-    std::function<OpCostMetrics(OpCostEstimateKey const &)> const &get_operator_cost,
+    std::function<OpCostMetrics(OpCostEstimateKey const &)> const
+        &get_operator_cost,
     std::function<float(TensorSetMovement const &)> const
         &get_communication_cost);
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 14a8b2e014..81665fbb94 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -147,11 +147,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     printf("Before constructing cost_estimator\n");
 
     auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
-            {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
-            {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
-            {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
-            {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
-        }};
+        {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
+        {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
+        {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
+        {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
+    }};
 
     printf("After constructing map1\n");
 
@@ -171,10 +171,12 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     // CostEstimator cost_estimator = make_fake_cost_estimator(
     //     std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
-    //         {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
-    //         {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
-    //         {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
-    //         {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
+    //         {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0,
+    //         0)}, {map_unmapped_op_cost_estimate_key(k2, mv1),
+    //         OpCostMetrics(2.0, 0)}, {map_unmapped_op_cost_estimate_key(k1,
+    //         mv2), OpCostMetrics(1.5, 0)},
+    //         {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5,
+    //         0)},
     //     }},
     //     std::unordered_map<TensorSetMovement, float>{{
     //         {TensorSetMovement{{}}, 0.0},

From 855a7d53716ae5f7b3eb4fb51508c58530d2a4a4 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Mon, 30 Dec 2024 01:06:56 -0500
Subject: [PATCH 09/16] fix

---
 .../get_optimal_machine_mapping.cc            |  41 ++-----
 ...get_optimal_machine_mapping_with_memory.cc |   5 +-
 .../machine_mapping_result_with_memory.cc     | 104 ++++++++++--------
 3 files changed, 68 insertions(+), 82 deletions(-)

diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 81665fbb94..f5d5a5ee1b 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -144,17 +144,17 @@ TEST_SUITE(FF_TEST_SUITE) {
             {binary_tree_root_path(), mv2},
         }};
 
-    printf("Before constructing cost_estimator\n");
-
     auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
-        {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)},
-        {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)},
-        {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)},
-        {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)},
+        {map_unmapped_op_cost_estimate_key(k1, mv1),
+         OpCostMetrics{/*runtime=*/1.0, /*memory=*/0}},
+        {map_unmapped_op_cost_estimate_key(k2, mv1),
+         OpCostMetrics{/*runtime=*/2.0, /*memory=*/0}},
+        {map_unmapped_op_cost_estimate_key(k1, mv2),
+         OpCostMetrics{/*runtime=*/1.5, /*memory=*/0}},
+        {map_unmapped_op_cost_estimate_key(k2, mv2),
+         OpCostMetrics{/*runtime=*/2.5, /*memory=*/0}},
     }};
 
-    printf("After constructing map1\n");
-
     CostEstimator cost_estimator = make_fake_cost_estimator(
         map1,
         std::unordered_map<TensorSetMovement, float>{{
@@ -169,36 +169,11 @@ TEST_SUITE(FF_TEST_SUITE) {
              0.4},
         }});
 
-    // CostEstimator cost_estimator = make_fake_cost_estimator(
-    //     std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
-    //         {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0,
-    //         0)}, {map_unmapped_op_cost_estimate_key(k2, mv1),
-    //         OpCostMetrics(2.0, 0)}, {map_unmapped_op_cost_estimate_key(k1,
-    //         mv2), OpCostMetrics(1.5, 0)},
-    //         {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5,
-    //         0)},
-    //     }},
-    //     std::unordered_map<TensorSetMovement, float>{{
-    //         {TensorSetMovement{{}}, 0.0},
-    //         {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
-    //          0.1},
-    //         {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
-    //          0.2},
-    //         {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
-    //          0.3},
-    //         {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
-    //          0.4},
-    //     }});
-
-    printf("After constructing cost_estimator\n");
-
     MachineMappingContext context = MachineMappingContext{
         cost_estimator,
         allowed_machine_views1,
     };
 
-    printf("After constructing context\n");
-
     MachineMappingCache cache = empty_machine_mapping_cache();
 
     SUBCASE("single layer") {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 440ebde343..063f6a9826 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -212,7 +212,10 @@ TEST_SUITE(FF_TEST_SUITE) {
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           SingleMachineMapping{
-              OpCostMetrics{1.0 + 2.0 + 0.1, 2 + 3},
+              OpCostMetrics{
+                  /*runtime=*/1.0 + 2.0 + 0.1,
+                  /*memory=*/2 + 3,
+              },
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
index bdd58f8717..3a28576193 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -52,16 +52,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        2.0,
-        2,
+        /*runtime=*/2.0,
+        /*memory=*/2,
     };
     OpCostMetrics cost2 = OpCostMetrics{
-        4.0,
-        1,
+        /*runtime=*/4.0,
+        /*memory=*/1,
     };
     OpCostMetrics cost3 = OpCostMetrics{
-        2.0,
-        3,
+        /*runtime=*/2.0,
+        /*memory=*/3,
     };
 
     SingleMachineMapping mm1 = SingleMachineMapping{
@@ -101,40 +101,42 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     SUBCASE("empty") {
-      MachineMappingWithMemoryResult to_remove =
+      MachineMappingWithMemoryResult before_remove =
           empty_machine_mapping_with_memory_result();
       MachineMappingWithMemoryResult result =
-          remove_non_pareto_optimal_machine_mapping_result(to_remove);
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
       MachineMappingWithMemoryResult correct =
           empty_machine_mapping_with_memory_result();
 
       CHECK(result == correct);
     }
 
-    SUBCASE("no non-pareto_optimal") {
-      MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{
-          {
-              mm1,
-              mm2,
-          },
-      };
+    SUBCASE("all solutions are pareto-optimal") {
+      MachineMappingWithMemoryResult before_remove =
+          MachineMappingWithMemoryResult{
+              {
+                  mm1,
+                  mm2,
+              },
+          };
       MachineMappingWithMemoryResult result =
-          remove_non_pareto_optimal_machine_mapping_result(to_remove);
-      MachineMappingWithMemoryResult correct = to_remove;
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
+      MachineMappingWithMemoryResult correct = before_remove;
 
       CHECK(result == correct);
     }
 
-    SUBCASE("non-pareto_optimal") {
-      MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{
-          {
-              mm1,
-              mm2,
-              mm3,
-          },
-      };
+    SUBCASE("there exists a non-pareto-optimal solution") {
+      MachineMappingWithMemoryResult before_remove =
+          MachineMappingWithMemoryResult{
+              {
+                  mm1,
+                  mm2,
+                  mm3,
+              },
+          };
       MachineMappingWithMemoryResult result =
-          remove_non_pareto_optimal_machine_mapping_result(to_remove);
+          remove_non_pareto_optimal_machine_mapping_result(before_remove);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{
           {
               mm1,
@@ -146,7 +148,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
   }
 
-  TEST_CASE("series_combine(memory)") {
+  TEST_CASE("series_combine(float, MachineMappingWithMemoryResult const &, "
+            "MachineMappingWithMemoryResult const &, "
+            "std::optional<ParallelSplitTransformation> const&)") {
     MachineView machine_view_0 = MachineView{
         /*start=*/MachineSpaceCoordinate{
             /*node_idx=*/0,
@@ -178,8 +182,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics pre_cost = OpCostMetrics{
-        2.0,
-        2,
+        /*runtime=*/2.0,
+        /*memory=*/2,
     };
     MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
@@ -204,8 +208,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics post_cost = OpCostMetrics{
-        4.0,
-        1,
+        /*runtime=*/4.0,
+        /*memory=*/1,
     };
 
     MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{
@@ -249,8 +253,9 @@ TEST_SUITE(FF_TEST_SUITE) {
               {
                   SingleMachineMapping{
                       /*cost=*/OpCostMetrics{
-                          pre_cost.runtime + comm_cost + post_cost.runtime,
-                          pre_cost.memory + post_cost.memory,
+                          /*runtime=*/pre_cost.runtime + comm_cost +
+                              post_cost.runtime,
+                          /*memory=*/pre_cost.memory + post_cost.memory,
                       },
                       /*machine_mapping=*/
                       ParallelLayerGuidObliviousMachineMapping{{
@@ -302,8 +307,9 @@ TEST_SUITE(FF_TEST_SUITE) {
             {
                 SingleMachineMapping{
                     /*cost=*/OpCostMetrics{
-                        pre_cost.runtime + comm_cost + post_cost.runtime,
-                        pre_cost.memory + post_cost.memory,
+                        /*runtime=*/pre_cost.runtime + comm_cost +
+                            post_cost.runtime,
+                        /*memory=*/pre_cost.memory + post_cost.memory,
                     },
                     /*machine_mapping=*/
                     ParallelLayerGuidObliviousMachineMapping{{
@@ -337,7 +343,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
   }
 
-  TEST_CASE("parallel_combine(memory)") {
+  TEST_CASE("parallel_combine(float, MachineMappingWithMemoryResult const &, "
+            "MachineMappingWithMemoryResult const &, "
+            "std::optional<ParallelSplitTransformation> const&)") {
     MachineView machine_view_0 = MachineView{
         /*start=*/MachineSpaceCoordinate{
             /*node_idx=*/0,
@@ -369,8 +377,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics lhs_cost = OpCostMetrics{
-        2.0,
-        2,
+        /*runtime=*/2.0,
+        /*memory=*/2,
     };
     MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
@@ -395,8 +403,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics rhs_cost = OpCostMetrics{
-        4.0,
-        1,
+        /*runtime=*/4.0,
+        /*memory=*/1,
     };
     MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{
         SingleMachineMapping{
@@ -434,8 +442,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           SingleMachineMapping{
               /*cost=*/OpCostMetrics{
-                  std::max(lhs_cost.runtime, rhs_cost.runtime),
-                  std::max(lhs_cost.memory, rhs_cost.memory),
+                  /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime),
+                  /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory),
               },
               /*machine_mapping=*/
               ParallelLayerGuidObliviousMachineMapping{
@@ -510,16 +518,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        2.0,
-        2,
+        /*runtime=*/2.0,
+        /*memory=*/2,
     };
     OpCostMetrics cost2 = OpCostMetrics{
-        4.0,
-        1,
+        /*runtime=*/4.0,
+        /*memory=*/1,
     };
     OpCostMetrics cost3 = OpCostMetrics{
-        2.0,
-        3,
+        /*runtime=*/2.0,
+        /*memory=*/3,
     };
 
     SingleMachineMapping mm1 = SingleMachineMapping{

From 2b4e127e92e99b4f9a89e18056474129aeea69cd Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 8 Jan 2025 20:39:30 -0500
Subject: [PATCH 10/16] rename single machine mapping

---
 ...hine_mapping_for_single_layer.struct.toml} |  2 +-
 ...ine_mapping_with_memory_result.struct.toml |  4 +--
 .../machine_mapping_with_memory_result.cc     | 28 +++++++++----------
 ...get_optimal_machine_mapping_with_memory.cc | 10 +++----
 .../machine_mapping_result_with_memory.cc     | 26 ++++++++---------
 5 files changed, 35 insertions(+), 35 deletions(-)
 rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{single_machine_mapping.struct.toml => machine_mapping_for_single_layer.struct.toml} (90%)

diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml
similarity index 90%
rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml
index f33e320e3b..b61dd134c0 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SingleMachineMapping"
+name = "MachineMappingForSingleLayer"
 features = [
   "eq",
   "hash",
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
index 50de145b36..c1e1ee1cac 100644
--- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml
@@ -7,7 +7,7 @@ features = [
 ]
 
 includes = [
-  "compiler/machine_mapping/memory_optimization/single_machine_mapping.dtg.h",
+  "compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.dtg.h",
 ]
 
 src_includes = [
@@ -17,4 +17,4 @@ src_includes = [
 
 [[fields]]
 name = "machine_mappings"
-type = "std::unordered_set<::FlexFlow::SingleMachineMapping>"
+type = "std::unordered_set<::FlexFlow::MachineMappingForSingleLayer>"
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
index d38e4a7b6a..2f443e4fc5 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
@@ -25,10 +25,10 @@ MachineMappingWithMemoryResult get_mapping_with_minimal_runtime(
 
 MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result(
     MachineMappingWithMemoryResult const &result) {
-  std::unordered_set<SingleMachineMapping> non_pareto_optimal_mappings;
-  for (SingleMachineMapping const &mapping : result.machine_mappings) {
+  std::unordered_set<MachineMappingForSingleLayer> non_pareto_optimal_mappings;
+  for (MachineMappingForSingleLayer const &mapping : result.machine_mappings) {
     bool is_pareto_optimal = true;
-    for (SingleMachineMapping const &other_mapping : result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &other_mapping : result.machine_mappings) {
       if (mapping.cost.runtime >= other_mapping.cost.runtime &&
           mapping.cost.memory >= other_mapping.cost.memory &&
           mapping != other_mapping) {
@@ -49,8 +49,8 @@ MachineMappingWithMemoryResult
                    MachineMappingWithMemoryResult const &post_result,
                    std::optional<ParallelSplitTransformation> const
                        &parallel_split_transformation) {
-  auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm,
-                                     SingleMachineMapping const &post_mm) {
+  auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &pre_mm,
+                                     MachineMappingForSingleLayer const &post_mm) {
     OpCostMetrics cost = OpCostMetrics{
         pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
         pre_mm.cost.memory + post_mm.cost.memory,
@@ -68,13 +68,13 @@ MachineMappingWithMemoryResult
       }
     }();
 
-    return SingleMachineMapping{cost, mapping};
+    return MachineMappingForSingleLayer{cost, mapping};
   };
 
   MachineMappingWithMemoryResult result =
       empty_machine_mapping_with_memory_result();
-  for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) {
-    for (SingleMachineMapping const &post_mm : post_result.machine_mappings) {
+  for (MachineMappingForSingleLayer const &pre_mm : pre_result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &post_mm : post_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm));
     }
   }
@@ -85,8 +85,8 @@ MachineMappingWithMemoryResult
 MachineMappingWithMemoryResult
     parallel_combine(MachineMappingWithMemoryResult const &lhs_result,
                      MachineMappingWithMemoryResult const &rhs_result) {
-  auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm,
-                                     SingleMachineMapping const &rhs_mm) {
+  auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &lhs_mm,
+                                     MachineMappingForSingleLayer const &rhs_mm) {
     OpCostMetrics cost = OpCostMetrics{
         std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
         std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
@@ -95,13 +95,13 @@ MachineMappingWithMemoryResult
     ParallelLayerGuidObliviousMachineMapping mapping =
         binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping);
 
-    return SingleMachineMapping{cost, mapping};
+    return MachineMappingForSingleLayer{cost, mapping};
   };
 
   MachineMappingWithMemoryResult result =
       empty_machine_mapping_with_memory_result();
-  for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) {
-    for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) {
+  for (MachineMappingForSingleLayer const &lhs_mm : lhs_result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &rhs_mm : rhs_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm));
     }
   }
@@ -122,7 +122,7 @@ MachineMappingWithMemoryResult
     make_singleton_machine_mapping_with_memory_result(
         OpCostMetrics cost, MachineView const &machine_view) {
   return MachineMappingWithMemoryResult{{
-      SingleMachineMapping{
+      MachineMappingForSingleLayer{
           cost,
           ParallelLayerGuidObliviousMachineMapping{{
               {binary_tree_root_path(), machine_view},
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 063f6a9826..8761116be2 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -182,13 +182,13 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
-          SingleMachineMapping{
+          MachineMappingForSingleLayer{
               OpCostMetrics{1.0, 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv1},
               }},
           },
-          SingleMachineMapping{
+          MachineMappingForSingleLayer{
               OpCostMetrics{1.5, 1},
               ParallelLayerGuidObliviousMachineMapping{{
                   {binary_tree_root_path(), mv2},
@@ -211,7 +211,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
-          SingleMachineMapping{
+          MachineMappingForSingleLayer{
               OpCostMetrics{
                   /*runtime=*/1.0 + 2.0 + 0.1,
                   /*memory=*/2 + 3,
@@ -231,7 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) {
                   },
               }},
           },
-          SingleMachineMapping{
+          MachineMappingForSingleLayer{
               OpCostMetrics{1.5 + 2.5 + 0.1, 1 + 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -265,7 +265,7 @@ TEST_SUITE(FF_TEST_SUITE) {
           get_optimal_machine_mapping_with_memory(
               cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct =
-          MachineMappingWithMemoryResult{{SingleMachineMapping{
+          MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{
               OpCostMetrics{2.5, 2},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
index 3a28576193..a47d8713e9 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -64,7 +64,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*memory=*/3,
     };
 
-    SingleMachineMapping mm1 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{
         cost1,
         ParallelLayerGuidObliviousMachineMapping{
             {
@@ -76,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    SingleMachineMapping mm2 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm2 = MachineMappingForSingleLayer{
         cost2,
         ParallelLayerGuidObliviousMachineMapping{
             {
@@ -88,7 +88,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    SingleMachineMapping mm3 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{
         cost3,
         ParallelLayerGuidObliviousMachineMapping{
             {
@@ -186,7 +186,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*memory=*/2,
     };
     MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{
-        SingleMachineMapping{
+        MachineMappingForSingleLayer{
             pre_cost,
             ParallelLayerGuidObliviousMachineMapping{
                 {
@@ -213,7 +213,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{
-        SingleMachineMapping{
+        MachineMappingForSingleLayer{
             post_cost,
             ParallelLayerGuidObliviousMachineMapping{
                 {
@@ -251,7 +251,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       MachineMappingWithMemoryResult no_parallel_split_transform =
           MachineMappingWithMemoryResult{
               {
-                  SingleMachineMapping{
+                  MachineMappingForSingleLayer{
                       /*cost=*/OpCostMetrics{
                           /*runtime=*/pre_cost.runtime + comm_cost +
                               post_cost.runtime,
@@ -305,7 +305,7 @@ TEST_SUITE(FF_TEST_SUITE) {
             comm_cost, pre, post, ParallelSplitTransformation::RthenL);
         MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{
             {
-                SingleMachineMapping{
+                MachineMappingForSingleLayer{
                     /*cost=*/OpCostMetrics{
                         /*runtime=*/pre_cost.runtime + comm_cost +
                             post_cost.runtime,
@@ -381,7 +381,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*memory=*/2,
     };
     MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{
-        SingleMachineMapping{
+        MachineMappingForSingleLayer{
             lhs_cost,
             ParallelLayerGuidObliviousMachineMapping{
                 {
@@ -407,7 +407,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*memory=*/1,
     };
     MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{
-        SingleMachineMapping{
+        MachineMappingForSingleLayer{
             rhs_cost,
             ParallelLayerGuidObliviousMachineMapping{
                 {
@@ -440,7 +440,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("both are nonempty") {
       MachineMappingWithMemoryResult result = parallel_combine(lhs, rhs);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
-          SingleMachineMapping{
+          MachineMappingForSingleLayer{
               /*cost=*/OpCostMetrics{
                   /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime),
                   /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory),
@@ -530,7 +530,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*memory=*/3,
     };
 
-    SingleMachineMapping mm1 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{
         cost1,
         ParallelLayerGuidObliviousMachineMapping{
             {
@@ -542,7 +542,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    SingleMachineMapping mm2 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm2 = MachineMappingForSingleLayer{
         cost2,
         ParallelLayerGuidObliviousMachineMapping{
             {
@@ -554,7 +554,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    SingleMachineMapping mm3 = SingleMachineMapping{
+    MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{
         cost3,
         ParallelLayerGuidObliviousMachineMapping{
             {

From 50bae937f6acd0e77cb77733bfbd23da167747db Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 8 Jan 2025 21:08:47 -0500
Subject: [PATCH 11/16] format

---
 .../machine_mapping_with_memory_result.cc     | 80 ++++++++++---------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
index 2f443e4fc5..a6c2d1ed04 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
@@ -28,7 +28,8 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result(
   std::unordered_set<MachineMappingForSingleLayer> non_pareto_optimal_mappings;
   for (MachineMappingForSingleLayer const &mapping : result.machine_mappings) {
     bool is_pareto_optimal = true;
-    for (MachineMappingForSingleLayer const &other_mapping : result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &other_mapping :
+         result.machine_mappings) {
       if (mapping.cost.runtime >= other_mapping.cost.runtime &&
           mapping.cost.memory >= other_mapping.cost.memory &&
           mapping != other_mapping) {
@@ -49,32 +50,35 @@ MachineMappingWithMemoryResult
                    MachineMappingWithMemoryResult const &post_result,
                    std::optional<ParallelSplitTransformation> const
                        &parallel_split_transformation) {
-  auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &pre_mm,
-                                     MachineMappingForSingleLayer const &post_mm) {
-    OpCostMetrics cost = OpCostMetrics{
-        pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
-        pre_mm.cost.memory + post_mm.cost.memory,
-    };
-
-    ParallelLayerGuidObliviousMachineMapping mapping = [&] {
-      if (parallel_split_transformation.has_value() &&
-          parallel_split_transformation.value() ==
-              ParallelSplitTransformation::RthenL) {
-        return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping,
-                                       /*rhs=*/pre_mm.machine_mapping);
-      } else {
-        return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping,
-                                       /*rhs=*/post_mm.machine_mapping);
-      }
-    }();
-
-    return MachineMappingForSingleLayer{cost, mapping};
-  };
+  auto combine_machine_mapping =
+      [&](MachineMappingForSingleLayer const &pre_mm,
+          MachineMappingForSingleLayer const &post_mm) {
+        OpCostMetrics cost = OpCostMetrics{
+            pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
+            pre_mm.cost.memory + post_mm.cost.memory,
+        };
+
+        ParallelLayerGuidObliviousMachineMapping mapping = [&] {
+          if (parallel_split_transformation.has_value() &&
+              parallel_split_transformation.value() ==
+                  ParallelSplitTransformation::RthenL) {
+            return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping,
+                                           /*rhs=*/pre_mm.machine_mapping);
+          } else {
+            return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping,
+                                           /*rhs=*/post_mm.machine_mapping);
+          }
+        }();
+
+        return MachineMappingForSingleLayer{cost, mapping};
+      };
 
   MachineMappingWithMemoryResult result =
       empty_machine_mapping_with_memory_result();
-  for (MachineMappingForSingleLayer const &pre_mm : pre_result.machine_mappings) {
-    for (MachineMappingForSingleLayer const &post_mm : post_result.machine_mappings) {
+  for (MachineMappingForSingleLayer const &pre_mm :
+       pre_result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &post_mm :
+         post_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm));
     }
   }
@@ -85,23 +89,27 @@ MachineMappingWithMemoryResult
 MachineMappingWithMemoryResult
     parallel_combine(MachineMappingWithMemoryResult const &lhs_result,
                      MachineMappingWithMemoryResult const &rhs_result) {
-  auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &lhs_mm,
-                                     MachineMappingForSingleLayer const &rhs_mm) {
-    OpCostMetrics cost = OpCostMetrics{
-        std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
-        std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
-    };
+  auto combine_machine_mapping =
+      [&](MachineMappingForSingleLayer const &lhs_mm,
+          MachineMappingForSingleLayer const &rhs_mm) {
+        OpCostMetrics cost = OpCostMetrics{
+            std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
+            std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
+        };
 
-    ParallelLayerGuidObliviousMachineMapping mapping =
-        binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping);
+        ParallelLayerGuidObliviousMachineMapping mapping =
+            binary_combine_mappings(lhs_mm.machine_mapping,
+                                    rhs_mm.machine_mapping);
 
-    return MachineMappingForSingleLayer{cost, mapping};
-  };
+        return MachineMappingForSingleLayer{cost, mapping};
+      };
 
   MachineMappingWithMemoryResult result =
       empty_machine_mapping_with_memory_result();
-  for (MachineMappingForSingleLayer const &lhs_mm : lhs_result.machine_mappings) {
-    for (MachineMappingForSingleLayer const &rhs_mm : rhs_result.machine_mappings) {
+  for (MachineMappingForSingleLayer const &lhs_mm :
+       lhs_result.machine_mappings) {
+    for (MachineMappingForSingleLayer const &rhs_mm :
+         rhs_result.machine_mappings) {
       result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm));
     }
   }

From d96b678a9fbf883fb228377df898b4c8d1aab2b7 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 15 Jan 2025 17:07:28 -0500
Subject: [PATCH 12/16] top-level loop for compiler

---
 .../compiler/algorithm_config.variant.toml    |  18 ++
 lib/compiler/include/compiler/compiler.h      |  32 +---
 .../data_parallelism_config.struct.toml       |  14 ++
 .../graph_optimize_result.struct.toml         |  16 --
 .../machine_mapping/machine_mapping.h         |   5 +
 ...ne_mapping_problem_tree_result.struct.toml |  21 +++
 .../compiler/search_result.struct.toml        |  17 ++
 ...get_pcg_balanced_binary_sp_decomposition.h |   2 +
 .../include/compiler/unity_algorithm.h        |  24 ---
 .../allowed_machine_views.h                   |   0
 .../graph_optimize_state.h                    |  10 +-
 .../unity_algorithm/unity_algorithm.h         |  21 +++
 .../unity_search_config.struct.toml}          |   2 +-
 lib/compiler/src/compiler/compiler.cc         |  32 ++++
 .../src/compiler/graph_optimize_state.cc      |  85 ----------
 .../machine_mapping/machine_mapping.cc        |  39 +++++
 .../allowed_machine_views.cc                  |   2 +-
 .../unity_algorithm/graph_optimize_state.cc   |  49 ++++++
 .../unity_algorithm/unity_algorithm.cc        | 157 ++++++++++++++++++
 lib/compiler/src/unity_algorithm.cc           |  93 -----------
 .../test/src/allowed_machine_views.cc         |   2 +-
 lib/compiler/test/src/graph_optimize_state.cc | 129 +++++++-------
 lib/compiler/test/src/unity_algorithm.cc      |   3 +-
 lib/pcg/include/pcg/operator_task_space.h     |   5 +
 .../parallel_computation_graph.h              |   4 +
 lib/pcg/src/pcg/operator_task_space.cc        |   5 +
 .../parallel_computation_graph.cc             |   5 +
 .../binary_sp_decomposition_tree.h            |   6 +
 .../binary_sp_decomposition_tree.cc           |   7 +
 29 files changed, 489 insertions(+), 316 deletions(-)
 create mode 100644 lib/compiler/include/compiler/algorithm_config.variant.toml
 create mode 100644 lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml
 delete mode 100644 lib/compiler/include/compiler/graph_optimize_result.struct.toml
 create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml
 create mode 100644 lib/compiler/include/compiler/search_result.struct.toml
 delete mode 100644 lib/compiler/include/compiler/unity_algorithm.h
 rename lib/compiler/include/compiler/{ => unity_algorithm}/allowed_machine_views.h (100%)
 rename lib/compiler/include/compiler/{ => unity_algorithm}/graph_optimize_state.h (67%)
 create mode 100644 lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
 rename lib/compiler/include/compiler/{optimizer_config.struct.toml => unity_algorithm/unity_search_config.struct.toml} (90%)
 create mode 100644 lib/compiler/src/compiler/compiler.cc
 delete mode 100644 lib/compiler/src/compiler/graph_optimize_state.cc
 rename lib/compiler/src/compiler/{ => unity_algorithm}/allowed_machine_views.cc (98%)
 create mode 100644 lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
 create mode 100644 lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
 delete mode 100644 lib/compiler/src/unity_algorithm.cc

diff --git a/lib/compiler/include/compiler/algorithm_config.variant.toml b/lib/compiler/include/compiler/algorithm_config.variant.toml
new file mode 100644
index 0000000000..4e58104875
--- /dev/null
+++ b/lib/compiler/include/compiler/algorithm_config.variant.toml
@@ -0,0 +1,18 @@
+namespace = "FlexFlow"
+name = "AlgorithmConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/data_parallelism/data_parallelism_config.dtg.h",
+  "compiler/unity_algorithm/unity_search_config.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::DataParallelismConfig"
+
+[[values]]
+type = "::FlexFlow::UnitySearchConfig"
diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h
index 178ab19a53..3faacd8f16 100644
--- a/lib/compiler/include/compiler/compiler.h
+++ b/lib/compiler/include/compiler/compiler.h
@@ -1,42 +1,24 @@
 #ifndef _FLEXFLOW_COMPILER_COMPILER_H
 #define _FLEXFLOW_COMPILER_COMPILER_H
 
-#include "pcg/cost_values.h"
-#include "pcg/machine_view.h"
-#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
-#include "pcg/tensor_mapping.h"
+#include "compiler/algorithm_config.dtg.h"
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/search_result.dtg.h"
+#include "pcg/machine_specification.dtg.h"
 
 namespace FlexFlow {
 
 enum class SearchAlgorithm {
   DATA_PARALLEL,
-};
-
-using SearchAlgorithmConfig = std::variant<>;
-using SearchSolution = std::variant<>;
-
-struct SearchResult {
-  ParallelComputationGraph pcg;
-  TensorMapping tensor_mapping;
-  SearchSolution solution;
-  CostValues cost_values;
+  UNITY,
 };
 
 SearchResult optimize(ComputationGraph const &,
                       MachineSpecification const &,
                       CostEstimator const &,
                       SearchAlgorithm,
-                      optional<AlgorithmConfig> const &);
-
-// struct SearchSolution {
-//   LabelledMultiDiGraph<PCGOperatorAttrs, ParallelTensorShape> optimized_pcg;
-//   std::unordered_map<Node, MachineView> device_assignments;
-//   /* std::unordered_map<tensor_guid_t,
-//   std::unordered_set<parallel_tensor_guid_t>> tensor_mappings; */
-// };
-//
-// SearchSolution run_data_parallelize(ComputationGraph const &,
-// MachineSpecification const &);
+                      AlgorithmConfig const &,
+                      DeviceType);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml
new file mode 100644
index 0000000000..68512fa473
--- /dev/null
+++ b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml
@@ -0,0 +1,14 @@
+namespace = "FlexFlow"
+name = "DataParallelismConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+]
+
+[[fields]]
+name = "degree"
+type = "int"
diff --git a/lib/compiler/include/compiler/graph_optimize_result.struct.toml b/lib/compiler/include/compiler/graph_optimize_result.struct.toml
deleted file mode 100644
index 22f29cbd59..0000000000
--- a/lib/compiler/include/compiler/graph_optimize_result.struct.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-namespace = "FlexFlow"
-name = "GraphOptimizeResult"
-features = [ ]
-
-includes = [ 
-  "compiler/machine_mapping/machine_mapping.dtg.h",
-  "pcg/parallel_computation_graph/parallel_computation_graph.h"
-]
-
-[[fields]]
-name = "pcg"
-type = "::FlexFlow::ParallelComputationGraph"
-
-[[fields]]
-name = "machine_mapping"
-type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 06cbbf942d..8f9fe23c1c 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 
 namespace FlexFlow {
 
@@ -10,6 +12,9 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &,
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2);
 
+MachineMapping get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &, MachineMappingResult const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml
new file mode 100644
index 0000000000..252cd88276
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml
@@ -0,0 +1,21 @@
+namespace = "FlexFlow"
+name = "GetMachineMappingProblemTreeResult"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h",
+  "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h",
+  "utils/bidict/bidict.h"
+]
+
+[[fields]]
+type = "::FlexFlow::MachineMappingProblemTree"
+name = "mm_problem_tree"
+
+[[fields]]
+type = "::FlexFlow::bidict<::FlexFlow::UnmappedOpCostEstimateKey, ::FlexFlow::parallel_layer_guid_t>"
+name = "mapping"
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
new file mode 100644
index 0000000000..3776ec5568
--- /dev/null
+++ b/lib/compiler/include/compiler/search_result.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SearchResult"
+features = [
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_computation_graph.h",
+  "machine_mapping/machine_mapping.h",
+]
+
+[[fields]]
+name = "pcg"
+type = "::FlexFlow::ParallelComputationGraph"
+
+[[fields]]
+name = "machine_mapping"
+type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
index d43edaa79d..bb7459c767 100644
--- a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
+++ b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h
@@ -1,6 +1,8 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H
 
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
+
 namespace FlexFlow {
 
 std::optional<PCGBinarySPDecomposition>
diff --git a/lib/compiler/include/compiler/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm.h
deleted file mode 100644
index 232f2b9563..0000000000
--- a/lib/compiler/include/compiler/unity_algorithm.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
-#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
-
-#include "compiler/cost_estimator/cost_estimator.h"
-#include "compiler/graph_optimize_result.dtg.h"
-#include "optimizer_config.dtg.h"
-#include "pcg/computation_graph.h"
-#include "pcg/machine_specification.dtg.h"
-#include "substitutions/sub_parallel_computation_graph.h"
-
-namespace FlexFlow {
-
-GraphOptimizeResult graph_optimize(
-    ParallelComputationGraph &pcg,
-    CostEstimator const &cost_estimator,
-    MachineSpecification const &resources,
-    std::function<std::unordered_set<MachineView>(
-        ParallelLayerAttrs const &, MachineSpecification const &)> const
-        &allowed_machine_views,
-    OptimizerConfig const &opt_config);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/compiler/include/compiler/allowed_machine_views.h b/lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h
similarity index 100%
rename from lib/compiler/include/compiler/allowed_machine_views.h
rename to lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h
diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
similarity index 67%
rename from lib/compiler/include/compiler/graph_optimize_state.h
rename to lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
index 2de2321ba6..3a2823c46d 100644
--- a/lib/compiler/include/compiler/graph_optimize_state.h
+++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h
@@ -1,16 +1,16 @@
 #ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H
 #define _FLEXFLOW_COMPILER_MCMC_STATE_H
 
-#include "compiler/graph_optimize_result.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 
 namespace FlexFlow {
 
 struct GraphOptimizeState {
-  GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result,
-                     float runtime);
+  GraphOptimizeState(ParallelComputationGraph const &pcg,
+                     float runtime_with_optimal_mm);
 
-  GraphOptimizeResult graph_optimize_result;
-  float runtime;
+  ParallelComputationGraph pcg;
+  float runtime_with_optimal_mm;
 
   bool operator==(GraphOptimizeState const &other) const;
   bool operator!=(GraphOptimizeState const &other) const;
diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
new file mode 100644
index 0000000000..4396bef734
--- /dev/null
+++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
+#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/search_result.dtg.h"
+#include "compiler/unity_algorithm/unity_search_config.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "substitutions/substitution.h"
+
+namespace FlexFlow {
+
+SearchResult graph_optimize(ParallelComputationGraph &pcg,
+                            CostEstimator const &cost_estimator,
+                            MachineSpecification const &resources,
+                            std::vector<Substitution> const &substitutions,
+                            UnitySearchConfig const &search_config,
+                            DeviceType device_type);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/optimizer_config.struct.toml b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
similarity index 90%
rename from lib/compiler/include/compiler/optimizer_config.struct.toml
rename to lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
index b7f4f71e9c..9ec22cf916 100644
--- a/lib/compiler/include/compiler/optimizer_config.struct.toml
+++ b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "OptimizerConfig"
+name = "UnitySearchConfig"
 features = [
   "eq",
   "hash",
diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc
new file mode 100644
index 0000000000..a428c51abc
--- /dev/null
+++ b/lib/compiler/src/compiler/compiler.cc
@@ -0,0 +1,32 @@
+#include "compiler/compiler.h"
+#include "compiler/unity_algorithm/unity_algorithm.h"
+
+namespace FlexFlow {
+
+SearchResult optimize(ComputationGraph const &computation_graph,
+                      MachineSpecification const &machine_specification,
+                      CostEstimator const &cost_estimator,
+                      SearchAlgorithm search_algorithm,
+                      UnitySearchConfig const &search_config,
+                      DeviceType device_type) {
+  switch (search_algorithm) {
+    case SearchAlgorithm::DATA_PARALLEL:
+      throw std::runtime_error(
+          "Data parallel search algorithm is not implemented yet");
+    case SearchAlgorithm::UNITY: {
+      ParallelComputationGraph pcg =
+          parallel_computation_graph_from_computation_graph(computation_graph);
+      std::vector<Substitution> substitutions; // TODO: Implement this
+      return graph_optimize(pcg,
+                            cost_estimator,
+                            machine_specification,
+                            substitutions,
+                            search_config,
+                            device_type);
+    }
+    default:
+      throw std::runtime_error("Unknown search algorithm");
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc
deleted file mode 100644
index 4b4f323ea4..0000000000
--- a/lib/compiler/src/compiler/graph_optimize_state.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-#include "compiler/graph_optimize_state.h"
-#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
-
-namespace FlexFlow {
-
-GraphOptimizeState::GraphOptimizeState(
-    GraphOptimizeResult const &graph_optimize_result, float runtime)
-    : graph_optimize_result(graph_optimize_result), runtime(runtime) {}
-
-bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const {
-  // Note(@wmdi): This is a hack to implement a partially correct homomorphism
-  // check. Switch to the homomorphism check used in substitutions right after
-  // https://github.com/flexflow/FlexFlow/pull/1471 is merged.
-  auto layers1 = topological_ordering(graph_optimize_result.pcg);
-  auto layers2 = topological_ordering(other.graph_optimize_result.pcg);
-  if (layers1.size() != layers2.size()) {
-    return false;
-  }
-  std::unordered_map<parallel_tensor_guid_t, parallel_tensor_guid_t> mapping;
-  for (size_t i = 0; i < layers1.size(); ++i) {
-    if (get_parallel_layer_attrs(graph_optimize_result.pcg, layers1[i]) !=
-        get_parallel_layer_attrs(other.graph_optimize_result.pcg, layers2[i])) {
-      return false;
-    }
-    auto inputs1 = get_incoming_tensors(graph_optimize_result.pcg, layers1[i]);
-    auto inputs2 =
-        get_incoming_tensors(other.graph_optimize_result.pcg, layers2[i]);
-    if (inputs1.size() != inputs2.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < inputs1.size(); ++j) {
-      if (inputs1[j] != mapping.at(inputs2[j])) {
-        return false;
-      }
-    }
-    auto outputs1 = get_layer_outputs(graph_optimize_result.pcg, layers1[i]);
-    auto outputs2 =
-        get_layer_outputs(other.graph_optimize_result.pcg, layers2[i]);
-    if (outputs1.size() != outputs2.size()) {
-      return false;
-    }
-    for (size_t j = 0; j < outputs1.size(); ++j) {
-      mapping.emplace(outputs2[j], outputs1[j]);
-    }
-  }
-  return true;
-}
-
-bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const {
-  return !(*this == other);
-}
-
-bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const {
-  return runtime < other.runtime;
-}
-
-} // namespace FlexFlow
-
-namespace std {
-
-size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
-    ::FlexFlow::GraphOptimizeState const &state) const {
-  // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
-  // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
-  size_t seed = 0;
-  auto layers = topological_ordering(state.graph_optimize_result.pcg);
-  ::FlexFlow::hash_combine(seed, layers.size());
-  for (auto layer : layers) {
-    ::FlexFlow::hash_combine(
-        seed, get_parallel_layer_attrs(state.graph_optimize_result.pcg, layer));
-    auto inputs = get_incoming_tensors(state.graph_optimize_result.pcg, layer);
-    ::FlexFlow::hash_combine(seed, inputs.size());
-    for (auto input : inputs) {
-      for (size_t i = 0; i < layers.size(); ++i) {
-        if (get_source_layer(input) == layers[i]) {
-          ::FlexFlow::hash_combine(seed, i);
-          break;
-        }
-      }
-    }
-  }
-  return seed;
-}
-
-} // namespace std
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 57e82684e9..39222b91ac 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,7 +1,9 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
 #include "utils/containers/merge_maps.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
 
 namespace FlexFlow {
 
@@ -14,4 +16,41 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
   return are_disjoint(keys(m1.machine_views), keys(m2.machine_views));
 }
 
+MachineMapping get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &sp_decomposition,
+    MachineMappingResult const &mm_result) {
+
+  BinarySPDecompositionTree sp_tree =
+      binary_sp_tree_from_pcg_sp_tree(sp_decomposition);
+
+  auto get_layer_from_path =
+      [&](BinaryTreePath const &path) -> parallel_layer_guid_t {
+    std::optional<BinarySPDecompositionTree> subtree_optional =
+        binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path);
+    if (!subtree_optional.has_value()) {
+      throw std::runtime_error("Invalid tree path");
+    }
+    BinarySPDecompositionTree subtree = subtree_optional.value();
+    if (!subtree.is_node()) {
+      throw std::runtime_error("Invalid tree path to a leaf");
+    }
+    return parallel_layer_guid_t{
+        subtree.get<Node>(),
+    };
+  };
+
+  std::unordered_map<parallel_layer_guid_t, MachineView> mm;
+
+  if (mm_result.raw_result) {
+    FeasibleMachineMappingResult const &feasible_mm_result =
+        mm_result.raw_result.value();
+    for (auto const &[path, mv] :
+         feasible_mm_result.machine_mapping.raw_mapping) {
+      mm.insert({get_layer_from_path(path), mv});
+    }
+  }
+
+  return MachineMapping{mm};
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc
similarity index 98%
rename from lib/compiler/src/compiler/allowed_machine_views.cc
rename to lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc
index 1c226f79b0..d6fca79403 100644
--- a/lib/compiler/src/compiler/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/allowed_machine_views.h"
+#include "compiler/unity_algorithm/allowed_machine_views.h"
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/multi_dimensional_stride.dtg.h"
diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
new file mode 100644
index 0000000000..bf8f089cc0
--- /dev/null
+++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -0,0 +1,49 @@
+#include "compiler/unity_algorithm/graph_optimize_state.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h"
+
+namespace FlexFlow {
+
+GraphOptimizeState::GraphOptimizeState(ParallelComputationGraph const &pcg,
+                                       float runtime_with_optimal_mm)
+    : pcg(pcg), runtime_with_optimal_mm(runtime_with_optimal_mm) {}
+
+bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const {
+  return pcgs_are_isomorphic(pcg, other.pcg);
+}
+
+bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const {
+  return !(*this == other);
+}
+
+bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const {
+  return runtime_with_optimal_mm < other.runtime_with_optimal_mm;
+}
+
+} // namespace FlexFlow
+
+namespace std {
+
+size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
+    ::FlexFlow::GraphOptimizeState const &state) const {
+  // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
+  // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
+  size_t seed = 0;
+  auto layers = topological_ordering(state.pcg);
+  ::FlexFlow::hash_combine(seed, layers.size());
+  for (auto layer : layers) {
+    ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer));
+    auto inputs = get_incoming_tensors(state.pcg, layer);
+    ::FlexFlow::hash_combine(seed, inputs.size());
+    for (auto input : inputs) {
+      for (size_t i = 0; i < layers.size(); ++i) {
+        if (get_source_layer(input) == layers[i]) {
+          ::FlexFlow::hash_combine(seed, i);
+          break;
+        }
+      }
+    }
+  }
+  return seed;
+}
+
+} // namespace std
diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
new file mode 100644
index 0000000000..2f2caf11ec
--- /dev/null
+++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -0,0 +1,157 @@
+#include "compiler/unity_algorithm/unity_algorithm.h"
+#include "compiler/machine_mapping/get_optimal_machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h"
+#include "compiler/unity_algorithm/allowed_machine_views.h"
+#include "compiler/unity_algorithm/graph_optimize_state.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "substitutions/pcg_pattern.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/substitution.h"
+#include "utils/deduplicated_priority_queue.h"
+#include "utils/graph/node/algorithms.h"
+
+namespace FlexFlow {
+
+/*
+ * Applies a substitution to all possible positions in PCG
+ */
+std::vector<ParallelComputationGraph>
+    all_pcgs_obtained_by_applying_a_substitution(
+        ParallelComputationGraph const &pcg,
+        std::vector<Substitution> const &substitutions) {
+  std::vector<ParallelComputationGraph> results;
+  SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(pcg);
+  for (Substitution const &substitution : substitutions) {
+    for (PCGPatternMatch const &pattern_match :
+         find_pattern_matches(substitution.pcg_pattern, subpcg)) {
+      SubParallelComputationGraph subpcg_from_substitution =
+          apply_substitution(subpcg, substitution, pattern_match);
+      results.push_back(
+          pcg_from_sub_pcg_by_dropping_inputs(subpcg_from_substitution));
+    }
+  }
+  return results;
+}
+
+SearchResult graph_optimize(ParallelComputationGraph &pcg,
+                            CostEstimator const &cost_estimator,
+                            MachineSpecification const &resources,
+                            std::vector<Substitution> const &substitutions,
+                            UnitySearchConfig const &search_config,
+                            DeviceType device_type) {
+
+  // NOTE(@wmdi): This mapping is only used for allowed_machine_views
+  std::unordered_map<UnmappedOpCostEstimateKey, parallel_layer_guid_t>
+      mapping_from_unmapped_op_cost_estimate_key_parallel_layer = [&] {
+        std::unordered_map<UnmappedOpCostEstimateKey, parallel_layer_guid_t>
+            mapping;
+        for (parallel_layer_guid_t layer : get_parallel_layers(pcg)) {
+          // NOTE(@wmdi): Assume layers with the same key have the same allowed
+          // machine views
+          mapping.insert(
+              {get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), layer});
+        }
+        return mapping;
+      }();
+
+  MachineMappingCache cached_subgraph_costs = MachineMappingCache{
+      {},
+  };
+  DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
+
+  MachineMappingContext context = MachineMappingContext{
+      /*cost_estimator=*/cost_estimator,
+      /*allowed_machine_views=*/
+      [&](UnmappedOpCostEstimateKey const &key,
+          MachineSpecification const &resources)
+          -> std::unordered_set<MachineView> {
+        return get_allowed_machine_views(
+            resources,
+            get_operator_task_space(
+                pcg,
+                mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at(
+                    key)),
+            device_type);
+      },
+  };
+
+  auto optimize_pcg = [&](ParallelComputationGraph const &pcg)
+      -> std::pair<GraphOptimizeState, MachineMapping> {
+    std::optional<PCGBinarySPDecomposition> maybe_sp_decomp =
+        get_pcg_balanced_binary_sp_decomposition(pcg);
+
+    if (!maybe_sp_decomp.has_value()) {
+      throw std::runtime_error("Fail to SP-ize PCG");
+    }
+
+    PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value();
+
+    MachineMappingConstraints constraints = MachineMappingConstraints{
+        /*machine_views=*/{},
+    };
+
+    MachineMappingResult mm_result = get_optimal_machine_mapping(
+        cached_subgraph_costs,
+        context,
+        get_machine_mapping_problem_tree(pcg, sp_decomp),
+        resources,
+        constraints);
+
+    float runtime_with_optimal_mm;
+    if (mm_result.raw_result == std::nullopt) {
+      runtime_with_optimal_mm = std::numeric_limits<float>::infinity();
+    } else {
+      runtime_with_optimal_mm = mm_result.raw_result.value().runtime;
+    }
+    return {
+        GraphOptimizeState{
+            /*pcg=*/pcg,
+            /*runtime_with_optimal_mm=*/runtime_with_optimal_mm,
+        },
+        get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result),
+    };
+  };
+
+  GraphOptimizeState best_state = optimize_pcg(pcg).first;
+  candidates.push(best_state);
+
+  for (int iteration = 0;
+       !candidates.empty() && iteration < search_config.budget;
+       ++iteration) {
+    GraphOptimizeState current_state = candidates.top();
+    candidates.pop();
+
+    if (current_state < best_state) {
+      best_state = current_state;
+    } else if (current_state.runtime_with_optimal_mm >
+               best_state.runtime_with_optimal_mm * search_config.alpha) {
+      continue;
+    }
+
+    for (ParallelComputationGraph const &new_pcg :
+         all_pcgs_obtained_by_applying_a_substitution(current_state.pcg,
+                                                      substitutions)) {
+      std::optional<GraphOptimizeState> new_pcg_optimize_result =
+          optimize_pcg(new_pcg).first;
+      if (new_pcg_optimize_result == std::nullopt) {
+        continue;
+      }
+      GraphOptimizeState new_state = new_pcg_optimize_result.value();
+      if (new_state.runtime_with_optimal_mm <= search_config.threshold &&
+          get_nodes(new_pcg.raw_graph).size() <= search_config.max_num_ops) {
+        candidates.push(new_state);
+      }
+    }
+  }
+
+  return SearchResult{
+      /*pcg=*/best_state.pcg,
+      /*machine_mapping=*/optimize_pcg(best_state.pcg).second,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/unity_algorithm.cc b/lib/compiler/src/unity_algorithm.cc
deleted file mode 100644
index 86a211c535..0000000000
--- a/lib/compiler/src/unity_algorithm.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "compiler/unity_algorithm.h"
-#include "compiler/graph_optimize_state.h"
-#include "compiler/machine_mapping/get_optimal_machine_mapping.h"
-#include "pcg/machine_specification.dtg.h"
-#include "substitutions/substitution.h"
-#include "utils/deduplicated_priority_queue.h"
-#include "utils/graph/node/algorithms.h"
-namespace FlexFlow {
-
-/*
- * Gets all substitutions applicable to a PCG
- */
-std::vector<Substitution>
-    get_all_applicable_substitutions(ParallelComputationGraph const &pcg) {
-  NOT_IMPLEMENTED();
-}
-
-/*
- * Applies a substitution to all possible positions in PCG
- */
-std::vector<ParallelComputationGraph>
-    apply_substitution(ParallelComputationGraph const &pcg,
-                       Substitution const &) {
-  NOT_IMPLEMENTED();
-}
-
-GraphOptimizeResult graph_optimize(
-    ParallelComputationGraph &pcg,
-    CostEstimator const &cost_estimator,
-    MachineSpecification const &resources,
-    std::function<std::unordered_set<MachineView>(
-        ParallelLayerAttrs const &, MachineSpecification const &)> const
-        &allowed_machine_views,
-    OptimizerConfig const &opt_config) {
-  NOT_IMPLEMENTED();
-
-  // std::vector<Substitution> substitutions =
-  //     get_all_applicable_substitutions(pcg);
-  //
-  // MachineMappingCache cached_subgraph_costs;
-  // DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
-  //
-  // MachineMappingResult original_pcg_cost =
-  //     get_optimal_machine_mapping(pcg,
-  //                                 allowed_machine_views,
-  //                                 cost_estimator,
-  //                                 resources,
-  //                                 cached_subgraph_costs);
-  //
-  // GraphOptimizeState initial_state = {
-  //     GraphOptimizeResult(pcg, original_pcg_cost.machine_mapping),
-  //     original_pcg_cost.runtime};
-  //
-  // GraphOptimizeState best_state = initial_state;
-  // candidates.push(initial_state);
-  //
-  // for (int iteration = 0; !candidates.empty() && iteration <
-  // opt_config.budget;
-  //      ++iteration) {
-  //   GraphOptimizeState current_state = candidates.top();
-  //   candidates.pop();
-  //
-  //   if (current_state.runtime < best_state.runtime) {
-  //     best_state = current_state;
-  //   } else if (current_state.runtime > best_state.runtime * opt_config.alpha)
-  //   {
-  //     continue;
-  //   }
-  //
-  //   for (Substitution const &substitution : substitutions) {
-  //     for (ParallelComputationGraph const &new_pcg : apply_substitution(
-  //              current_state.graph_optimize_result.pcg, substitution)) {
-  //       MachineMappingResult new_pcg_cost =
-  //           get_optimal_machine_mapping(new_pcg,
-  //                                       allowed_machine_views,
-  //                                       cost_estimator,
-  //                                       resources,
-  //                                       cached_subgraph_costs);
-  //       GraphOptimizeState new_state{
-  //           GraphOptimizeResult(new_pcg, new_pcg_cost.machine_mapping),
-  //           new_pcg_cost.runtime};
-  //       if (new_pcg_cost.runtime <= opt_config.threshold &&
-  //           get_nodes(new_pcg.raw_graph).size() <= opt_config.max_num_ops) {
-  //         candidates.push(new_state);
-  //       }
-  //     }
-  //   }
-  // }
-
-  // return best_state.graph_optimize_result;
-}
-
-} // namespace FlexFlow
diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc
index 936894ad2d..b885f4f8ea 100644
--- a/lib/compiler/test/src/allowed_machine_views.cc
+++ b/lib/compiler/test/src/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/allowed_machine_views.h"
+#include "compiler/unity_algorithm/allowed_machine_views.h"
 #include "doctest/doctest.h"
 #include "utils/containers/extend.h"
 #include "utils/containers/range.h"
diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc
index 46177ad420..3bc9893f18 100644
--- a/lib/compiler/test/src/graph_optimize_state.cc
+++ b/lib/compiler/test/src/graph_optimize_state.cc
@@ -1,80 +1,81 @@
-#include "compiler/graph_optimize_state.h"
+#include "compiler/unity_algorithm/graph_optimize_state.h"
 #include "doctest/doctest.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 
 using namespace FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("GraphOptimizeState::operator==") {
-    ParallelComputationGraphBuilder builder;
+    // TODO(@wmdi): to be udpated
+//   TEST_CASE("GraphOptimizeState::operator==") {
+//     ParallelComputationGraphBuilder builder;
 
-    ParallelTensorShape input_shape =
-        ParallelTensorShape{ParallelTensorDims{
-                                FFOrdered<ShardParallelDim>{
-                                    ShardParallelDim{32, 2},
-                                    ShardParallelDim{16, 1},
-                                },
-                                ReplicaParallelDimSet{
-                                    SumDegree{1},
-                                    DiscardCopyDegree{1},
-                                },
-                            },
-                            DataType::FLOAT};
+//     ParallelTensorShape input_shape =
+//         ParallelTensorShape{ParallelTensorDims{
+//                                 FFOrdered<ShardParallelDim>{
+//                                     ShardParallelDim{32, 2},
+//                                     ShardParallelDim{16, 1},
+//                                 },
+//                                 ReplicaParallelDimSet{
+//                                     SumDegree{1},
+//                                     DiscardCopyDegree{1},
+//                                 },
+//                             },
+//                             DataType::FLOAT};
 
-    parallel_tensor_guid_t input0 =
-        builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
-    parallel_tensor_guid_t dense0 = builder.dense(input0,
-                                                  8,
-                                                  Activation::RELU,
-                                                  true,
-                                                  DataType::FLOAT,
-                                                  std::nullopt,
-                                                  std::nullopt,
-                                                  "dense0");
+//     parallel_tensor_guid_t input0 =
+//         builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
+//     parallel_tensor_guid_t dense0 = builder.dense(input0,
+//                                                   8,
+//                                                   Activation::RELU,
+//                                                   true,
+//                                                   DataType::FLOAT,
+//                                                   std::nullopt,
+//                                                   std::nullopt,
+//                                                   "dense0");
 
-    parallel_tensor_guid_t dense1 = builder.dense(dense0,
-                                                  4,
-                                                  Activation::RELU,
-                                                  true,
-                                                  DataType::FLOAT,
-                                                  std::nullopt,
-                                                  std::nullopt,
-                                                  "dense1");
+//     parallel_tensor_guid_t dense1 = builder.dense(dense0,
+//                                                   4,
+//                                                   Activation::RELU,
+//                                                   true,
+//                                                   DataType::FLOAT,
+//                                                   std::nullopt,
+//                                                   std::nullopt,
+//                                                   "dense1");
 
-    ParallelComputationGraph pcg = builder.pcg;
+//     ParallelComputationGraph pcg = builder.pcg;
 
-    // `machine_mapping` is determined by the PCG and the device mapping
-    // algorithm, and `runtime` is determined by the PCG and the device mapping,
-    // so their values here do not matter.
-    std::unordered_map<parallel_layer_guid_t, MachineView> empty_machine_views;
-    MachineMapping empty_machine_mapping(empty_machine_views);
-    bool result1 =
-        GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-                           0) ==
-        GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0);
-    bool correct1 = true;
-    CHECK(result1 == correct1);
+//     // `machine_mapping` is determined by the PCG and the device mapping
+//     // algorithm, and `runtime` is determined by the PCG and the device mapping,
+//     // so their values here do not matter.
+//     std::unordered_map<parallel_layer_guid_t, MachineView> empty_machine_views;
+//     MachineMapping empty_machine_mapping(empty_machine_views);
+//     bool result1 =
+//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
+//                            0) ==
+//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0);
+//     bool correct1 = true;
+//     CHECK(result1 == correct1);
 
-    ParallelComputationGraphBuilder builder_;
+//     ParallelComputationGraphBuilder builder_;
 
-    parallel_tensor_guid_t input0_ =
-        builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
-    parallel_tensor_guid_t dense0_ = builder.dense(input0,
-                                                   8,
-                                                   Activation::RELU,
-                                                   true,
-                                                   DataType::FLOAT,
-                                                   std::nullopt,
-                                                   std::nullopt,
-                                                   "dense0");
+//     parallel_tensor_guid_t input0_ =
+//         builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
+//     parallel_tensor_guid_t dense0_ = builder.dense(input0,
+//                                                    8,
+//                                                    Activation::RELU,
+//                                                    true,
+//                                                    DataType::FLOAT,
+//                                                    std::nullopt,
+//                                                    std::nullopt,
+//                                                    "dense0");
 
-    ParallelComputationGraph pcg_ = builder.pcg;
+//     ParallelComputationGraph pcg_ = builder.pcg;
 
-    bool result2 =
-        GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-                           0) ==
-        GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0);
-    bool correct2 = false;
-    CHECK(result2 == correct2);
-  }
+//     bool result2 =
+//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
+//                            0) ==
+//         GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0);
+//     bool correct2 = false;
+//     CHECK(result2 == correct2);
+//   }
 }
diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc
index 8ff0978ea5..d8523f6659 100644
--- a/lib/compiler/test/src/unity_algorithm.cc
+++ b/lib/compiler/test/src/unity_algorithm.cc
@@ -1,7 +1,8 @@
-#include "compiler/unity_algorithm.h"
+#include "compiler/unity_algorithm/unity_algorithm.h"
 #include "doctest/doctest.h"
 
 TEST_SUITE(FF_TEST_SUITE) {
+  // TODO: to be udpated
   // Rapidcheck does not work for now
   // TEST_CASE("graph_optimize") {
   //   RC_SUBCASE([](ComputationGraph const &g,
diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h
index 61cab4eff1..1a19397c72 100644
--- a/lib/pcg/include/pcg/operator_task_space.h
+++ b/lib/pcg/include/pcg/operator_task_space.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_PCG_INCLUDE_OPERATOR_TASK_SPACE_H
 
 #include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/task_space_coordinate.dtg.h"
 #include <cstddef>
 #include <unordered_set>
@@ -17,6 +19,9 @@ TaskSpaceCoordinate
 size_t num_dims(OperatorTaskSpace const &task);
 size_t num_tasks(OperatorTaskSpace const &task);
 
+OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
+                                          parallel_layer_guid_t const &layer);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index c740e1ffd2..3cbd1f1977 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 #define _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
@@ -66,6 +67,9 @@ ParallelComputationGraph without_layer_names(ParallelComputationGraph const &);
 bool pcgs_are_isomorphic(ParallelComputationGraph const &,
                          ParallelComputationGraph const &);
 
+ParallelComputationGraph
+    parallel_computation_graph_from_computation_graph(ComputationGraph const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc
index 2538cb4ea0..d50cce2af3 100644
--- a/lib/pcg/src/pcg/operator_task_space.cc
+++ b/lib/pcg/src/pcg/operator_task_space.cc
@@ -36,4 +36,9 @@ size_t num_tasks(OperatorTaskSpace const &task) {
   return product(task.degrees);
 }
 
+OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
+                                          parallel_layer_guid_t const &layer) {
+  NOT_IMPLEMENTED();
+}
+
 } // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index 781c44640c..704f1fa48b 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -203,4 +203,9 @@ bool pcgs_are_isomorphic(ParallelComputationGraph const &lhs,
       .has_value();
 }
 
+ParallelComputationGraph parallel_computation_graph_from_computation_graph(
+    ComputationGraph const &) {
+  NOT_IMPLEMENTED();
+}
+
 } // namespace FlexFlow
diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
index de48cd17e9..9b4ea6cd20 100644
--- a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
+++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h
@@ -1,11 +1,13 @@
 #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H
 
+#include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_parallel_split.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_series_split.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
+#include <optional>
 #include <unordered_set>
 
 namespace FlexFlow {
@@ -23,6 +25,10 @@ std::unordered_multiset<Node> get_leaves(BinarySPDecompositionTree const &);
 
 SPDecompositionTreeNodeType get_node_type(BinarySPDecompositionTree const &);
 
+std::optional<BinarySPDecompositionTree>
+    binary_sp_decomposition_tree_get_subtree_at_path(
+        BinarySPDecompositionTree const &, BinaryTreePath const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
index 62489ff75f..3e4bc13289 100644
--- a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
+++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc
@@ -1,5 +1,6 @@
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_left_associative.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_right_associative.h"
 
@@ -82,4 +83,10 @@ SPDecompositionTreeNodeType
   });
 }
 
+std::optional<BinarySPDecompositionTree>
+    binary_sp_decomposition_tree_get_subtree_at_path(
+        BinarySPDecompositionTree const &tree, BinaryTreePath const &path) {
+  return get_subtree_at_path(tree, generic_impl_for_binary_sp_tree(), path);
+}
+
 } // namespace FlexFlow

From c16bcf605f824b6292e89247ac40d6ed1acb0d13 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Tue, 21 Jan 2025 17:40:59 -0500
Subject: [PATCH 13/16] fixes

---
 lib/compiler/include/compiler/compiler.h      |  4 +--
 ...ne_mapping_problem_tree_result.struct.toml | 21 --------------
 .../compiler/search_result.struct.toml        |  2 +-
 lib/compiler/src/compiler/compiler.cc         | 23 +++++++--------
 .../machine_mapping/machine_mapping.cc        | 12 ++++----
 .../unity_algorithm/graph_optimize_state.cc   | 10 +++----
 .../unity_algorithm/unity_algorithm.cc        | 29 ++++++++++---------
 lib/pcg/src/pcg/operator_task_space.cc        | 17 ++++++++++-
 8 files changed, 55 insertions(+), 63 deletions(-)
 delete mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml

diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h
index 3faacd8f16..8697c06beb 100644
--- a/lib/compiler/include/compiler/compiler.h
+++ b/lib/compiler/include/compiler/compiler.h
@@ -16,9 +16,7 @@ enum class SearchAlgorithm {
 SearchResult optimize(ComputationGraph const &,
                       MachineSpecification const &,
                       CostEstimator const &,
-                      SearchAlgorithm,
-                      AlgorithmConfig const &,
-                      DeviceType);
+                      AlgorithmConfig const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml
deleted file mode 100644
index 252cd88276..0000000000
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml
+++ /dev/null
@@ -1,21 +0,0 @@
-namespace = "FlexFlow"
-name = "GetMachineMappingProblemTreeResult"
-features = [
-  "eq",
-  "hash",
-  "fmt",
-]
-
-includes = [
-  "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h",
-  "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h",
-  "utils/bidict/bidict.h"
-]
-
-[[fields]]
-type = "::FlexFlow::MachineMappingProblemTree"
-name = "mm_problem_tree"
-
-[[fields]]
-type = "::FlexFlow::bidict<::FlexFlow::UnmappedOpCostEstimateKey, ::FlexFlow::parallel_layer_guid_t>"
-name = "mapping"
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
index 3776ec5568..120d182c75 100644
--- a/lib/compiler/include/compiler/search_result.struct.toml
+++ b/lib/compiler/include/compiler/search_result.struct.toml
@@ -5,7 +5,7 @@ features = [
 
 includes = [
   "pcg/parallel_computation_graph/parallel_computation_graph.h",
-  "machine_mapping/machine_mapping.h",
+  "compiler/machine_mapping/machine_mapping.h",
 ]
 
 [[fields]]
diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc
index a428c51abc..f2ff32b944 100644
--- a/lib/compiler/src/compiler/compiler.cc
+++ b/lib/compiler/src/compiler/compiler.cc
@@ -1,19 +1,19 @@
 #include "compiler/compiler.h"
 #include "compiler/unity_algorithm/unity_algorithm.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
 SearchResult optimize(ComputationGraph const &computation_graph,
                       MachineSpecification const &machine_specification,
                       CostEstimator const &cost_estimator,
-                      SearchAlgorithm search_algorithm,
-                      UnitySearchConfig const &search_config,
-                      DeviceType device_type) {
-  switch (search_algorithm) {
-    case SearchAlgorithm::DATA_PARALLEL:
+                      AlgorithmConfig const &search_config) {
+  return search_config.visit<SearchResult>(overload{
+    [&](DataParallelismConfig const &config) -> SearchResult {
       throw std::runtime_error(
           "Data parallel search algorithm is not implemented yet");
-    case SearchAlgorithm::UNITY: {
+    },
+    [&](UnitySearchConfig const &config) {
       ParallelComputationGraph pcg =
           parallel_computation_graph_from_computation_graph(computation_graph);
       std::vector<Substitution> substitutions; // TODO: Implement this
@@ -21,12 +21,11 @@ SearchResult optimize(ComputationGraph const &computation_graph,
                             cost_estimator,
                             machine_specification,
                             substitutions,
-                            search_config,
-                            device_type);
-    }
-    default:
-      throw std::runtime_error("Unknown search algorithm");
-  }
+                            config,
+                            DeviceType::GPU);
+
+    },
+  });
 }
 
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 39222b91ac..33a8f686f5 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -3,6 +3,7 @@
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
 #include "utils/containers/merge_maps.h"
+#include "utils/containers/map_keys.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
 
 namespace FlexFlow {
@@ -28,14 +29,14 @@ MachineMapping get_machine_mapping_from_machine_mapping_result(
     std::optional<BinarySPDecompositionTree> subtree_optional =
         binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path);
     if (!subtree_optional.has_value()) {
-      throw std::runtime_error("Invalid tree path");
+      throw std::runtime_error(fmt::format("Invalid tree path {}", path));
     }
     BinarySPDecompositionTree subtree = subtree_optional.value();
     if (!subtree.is_node()) {
-      throw std::runtime_error("Invalid tree path to a leaf");
+      throw std::runtime_error(fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
     }
     return parallel_layer_guid_t{
-        subtree.get<Node>(),
+        subtree.require_node(),
     };
   };
 
@@ -44,10 +45,7 @@ MachineMapping get_machine_mapping_from_machine_mapping_result(
   if (mm_result.raw_result) {
     FeasibleMachineMappingResult const &feasible_mm_result =
         mm_result.raw_result.value();
-    for (auto const &[path, mv] :
-         feasible_mm_result.machine_mapping.raw_mapping) {
-      mm.insert({get_layer_from_path(path), mv});
-    }
+    mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, get_layer_from_path);
   }
 
   return MachineMapping{mm};
diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
index bf8f089cc0..a8fa303ff6 100644
--- a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -28,15 +28,15 @@ size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
   // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
   // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
   size_t seed = 0;
-  auto layers = topological_ordering(state.pcg);
+  std::vector<::FlexFlow::parallel_layer_guid_t> layers = topological_ordering(state.pcg);
   ::FlexFlow::hash_combine(seed, layers.size());
-  for (auto layer : layers) {
+  for (::FlexFlow::parallel_layer_guid_t const & layer : layers) {
     ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer));
-    auto inputs = get_incoming_tensors(state.pcg, layer);
+    std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = get_incoming_tensors(state.pcg, layer);
     ::FlexFlow::hash_combine(seed, inputs.size());
-    for (auto input : inputs) {
+    for (::FlexFlow::parallel_tensor_guid_t input : inputs) {
       for (size_t i = 0; i < layers.size(); ++i) {
-        if (get_source_layer(input) == layers[i]) {
+        if (get_source_layer(input) == layers.at(i)) {
           ::FlexFlow::hash_combine(seed, i);
           break;
         }
diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
index 2f2caf11ec..e7df440f7b 100644
--- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -13,6 +13,9 @@
 #include "substitutions/substitution.h"
 #include "utils/deduplicated_priority_queue.h"
 #include "utils/graph/node/algorithms.h"
+#include "compiler/machine_mapping/machine_mapping_cache.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 
 namespace FlexFlow {
 
@@ -58,9 +61,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
         return mapping;
       }();
 
-  MachineMappingCache cached_subgraph_costs = MachineMappingCache{
-      {},
-  };
+  MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache();
   DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
 
   MachineMappingContext context = MachineMappingContext{
@@ -79,6 +80,14 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
       },
   };
 
+  auto get_runtime_cost = [](MachineMappingResult const &mm_result) {
+    if (mm_result.raw_result == std::nullopt) {
+      return std::numeric_limits<float>::infinity();
+    } else {
+      return mm_result.raw_result.value().runtime;
+    }
+  };
+
   auto optimize_pcg = [&](ParallelComputationGraph const &pcg)
       -> std::pair<GraphOptimizeState, MachineMapping> {
     std::optional<PCGBinarySPDecomposition> maybe_sp_decomp =
@@ -90,9 +99,9 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
 
     PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value();
 
-    MachineMappingConstraints constraints = MachineMappingConstraints{
-        /*machine_views=*/{},
-    };
+    MachineMappingProblemTree problem_tree = get_machine_mapping_problem_tree(pcg, sp_decomp);
+    MachineMappingConstraints constraints = 
+      get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree));
 
     MachineMappingResult mm_result = get_optimal_machine_mapping(
         cached_subgraph_costs,
@@ -101,16 +110,10 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
         resources,
         constraints);
 
-    float runtime_with_optimal_mm;
-    if (mm_result.raw_result == std::nullopt) {
-      runtime_with_optimal_mm = std::numeric_limits<float>::infinity();
-    } else {
-      runtime_with_optimal_mm = mm_result.raw_result.value().runtime;
-    }
     return {
         GraphOptimizeState{
             /*pcg=*/pcg,
-            /*runtime_with_optimal_mm=*/runtime_with_optimal_mm,
+            /*runtime_with_optimal_mm=*/get_runtime_cost(mm_result),
         },
         get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result),
     };
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc
index d50cce2af3..046571855e 100644
--- a/lib/pcg/src/pcg/operator_task_space.cc
+++ b/lib/pcg/src/pcg/operator_task_space.cc
@@ -1,10 +1,18 @@
 #include "pcg/operator_task_space.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 #include "utils/containers/cartesian_product.h"
+#include "utils/containers/extend.h"
 #include "utils/containers/maximum.h"
 #include "utils/containers/product.h"
 #include "utils/containers/range.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/unordered_set_of.h"
+#include "utils/containers/vector_of.h"
 #include "utils/fmt/unordered_set.h"
 
 namespace FlexFlow {
@@ -38,7 +46,14 @@ size_t num_tasks(OperatorTaskSpace const &task) {
 
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer) {
-  NOT_IMPLEMENTED();
+  parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0);
+  ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor);
+
+  std::vector<int> degrees;
+  extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
+  degrees.push_back(get_sum_degree(shape));
+  degrees.push_back(get_discard_copy_degree(shape));
+  return OperatorTaskSpace{degrees};
 }
 
 } // namespace FlexFlow

From 62389ad919b0ead5de247b527eaec2f077cd0dbc Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 22 Jan 2025 18:02:37 -0500
Subject: [PATCH 14/16] upd

---
 .../allowed_machine_views.h                   |   0
 .../machine_mapping_problem_tree.h            |   1 +
 .../unity_algorithm/unity_algorithm.h         |   3 +-
 lib/compiler/src/compiler/compiler.cc         |  28 ++--
 .../allowed_machine_views.cc                  |   2 +-
 .../machine_mapping/machine_mapping.cc        |   8 +-
 .../unity_algorithm/graph_optimize_state.cc   |   8 +-
 .../unity_algorithm/unity_algorithm.cc        |  35 +++--
 .../test/src/allowed_machine_views.cc         |   2 +-
 lib/compiler/test/src/graph_optimize_state.cc | 133 +++++++++---------
 lib/utils/include/utils/optional.h            |   5 +
 11 files changed, 116 insertions(+), 109 deletions(-)
 rename lib/compiler/include/compiler/{unity_algorithm => machine_mapping}/allowed_machine_views.h (100%)
 rename lib/compiler/src/compiler/{unity_algorithm => machine_mapping}/allowed_machine_views.cc (98%)

diff --git a/lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h b/lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h
similarity index 100%
rename from lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h
rename to lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
index 29e9e7c90b..2976a55bf1 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
@@ -4,6 +4,7 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
index 4396bef734..223c4961eb 100644
--- a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
+++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h
@@ -13,8 +13,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
                             CostEstimator const &cost_estimator,
                             MachineSpecification const &resources,
                             std::vector<Substitution> const &substitutions,
-                            UnitySearchConfig const &search_config,
-                            DeviceType device_type);
+                            UnitySearchConfig const &search_config);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc
index f2ff32b944..1c56e58796 100644
--- a/lib/compiler/src/compiler/compiler.cc
+++ b/lib/compiler/src/compiler/compiler.cc
@@ -9,22 +9,18 @@ SearchResult optimize(ComputationGraph const &computation_graph,
                       CostEstimator const &cost_estimator,
                       AlgorithmConfig const &search_config) {
   return search_config.visit<SearchResult>(overload{
-    [&](DataParallelismConfig const &config) -> SearchResult {
-      throw std::runtime_error(
-          "Data parallel search algorithm is not implemented yet");
-    },
-    [&](UnitySearchConfig const &config) {
-      ParallelComputationGraph pcg =
-          parallel_computation_graph_from_computation_graph(computation_graph);
-      std::vector<Substitution> substitutions; // TODO: Implement this
-      return graph_optimize(pcg,
-                            cost_estimator,
-                            machine_specification,
-                            substitutions,
-                            config,
-                            DeviceType::GPU);
-
-    },
+      [&](DataParallelismConfig const &config) -> SearchResult {
+        throw std::runtime_error(
+            "Data parallel search algorithm is not implemented yet");
+      },
+      [&](UnitySearchConfig const &config) {
+        ParallelComputationGraph pcg =
+            parallel_computation_graph_from_computation_graph(
+                computation_graph);
+        std::vector<Substitution> substitutions; // TODO: Implement this
+        return graph_optimize(
+            pcg, cost_estimator, machine_specification, substitutions, config);
+      },
   });
 }
 
diff --git a/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
similarity index 98%
rename from lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc
rename to lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
index d6fca79403..bcd8a63f84 100644
--- a/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/unity_algorithm/allowed_machine_views.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/multi_dimensional_stride.dtg.h"
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 33a8f686f5..e54ed925de 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -2,8 +2,8 @@
 #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
-#include "utils/containers/merge_maps.h"
 #include "utils/containers/map_keys.h"
+#include "utils/containers/merge_maps.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
 
 namespace FlexFlow {
@@ -33,7 +33,8 @@ MachineMapping get_machine_mapping_from_machine_mapping_result(
     }
     BinarySPDecompositionTree subtree = subtree_optional.value();
     if (!subtree.is_node()) {
-      throw std::runtime_error(fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
+      throw std::runtime_error(fmt::format(
+          "Invalid tree path to a leaf: found {} instead", subtree));
     }
     return parallel_layer_guid_t{
         subtree.require_node(),
@@ -45,7 +46,8 @@ MachineMapping get_machine_mapping_from_machine_mapping_result(
   if (mm_result.raw_result) {
     FeasibleMachineMappingResult const &feasible_mm_result =
         mm_result.raw_result.value();
-    mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, get_layer_from_path);
+    mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping,
+                  get_layer_from_path);
   }
 
   return MachineMapping{mm};
diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
index a8fa303ff6..1aa7f05655 100644
--- a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -28,11 +28,13 @@ size_t hash<::FlexFlow::GraphOptimizeState>::operator()(
   // TODO(@wmdi): Eventually it might be good to use a proper graph hash like
   // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash
   size_t seed = 0;
-  std::vector<::FlexFlow::parallel_layer_guid_t> layers = topological_ordering(state.pcg);
+  std::vector<::FlexFlow::parallel_layer_guid_t> layers =
+      topological_ordering(state.pcg);
   ::FlexFlow::hash_combine(seed, layers.size());
-  for (::FlexFlow::parallel_layer_guid_t const & layer : layers) {
+  for (::FlexFlow::parallel_layer_guid_t const &layer : layers) {
     ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer));
-    std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = get_incoming_tensors(state.pcg, layer);
+    std::vector<::FlexFlow::parallel_tensor_guid_t> inputs =
+        get_incoming_tensors(state.pcg, layer);
     ::FlexFlow::hash_combine(seed, inputs.size());
     for (::FlexFlow::parallel_tensor_guid_t input : inputs) {
       for (size_t i = 0; i < layers.size(); ++i) {
diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
index e7df440f7b..01c9c645a6 100644
--- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -1,21 +1,23 @@
 #include "compiler/unity_algorithm/unity_algorithm.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "compiler/machine_mapping/get_optimal_machine_mapping.h"
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/machine_mapping/machine_mapping_cache.h"
+#include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
 #include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h"
-#include "compiler/unity_algorithm/allowed_machine_views.h"
 #include "compiler/unity_algorithm/graph_optimize_state.h"
 #include "pcg/machine_specification.dtg.h"
 #include "pcg/operator_task_space.h"
 #include "substitutions/pcg_pattern.h"
 #include "substitutions/sub_parallel_computation_graph.h"
 #include "substitutions/substitution.h"
+#include "utils/containers/generate_map.h"
 #include "utils/deduplicated_priority_queue.h"
 #include "utils/graph/node/algorithms.h"
-#include "compiler/machine_mapping/machine_mapping_cache.h"
-#include "compiler/machine_mapping/machine_mapping_constraints.h"
-#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
+#include "utils/optional.h"
 
 namespace FlexFlow {
 
@@ -44,8 +46,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
                             CostEstimator const &cost_estimator,
                             MachineSpecification const &resources,
                             std::vector<Substitution> const &substitutions,
-                            UnitySearchConfig const &search_config,
-                            DeviceType device_type) {
+                            UnitySearchConfig const &search_config) {
 
   // NOTE(@wmdi): This mapping is only used for allowed_machine_views
   std::unordered_map<UnmappedOpCostEstimateKey, parallel_layer_guid_t>
@@ -76,7 +77,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
                 pcg,
                 mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at(
                     key)),
-            device_type);
+            DeviceType::GPU);
       },
   };
 
@@ -90,18 +91,14 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
 
   auto optimize_pcg = [&](ParallelComputationGraph const &pcg)
       -> std::pair<GraphOptimizeState, MachineMapping> {
-    std::optional<PCGBinarySPDecomposition> maybe_sp_decomp =
-        get_pcg_balanced_binary_sp_decomposition(pcg);
-
-    if (!maybe_sp_decomp.has_value()) {
-      throw std::runtime_error("Fail to SP-ize PCG");
-    }
-
-    PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value();
-
-    MachineMappingProblemTree problem_tree = get_machine_mapping_problem_tree(pcg, sp_decomp);
-    MachineMappingConstraints constraints = 
-      get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree));
+    PCGBinarySPDecomposition sp_decomp =
+        expect(get_pcg_balanced_binary_sp_decomposition(pcg),
+               "Failed to get SP decomposition of PCG");
+
+    MachineMappingProblemTree problem_tree =
+        get_machine_mapping_problem_tree(pcg, sp_decomp);
+    MachineMappingConstraints constraints =
+        get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree));
 
     MachineMappingResult mm_result = get_optimal_machine_mapping(
         cached_subgraph_costs,
diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc
index b885f4f8ea..2481d84283 100644
--- a/lib/compiler/test/src/allowed_machine_views.cc
+++ b/lib/compiler/test/src/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/unity_algorithm/allowed_machine_views.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "doctest/doctest.h"
 #include "utils/containers/extend.h"
 #include "utils/containers/range.h"
diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc
index 3bc9893f18..0be6d0a048 100644
--- a/lib/compiler/test/src/graph_optimize_state.cc
+++ b/lib/compiler/test/src/graph_optimize_state.cc
@@ -5,77 +5,82 @@
 using namespace FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
-    // TODO(@wmdi): to be udpated
-//   TEST_CASE("GraphOptimizeState::operator==") {
-//     ParallelComputationGraphBuilder builder;
+  // TODO(@wmdi): to be udpated
+  //   TEST_CASE("GraphOptimizeState::operator==") {
+  //     ParallelComputationGraphBuilder builder;
 
-//     ParallelTensorShape input_shape =
-//         ParallelTensorShape{ParallelTensorDims{
-//                                 FFOrdered<ShardParallelDim>{
-//                                     ShardParallelDim{32, 2},
-//                                     ShardParallelDim{16, 1},
-//                                 },
-//                                 ReplicaParallelDimSet{
-//                                     SumDegree{1},
-//                                     DiscardCopyDegree{1},
-//                                 },
-//                             },
-//                             DataType::FLOAT};
+  //     ParallelTensorShape input_shape =
+  //         ParallelTensorShape{ParallelTensorDims{
+  //                                 FFOrdered<ShardParallelDim>{
+  //                                     ShardParallelDim{32, 2},
+  //                                     ShardParallelDim{16, 1},
+  //                                 },
+  //                                 ReplicaParallelDimSet{
+  //                                     SumDegree{1},
+  //                                     DiscardCopyDegree{1},
+  //                                 },
+  //                             },
+  //                             DataType::FLOAT};
 
-//     parallel_tensor_guid_t input0 =
-//         builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
-//     parallel_tensor_guid_t dense0 = builder.dense(input0,
-//                                                   8,
-//                                                   Activation::RELU,
-//                                                   true,
-//                                                   DataType::FLOAT,
-//                                                   std::nullopt,
-//                                                   std::nullopt,
-//                                                   "dense0");
+  //     parallel_tensor_guid_t input0 =
+  //         builder.create_input_tensor(input_shape, CreateGrad::YES,
+  //         "input0");
+  //     parallel_tensor_guid_t dense0 = builder.dense(input0,
+  //                                                   8,
+  //                                                   Activation::RELU,
+  //                                                   true,
+  //                                                   DataType::FLOAT,
+  //                                                   std::nullopt,
+  //                                                   std::nullopt,
+  //                                                   "dense0");
 
-//     parallel_tensor_guid_t dense1 = builder.dense(dense0,
-//                                                   4,
-//                                                   Activation::RELU,
-//                                                   true,
-//                                                   DataType::FLOAT,
-//                                                   std::nullopt,
-//                                                   std::nullopt,
-//                                                   "dense1");
+  //     parallel_tensor_guid_t dense1 = builder.dense(dense0,
+  //                                                   4,
+  //                                                   Activation::RELU,
+  //                                                   true,
+  //                                                   DataType::FLOAT,
+  //                                                   std::nullopt,
+  //                                                   std::nullopt,
+  //                                                   "dense1");
 
-//     ParallelComputationGraph pcg = builder.pcg;
+  //     ParallelComputationGraph pcg = builder.pcg;
 
-//     // `machine_mapping` is determined by the PCG and the device mapping
-//     // algorithm, and `runtime` is determined by the PCG and the device mapping,
-//     // so their values here do not matter.
-//     std::unordered_map<parallel_layer_guid_t, MachineView> empty_machine_views;
-//     MachineMapping empty_machine_mapping(empty_machine_views);
-//     bool result1 =
-//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-//                            0) ==
-//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0);
-//     bool correct1 = true;
-//     CHECK(result1 == correct1);
+  //     // `machine_mapping` is determined by the PCG and the device mapping
+  //     // algorithm, and `runtime` is determined by the PCG and the device
+  //     mapping,
+  //     // so their values here do not matter.
+  //     std::unordered_map<parallel_layer_guid_t, MachineView>
+  //     empty_machine_views; MachineMapping
+  //     empty_machine_mapping(empty_machine_views); bool result1 =
+  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
+  //                            0) ==
+  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
+  //         0);
+  //     bool correct1 = true;
+  //     CHECK(result1 == correct1);
 
-//     ParallelComputationGraphBuilder builder_;
+  //     ParallelComputationGraphBuilder builder_;
 
-//     parallel_tensor_guid_t input0_ =
-//         builder.create_input_tensor(input_shape, CreateGrad::YES, "input0");
-//     parallel_tensor_guid_t dense0_ = builder.dense(input0,
-//                                                    8,
-//                                                    Activation::RELU,
-//                                                    true,
-//                                                    DataType::FLOAT,
-//                                                    std::nullopt,
-//                                                    std::nullopt,
-//                                                    "dense0");
+  //     parallel_tensor_guid_t input0_ =
+  //         builder.create_input_tensor(input_shape, CreateGrad::YES,
+  //         "input0");
+  //     parallel_tensor_guid_t dense0_ = builder.dense(input0,
+  //                                                    8,
+  //                                                    Activation::RELU,
+  //                                                    true,
+  //                                                    DataType::FLOAT,
+  //                                                    std::nullopt,
+  //                                                    std::nullopt,
+  //                                                    "dense0");
 
-//     ParallelComputationGraph pcg_ = builder.pcg;
+  //     ParallelComputationGraph pcg_ = builder.pcg;
 
-//     bool result2 =
-//         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-//                            0) ==
-//         GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0);
-//     bool correct2 = false;
-//     CHECK(result2 == correct2);
-//   }
+  //     bool result2 =
+  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
+  //                            0) ==
+  //         GraphOptimizeState(GraphOptimizeResult(pcg_,
+  //         empty_machine_mapping), 0);
+  //     bool correct2 = false;
+  //     CHECK(result2 == correct2);
+  //   }
 }
diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h
index 377561d70c..8673264d36 100644
--- a/lib/utils/include/utils/optional.h
+++ b/lib/utils/include/utils/optional.h
@@ -32,6 +32,11 @@ T const &assert_unwrap(std::optional<T> const &o) {
   return o.value();
 }
 
+template <typename T>
+T expect(std::optional<T> const &x, std::string const &err) {
+  return unwrap(x, [&]() { throw mk_runtime_error(err); });
+}
+
 } // namespace FlexFlow
 
 #endif

From 6d2fe50d4c9b59063949f3f10dead54c102646f1 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 29 Jan 2025 14:58:49 -0500
Subject: [PATCH 15/16] fixes

---
 .../machine_mapping/machine_mapping.h         |  6 +-
 .../unmapped_op_cost_estimate_key.struct.toml |  4 ++
 .../machine_mapping/machine_mapping_result.h  |  2 +
 .../machine_mapping/machine_mapping.cc        | 64 +++++++++----------
 .../unmapped_op_cost_estimate_key.cc          |  3 +
 .../machine_mapping/machine_mapping_result.cc |  8 +++
 .../unity_algorithm/unity_algorithm.cc        | 41 ++++--------
 7 files changed, 65 insertions(+), 63 deletions(-)

diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 8f9fe23c1c..f17e921f2b 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -12,7 +12,11 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &,
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2);
 
-MachineMapping get_machine_mapping_from_machine_mapping_result(
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path);
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
     PCGBinarySPDecomposition const &, MachineMappingResult const &);
 
 } // namespace FlexFlow
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
index fe76683eb7..7493c68387 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
@@ -11,6 +11,7 @@ includes = [
   "op-attrs/parallel_tensor_shape.dtg.h",
   "<vector>",
   "pcg/machine_view.dtg.h",
+  "pcg/operator_task_space.dtg.h",
 ]
 
 src_includes = [
@@ -34,3 +35,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>"
 name = "output_shapes"
 type = "std::vector<::FlexFlow::ParallelTensorShape>"
 
+[[fields]]
+name = "op_task_space"
+type = "::FlexFlow::OperatorTaskSpace"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index b21fea5f24..db2f4e6f0d 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -31,6 +31,8 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     make_singleton_machine_mapping_result(float runtime,
                                           MachineView const &machine_view);
 
+[[nodiscard]] float get_runtime_cost(MachineMappingResult const &mm_result);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index e54ed925de..5bcab18930 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -4,7 +4,8 @@
 #include "utils/containers/keys.h"
 #include "utils/containers/map_keys.h"
 #include "utils/containers/merge_maps.h"
-#include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 
 namespace FlexFlow {
 
@@ -17,40 +18,39 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
   return are_disjoint(keys(m1.machine_views), keys(m2.machine_views));
 }
 
-MachineMapping get_machine_mapping_from_machine_mapping_result(
-    PCGBinarySPDecomposition const &sp_decomposition,
-    MachineMappingResult const &mm_result) {
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path) {
+  std::optional<PCGBinarySPDecomposition> subtree_optional =
+      get_subtree_at_path(
+          sp_decomposition, generic_impl_for_pcg_sp_tree(), path);
+
+  if (!subtree_optional.has_value()) {
+    throw std::runtime_error(fmt::format("Invalid tree path {}", path));
+  }
 
-  BinarySPDecompositionTree sp_tree =
-      binary_sp_tree_from_pcg_sp_tree(sp_decomposition);
-
-  auto get_layer_from_path =
-      [&](BinaryTreePath const &path) -> parallel_layer_guid_t {
-    std::optional<BinarySPDecompositionTree> subtree_optional =
-        binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path);
-    if (!subtree_optional.has_value()) {
-      throw std::runtime_error(fmt::format("Invalid tree path {}", path));
-    }
-    BinarySPDecompositionTree subtree = subtree_optional.value();
-    if (!subtree.is_node()) {
-      throw std::runtime_error(fmt::format(
-          "Invalid tree path to a leaf: found {} instead", subtree));
-    }
-    return parallel_layer_guid_t{
-        subtree.require_node(),
-    };
-  };
-
-  std::unordered_map<parallel_layer_guid_t, MachineView> mm;
-
-  if (mm_result.raw_result) {
-    FeasibleMachineMappingResult const &feasible_mm_result =
-        mm_result.raw_result.value();
-    mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping,
-                  get_layer_from_path);
+  PCGBinarySPDecomposition subtree = subtree_optional.value();
+  if (!subtree.is_leaf()) {
+    throw std::runtime_error(
+        fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
   }
+  return subtree.require_leaf();
+}
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &sp_decomposition,
+    MachineMappingResult const &mm_result) {
 
-  return MachineMapping{mm};
+  return transform(
+      mm_result.raw_result,
+      [&](FeasibleMachineMappingResult const &feasible_mm_result) {
+        return MachineMapping{
+            map_keys(feasible_mm_result.machine_mapping.raw_mapping,
+                     [&](BinaryTreePath const &path) {
+                       return get_layer_from_path(sp_decomposition, path);
+                     }),
+        };
+      });
 }
 
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
index 990b287f8b..b6d701cb98 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc
@@ -1,4 +1,5 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "pcg/operator_task_space.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
 
@@ -18,6 +19,8 @@ UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer(
       transform(get_incoming_weights(pcg, layer), get_tensor_shape),
       /*output_shapes=*/
       transform(get_layer_outputs(pcg, layer), get_tensor_shape),
+      /*op_task_space=*/
+      get_operator_task_space(pcg, layer),
   };
 }
 
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
index 3409f7f871..031b7f7fc5 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc
@@ -135,4 +135,12 @@ MachineMappingResult
   };
 }
 
+float get_runtime_cost(MachineMappingResult const &mm_result) {
+  if (mm_result.raw_result == std::nullopt) {
+    return std::numeric_limits<float>::infinity();
+  } else {
+    return mm_result.raw_result.value().runtime;
+  }
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
index 01c9c645a6..3e2b2188b4 100644
--- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
+++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -7,6 +7,7 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
 #include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h"
 #include "compiler/unity_algorithm/graph_optimize_state.h"
 #include "pcg/machine_specification.dtg.h"
@@ -48,20 +49,6 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
                             std::vector<Substitution> const &substitutions,
                             UnitySearchConfig const &search_config) {
 
-  // NOTE(@wmdi): This mapping is only used for allowed_machine_views
-  std::unordered_map<UnmappedOpCostEstimateKey, parallel_layer_guid_t>
-      mapping_from_unmapped_op_cost_estimate_key_parallel_layer = [&] {
-        std::unordered_map<UnmappedOpCostEstimateKey, parallel_layer_guid_t>
-            mapping;
-        for (parallel_layer_guid_t layer : get_parallel_layers(pcg)) {
-          // NOTE(@wmdi): Assume layers with the same key have the same allowed
-          // machine views
-          mapping.insert(
-              {get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), layer});
-        }
-        return mapping;
-      }();
-
   MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache();
   DeduplicatedPriorityQueue<GraphOptimizeState> candidates;
 
@@ -72,25 +59,12 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
           MachineSpecification const &resources)
           -> std::unordered_set<MachineView> {
         return get_allowed_machine_views(
-            resources,
-            get_operator_task_space(
-                pcg,
-                mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at(
-                    key)),
-            DeviceType::GPU);
+            resources, key.op_task_space, DeviceType::GPU);
       },
   };
 
-  auto get_runtime_cost = [](MachineMappingResult const &mm_result) {
-    if (mm_result.raw_result == std::nullopt) {
-      return std::numeric_limits<float>::infinity();
-    } else {
-      return mm_result.raw_result.value().runtime;
-    }
-  };
-
   auto optimize_pcg = [&](ParallelComputationGraph const &pcg)
-      -> std::pair<GraphOptimizeState, MachineMapping> {
+      -> std::pair<GraphOptimizeState, std::optional<MachineMapping>> {
     PCGBinarySPDecomposition sp_decomp =
         expect(get_pcg_balanced_binary_sp_decomposition(pcg),
                "Failed to get SP decomposition of PCG");
@@ -148,9 +122,16 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg,
     }
   }
 
+  std::optional<MachineMapping> best_mapping =
+      optimize_pcg(best_state.pcg).second;
+
+  if (best_mapping == std::nullopt) {
+    throw std::runtime_error("Failed to find any solutions");
+  }
+
   return SearchResult{
       /*pcg=*/best_state.pcg,
-      /*machine_mapping=*/optimize_pcg(best_state.pcg).second,
+      /*machine_mapping=*/best_mapping.value(),
   };
 }
 

From 45a931c997e696f577e9a97599b95c2db95a66a8 Mon Sep 17 00:00:00 2001
From: wmdi <mengdiwu@andrew.cmu.edu>
Date: Wed, 29 Jan 2025 21:01:00 -0500
Subject: [PATCH 16/16] fix

---
 .../machine_mapping}/allowed_machine_views.cc |   0
 .../get_optimal_machine_mapping.cc            |   5 +
 .../get_machine_mapping_problem_tree.cc       |  18 +++
 ...get_optimal_machine_mapping_with_memory.cc |   4 +
 .../unity_algorithm/graph_optimize_state.cc   |  92 ++++++++++++++
 .../unity_algorithm/unity_algorithm.cc        | 115 ++++++++++++++++++
 lib/compiler/test/src/graph_optimize_state.cc |  86 -------------
 lib/compiler/test/src/unity_algorithm.cc      |  27 ----
 8 files changed, 234 insertions(+), 113 deletions(-)
 rename lib/compiler/test/src/{ => compiler/machine_mapping}/allowed_machine_views.cc (100%)
 create mode 100644 lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
 create mode 100644 lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc
 delete mode 100644 lib/compiler/test/src/graph_optimize_state.cc
 delete mode 100644 lib/compiler/test/src/unity_algorithm.cc

diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc
similarity index 100%
rename from lib/compiler/test/src/allowed_machine_views.cc
rename to lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index f5d5a5ee1b..4b77b3eebd 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -97,11 +97,15 @@ TEST_SUITE(FF_TEST_SUITE) {
       }
     };
 
+    // Operator task spaces are not used in this test. Just make a placeholder.
+    OperatorTaskSpace fake_op_task_space = OperatorTaskSpace {{}};
+
     UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{
         /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}},
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{
@@ -114,6 +118,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     ParallelTensorShape tensor_shape1 = ParallelTensorShape{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
index 06ab1e5b8c..ee71222fe3 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc
@@ -1,7 +1,11 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "utils/containers/get_only.h"
+#include "utils/containers/vector_of.h"
+#include "utils/containers/extend.h"
+#include "pcg/operator_task_space.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
@@ -93,6 +97,14 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{}};
 
+    auto make_operator_task_space = [&](ParallelTensorShape const &shape) {
+      std::vector<int> degrees;
+      extend(degrees, vector_of(ff_ordered_shard_degrees(shape)));
+      degrees.push_back(get_sum_degree(shape));
+      degrees.push_back(get_discard_copy_degree(shape));
+      return OperatorTaskSpace{degrees};
+    };
+
     auto make_input_key =
         [&](ParallelTensorShape const &parallel_tensor_shape) {
           return UnmappedOpCostEstimateKey{
@@ -100,6 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) {
               /*input_shapes=*/{},
               /*weight_shapes=*/{},
               /*output_shapes=*/{parallel_tensor_shape},
+              /*op_task_space=*/make_operator_task_space(parallel_tensor_shape),
           };
         };
 
@@ -149,11 +162,14 @@ TEST_SUITE(FF_TEST_SUITE) {
       parallel_layer_guid_t relu_layer = relu_added.parallel_layer;
       parallel_tensor_guid_t relu_output = get_only(relu_added.outputs);
 
+      OperatorTaskSpace relu_task_space = get_operator_task_space(pcg, relu_layer);
+
       UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{
           /*op_attrs=*/relu_attrs,
           /*input_shapes=*/{input_shape},
           /*weight_shapes=*/{},
           /*output_shapes=*/{relu_output_shape},
+          /*op_task_space=*/relu_task_space,
       };
 
       PCGBinarySPDecomposition sp_decomposition = pcg_make_series(
@@ -234,11 +250,13 @@ TEST_SUITE(FF_TEST_SUITE) {
                              {input1_tensor, input2_tensor},
                              {make_output_attrs(ew_op_output_shape)});
       parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer;
+      OperatorTaskSpace ew_op_task_space = get_operator_task_space(pcg, ew_op_layer);
       UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{
           /*op_attrs=*/ew_op_attrs,
           /*input_shapes=*/{input_shape, input_shape},
           /*weight_shapes=*/{},
           /*output_shapes=*/{ew_op_output_shape},
+          /*op_task_space=*/ew_op_task_space,
       };
 
       PCGBinarySPDecomposition sp_decomposition =
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 8761116be2..cc1a1043cf 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -96,12 +96,15 @@ TEST_SUITE(FF_TEST_SUITE) {
         return std::unordered_set<MachineView>{mv2};
       }
     };
+        
+    OperatorTaskSpace fake_op_task_space = OperatorTaskSpace {{}};
 
     UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{
         /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}},
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{
@@ -114,6 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) {
         /*input_shapes=*/{},
         /*weight_shapes=*/{},
         /*output_shapes=*/{},
+        /*op_task_space=*/fake_op_task_space,
     };
 
     ParallelTensorShape tensor_shape1 = ParallelTensorShape{
diff --git a/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
new file mode 100644
index 0000000000..0d28cecac7
--- /dev/null
+++ b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc
@@ -0,0 +1,92 @@
+#include "compiler/unity_algorithm/graph_optimize_state.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("GraphOptimizeState::operator== and operator!=") {
+    ParallelComputationGraph pcg1 = empty_parallel_computation_graph();
+    ParallelComputationGraph pcg2 = empty_parallel_computation_graph();
+
+
+    ParallelTensorShape input_shape = ParallelTensorShape{
+        ParallelTensorDims{
+            FFOrdered<ShardParallelDim>{
+                ShardParallelDim{10, 1},
+            },
+            ReplicaParallelDimSet{
+                SumDegree{1},
+                DiscardCopyDegree{1},
+            },
+        },
+        DataType::FLOAT,
+    };
+
+    auto make_output_attrs = [](ParallelTensorShape const &shape) {
+      return ParallelTensorAttrs{
+          /*shape=*/shape,
+          /*sync_type=*/std::nullopt,
+          /*initializer=*/std::nullopt,
+          /*create_gradients=*/CreateGrad::YES,
+      };
+    };
+
+    auto make_layer_attrs = [](PCGOperatorAttrs const &op_attrs) {
+      return ParallelLayerAttrs{
+          /*op_attrs=*/op_attrs,
+          /*name=*/std::nullopt,
+      };
+    };
+
+    PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{}};
+    
+    add_parallel_layer(
+          pcg2,
+          /*layer_attrs=*/make_layer_attrs(input_attrs),
+          /*inputs=*/{},
+          /*output_labels=*/{make_output_attrs(input_shape)});
+    
+    SUBCASE("same pcgs") {
+      GraphOptimizeState state1 = GraphOptimizeState(pcg1, 0.0);
+      GraphOptimizeState state2 = GraphOptimizeState(pcg1, 0.0);
+      bool result_eq = state1 == state2;
+      bool expected_eq = true;
+      CHECK(result_eq == expected_eq);
+      bool result_neq = state1 != state2;
+      bool expected_neq = false;
+      CHECK(result_neq == expected_neq);
+    }
+
+    SUBCASE("different pcgs with the same runtime") {
+      GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0);
+      GraphOptimizeState state2 = GraphOptimizeState(pcg2, 1.0);
+      bool result_eq = state1 == state2;
+      bool expected_eq = false;
+      CHECK(result_eq == expected_eq);
+      bool result_neq = state1 != state2;
+      bool expected_neq = true;
+      CHECK(result_neq == expected_neq);
+    }
+
+    SUBCASE("different pcgs with different runtime") {
+      GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0);
+      GraphOptimizeState state2 = GraphOptimizeState(pcg2, 2.0);
+      bool result_eq = state1 == state2;
+      bool expected_eq = false;
+      CHECK(result_eq == expected_eq);
+      bool result_neq = state1 != state2;
+      bool expected_neq = true;
+      CHECK(result_neq == expected_neq);
+    }
+  }
+
+  TEST_CASE("GraphOptimizeState::operator<") {
+    ParallelComputationGraph pcg1 = empty_parallel_computation_graph();
+    ParallelComputationGraph pcg2 = empty_parallel_computation_graph();
+    GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0);
+    GraphOptimizeState state2 = GraphOptimizeState(pcg2, 2.0);
+    bool result = state1 < state2;
+    bool expected = true;
+    CHECK(result == expected);
+  }
+}
diff --git a/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc
new file mode 100644
index 0000000000..447a6a04b2
--- /dev/null
+++ b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc
@@ -0,0 +1,115 @@
+#include "compiler/unity_algorithm/unity_algorithm.h"
+#include "pcg/computation_graph_builder.h"
+#include "../machine_mapping/cost_estimator_for_test.h"
+#include "op-attrs/parallel_tensor_dims.h"
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "op-attrs/shard_parallel_dim.h"
+#include "op-attrs/replica_type.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
+#include "utils/integer_conversions.h"
+#include "doctest/doctest.h"
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("graph_optimize") {
+    // TODO: recover this by implementing parallel_computation_graph_from_computation_graph
+    // ComputationGraph cg = [&] {
+    //   ComputationGraphBuilder b;
+    //   TensorShape input_tensor_shape = TensorShape{
+    //     TensorDims{
+    //       FFOrdered<size_t> {32, 64},
+    //     },
+    //     DataType::FLOAT,
+    //   };
+    //   tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES);
+    //   t = b.dense(t,
+    //               /*outDim=*/16,
+    //               /*activation=*/std::nullopt);
+    //   t = b.gelu(t);
+    //   t = b.dense(t,
+    //               /*outDim=*/12,
+    //               /*activation=*/std::nullopt,
+    //               /*use_bias=*/false,
+    //               /*data_type=*/DataType::FLOAT,
+    //               /*kernel_initializer=*/std::nullopt,
+    //               /*bias_initializer=*/std::nullopt);
+    //   t = b.relu(t);
+    //   t = b.dense(t,
+    //               /*outDim=*/8,
+    //               /*activation=*/Activation::RELU);
+    //   return b.computation_graph;
+    // }();
+
+    // ParallelComputationGraph pcg = parallel_computation_graph_from_computation_graph(cg);
+
+    ParallelComputationGraph pcg = [&] {
+      ParallelComputationGraphBuilder b;
+      int in_channels = 24;
+      int batch_size = 4;
+      int batch_degree = 2;
+      parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{
+          ParallelTensorDims{
+              FFOrdered<ShardParallelDim>{
+                  ShardParallelDim{size_t_from_int(batch_size), batch_degree},
+                  ShardParallelDim{size_t_from_int(in_channels), 1},
+              },
+              ReplicaParallelDimSet{
+                  SumDegree{1},
+                  DiscardCopyDegree{1},
+              },
+          },
+          DataType::FLOAT,
+      });
+      t = b.dense(t,
+                  /*outDim=*/16,
+                  /*activation=*/std::nullopt);
+      t = b.gelu(t);
+      t = b.dense(t,
+                  /*outDim=*/12,
+                  /*activation=*/std::nullopt,
+                  /*use_bias=*/false,
+                  /*data_type=*/DataType::FLOAT,
+                  /*kernel_initializer=*/std::nullopt,
+                  /*bias_initializer=*/std::nullopt);
+      t = b.relu(t);
+      t = b.dense(t,
+                  /*outDim=*/8,
+                  /*activation=*/Activation::RELU);
+
+      return b.pcg;
+    }();
+
+    CostEstimator cost_estimator = make_fake_cost_estimator([](OpCostEstimateKey const &k) {
+      return OpCostMetrics{
+        /*runtime=*/1.0,
+        /*memory=*/1,
+      };
+    },
+    [](TensorSetMovement const &) {
+      return 1.0;
+    });
+
+    MachineSpecification full_machine_spec = MachineSpecification{
+        /*num_nodes=*/2,
+        /*num_cpus_per_node=*/1,
+        /*num_gpus_per_node=*/1,
+        /*inter_node_bandwidth=*/1,
+        /*intra_node_bandwidth=*/1,
+    };
+
+    // TODO: set up substitutions
+    std::vector<Substitution> substitutions = {};
+
+    UnitySearchConfig search_config = UnitySearchConfig{
+        /*alpha=*/1.0,
+        /*budget=*/20,
+        /*threshold=*/1000.0,
+        /*max_num_ops=*/100,
+    };
+
+    // SearchResult result = graph_optimize(pcg, cost_estimator, full_machine_spec, substitutions, search_config);
+
+    // TODO: check the result
+  }
+}
diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc
deleted file mode 100644
index 0be6d0a048..0000000000
--- a/lib/compiler/test/src/graph_optimize_state.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "compiler/unity_algorithm/graph_optimize_state.h"
-#include "doctest/doctest.h"
-#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
-
-using namespace FlexFlow;
-
-TEST_SUITE(FF_TEST_SUITE) {
-  // TODO(@wmdi): to be udpated
-  //   TEST_CASE("GraphOptimizeState::operator==") {
-  //     ParallelComputationGraphBuilder builder;
-
-  //     ParallelTensorShape input_shape =
-  //         ParallelTensorShape{ParallelTensorDims{
-  //                                 FFOrdered<ShardParallelDim>{
-  //                                     ShardParallelDim{32, 2},
-  //                                     ShardParallelDim{16, 1},
-  //                                 },
-  //                                 ReplicaParallelDimSet{
-  //                                     SumDegree{1},
-  //                                     DiscardCopyDegree{1},
-  //                                 },
-  //                             },
-  //                             DataType::FLOAT};
-
-  //     parallel_tensor_guid_t input0 =
-  //         builder.create_input_tensor(input_shape, CreateGrad::YES,
-  //         "input0");
-  //     parallel_tensor_guid_t dense0 = builder.dense(input0,
-  //                                                   8,
-  //                                                   Activation::RELU,
-  //                                                   true,
-  //                                                   DataType::FLOAT,
-  //                                                   std::nullopt,
-  //                                                   std::nullopt,
-  //                                                   "dense0");
-
-  //     parallel_tensor_guid_t dense1 = builder.dense(dense0,
-  //                                                   4,
-  //                                                   Activation::RELU,
-  //                                                   true,
-  //                                                   DataType::FLOAT,
-  //                                                   std::nullopt,
-  //                                                   std::nullopt,
-  //                                                   "dense1");
-
-  //     ParallelComputationGraph pcg = builder.pcg;
-
-  //     // `machine_mapping` is determined by the PCG and the device mapping
-  //     // algorithm, and `runtime` is determined by the PCG and the device
-  //     mapping,
-  //     // so their values here do not matter.
-  //     std::unordered_map<parallel_layer_guid_t, MachineView>
-  //     empty_machine_views; MachineMapping
-  //     empty_machine_mapping(empty_machine_views); bool result1 =
-  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-  //                            0) ==
-  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-  //         0);
-  //     bool correct1 = true;
-  //     CHECK(result1 == correct1);
-
-  //     ParallelComputationGraphBuilder builder_;
-
-  //     parallel_tensor_guid_t input0_ =
-  //         builder.create_input_tensor(input_shape, CreateGrad::YES,
-  //         "input0");
-  //     parallel_tensor_guid_t dense0_ = builder.dense(input0,
-  //                                                    8,
-  //                                                    Activation::RELU,
-  //                                                    true,
-  //                                                    DataType::FLOAT,
-  //                                                    std::nullopt,
-  //                                                    std::nullopt,
-  //                                                    "dense0");
-
-  //     ParallelComputationGraph pcg_ = builder.pcg;
-
-  //     bool result2 =
-  //         GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping),
-  //                            0) ==
-  //         GraphOptimizeState(GraphOptimizeResult(pcg_,
-  //         empty_machine_mapping), 0);
-  //     bool correct2 = false;
-  //     CHECK(result2 == correct2);
-  //   }
-}
diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc
deleted file mode 100644
index d8523f6659..0000000000
--- a/lib/compiler/test/src/unity_algorithm.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "compiler/unity_algorithm/unity_algorithm.h"
-#include "doctest/doctest.h"
-
-TEST_SUITE(FF_TEST_SUITE) {
-  // TODO: to be udpated
-  // Rapidcheck does not work for now
-  // TEST_CASE("graph_optimize") {
-  //   RC_SUBCASE([](ComputationGraph const &g,
-  //                float alpha,
-  //                int budget,
-  //                float threshold,
-  //                int max_num_ops) {
-  //     Strategy s = graph_optimize(
-  //         g,
-  //         TestCostEstimator{},
-  //         MachineSpecification{1, 1, 4, 0.1, 0.2},
-  //         [](Operator const &, MachineSpecification const &) {
-  //           return std::unordered_set<MachineView>{make_1d_machine_view(0, 1,
-  //           1)};
-  //         },
-  //         OptimizerConfig{alpha, budget, threshold, max_num_ops});
-  //     RC_ASSERT(get_nodes(s.pcg).size() > 0);
-  //     RC_ASSERT(s.machine_mapping.runtime > 0);
-  //     RC_ASSERT(keys(s.machine_mapping.machine_views) == get_nodes(s.pcg));
-  //   });
-  // }
-}