From 0d639aca9d6dc6ac975f427da13b25c5bd8d4c35 Mon Sep 17 00:00:00 2001 From: wmdi Date: Sun, 13 Oct 2024 18:10:01 -0400 Subject: [PATCH 01/16] initial implementation of memory algorithm --- .../compiler/cost_estimator/cost_estimator.h | 9 +-- .../compiler/cost_estimator/cost_metric.h | 28 ++++++++ .../cost_estimator/cost_metric.struct.toml | 18 ++++++ ...easible_machine_mapping_result.struct.toml | 5 +- .../get_optimal_machine_mapping.h | 18 ++++-- .../machine_mapping_config.struct.toml | 13 ++++ .../machine_mapping/machine_mapping_result.h | 20 ++++-- .../machine_mapping_state.struct.toml | 10 +++ .../machine_memory_constraints.struct.toml | 13 ++++ .../compiler/cost_estimator/cost_estimator.cc | 4 +- .../compiler/cost_estimator/cost_metric.cc | 55 ++++++++++++++++ .../get_optimal_machine_mapping.cc | 56 +++++++++++----- .../machine_mapping/machine_mapping_result.cc | 64 +++++++++++++++---- 13 files changed, 269 insertions(+), 44 deletions(-) create mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.h create mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml create mode 100644 lib/compiler/src/compiler/cost_estimator/cost_metric.cc diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index 65bae0c76a..55311af83b 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H +#include "compiler/cost_estimator/cost_metric.dtg.h" #include 
"compiler/cost_estimator/op_cost_estimate_key.dtg.h" #include "compiler/cost_estimator/tensor_set_movement.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -11,8 +12,8 @@ namespace FlexFlow { struct ICostEstimator { - virtual float estimate_cost(OpCostEstimateKey const &) const = 0; - virtual float estimate_cost(TensorSetMovement const &) const = 0; + virtual CostMetric estimate_cost(OpCostEstimateKey const &) const = 0; + virtual CostMetric estimate_cost(TensorSetMovement const &) const = 0; ICostEstimator() = default; ICostEstimator(ICostEstimator const &) = delete; @@ -23,8 +24,8 @@ struct ICostEstimator { CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { - float estimate_cost(OpCostEstimateKey const &k) const; - float estimate_cost(TensorSetMovement const &m) const; + CostMetric estimate_cost(OpCostEstimateKey const &k) const; + CostMetric estimate_cost(TensorSetMovement const &m) const; template static typename std::enable_if::value, diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.h b/lib/compiler/include/compiler/cost_estimator/cost_metric.h new file mode 100644 index 0000000000..98b0cb228d --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/cost_metric.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H + +#include "compiler/cost_estimator/cost_metric.dtg.h" +#include + +namespace FlexFlow { + +CostMetric zero_cost_metric(); + +CostMetric combine_cost_metrics_inter_device(CostMetric const &c1, + CostMetric const &c2); +CostMetric + combine_cost_metrics_inter_device(std::vector const &costs); + +CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1, + CostMetric const &c2); +CostMetric combine_cost_metrics_intra_device_sequential( + std::vector const &costs); + +CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1, + CostMetric 
const &c2); +CostMetric combine_cost_metrics_intra_device_parallel( + std::vector const &costs); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml b/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml new file mode 100644 index 0000000000..0666bb9e11 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "CostMetric" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ +] + +[[fields]] +name = "runtime" +type = "float" + +[[fields]] +name = "memory" +type = "size_t" diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml index e71cfc540f..07dc30d2fc 100644 --- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml @@ -8,11 +8,12 @@ features = [ includes = [ "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", + "compiler/cost_estimator/cost_metric.dtg.h", ] [[fields]] -name = "runtime" -type = "float" +name = "cost" +type = "::FlexFlow::CostMetric" [[fields]] name = "machine_mapping" diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index 62da90bfcb..cd4896e260 100644 --- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -2,11 +2,13 @@ #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_GET_OPTIMAL_MACHINE_MAPPING_H #include "compiler/machine_mapping/machine_mapping_cache.dtg.h" +#include "compiler/machine_mapping/machine_mapping_config.dtg.h" #include 
"compiler/machine_mapping/machine_mapping_constraints.dtg.h" #include "compiler/machine_mapping/machine_mapping_context.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "pcg/machine_specification.dtg.h" @@ -17,7 +19,9 @@ MachineMappingResult MachineMappingContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, - MachineMappingConstraints const &constraints); + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config); MachineMappingResult get_optimal_machine_mapping(MachineMappingCache &result_cache, @@ -25,22 +29,28 @@ MachineMappingResult MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, std::optional const - &parallel_split_transformation); + &parallel_split_transformation, + MachineMappingConfig const &config); MachineMappingResult get_optimal_machine_mapping( MachineMappingCache &result_cache, MachineMappingContext const &context, MMProblemTreeParallelSplit const &parallel_split, MachineSpecification const &resources, - MachineMappingConstraints const &constraints); + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config); MachineMappingResult get_optimal_machine_mapping(MachineMappingCache &result_cache, MachineMappingContext const &, UnmappedOpCostEstimateKey const &leaf, MachineSpecification 
const &resources, - MachineMappingConstraints const &constraints); + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml new file mode 100644 index 0000000000..f4c0b61291 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "MachineMappingConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [] + +[[fields]] +name = "enable_memory_optimization" +type = "bool" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index b21fea5f24..642d48ec02 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -3,6 +3,8 @@ #include "compiler/machine_mapping/machine_mapping_result.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h" +#include "compiler/machine_mapping/machine_mapping_config.dtg.h" namespace FlexFlow { @@ -14,22 +16,32 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); std::unordered_set const &); [[nodiscard]] MachineMappingResult - series_combine(float comm_cost, + series_combine(MachineMappingConfig const &config, + MachineMemoryConstraints const &memory_constraints, + CostMetric const &comm_cost, MachineMappingResult const &pre_result, MachineMappingResult const &post_result, std::optional const &parallel_split_transformation); [[nodiscard]] MachineMappingResult - parallel_combine(MachineMappingResult const &lhs_result, 
+ parallel_combine(MachineMappingConfig const &config, + MachineMemoryConstraints const &memory_constraints, + MachineMappingResult const &lhs_result, MachineMappingResult const &rhs_result); [[nodiscard]] MachineMappingResult minimize_runtime(MachineMappingResult const &m1, MachineMappingResult const &m2); +[[nodiscard]] MachineMappingResult make_singleton_machine_mapping_result( + MachineMappingConfig const &config, + MachineMemoryConstraints const &memory_constraints, + CostMetric const &cost, + MachineView const &machine_view); + [[nodiscard]] MachineMappingResult - make_singleton_machine_mapping_result(float runtime, - MachineView const &machine_view); + machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints, + MachineMappingResult const &result); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml index 1346f6ebe7..b4a6147b5a 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml @@ -9,7 +9,9 @@ features = [ includes = [ "pcg/machine_specification.dtg.h", "compiler/machine_mapping/machine_mapping_constraints.dtg.h", + "compiler/machine_mapping/machine_mapping_config.dtg.h", "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", + "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h", ] [[fields]] @@ -23,3 +25,11 @@ type = "::FlexFlow::MachineSpecification" [[fields]] name = "constraints" type = "::FlexFlow::MachineMappingConstraints" + +[[fields]] +name = "memory_constraints" +type = "::FlexFlow::MachineMemoryConstraints" + +[[fields]] +name = "config" +type = "::FlexFlow::MachineMappingConfig" diff --git 
a/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml new file mode 100644 index 0000000000..0d2572c783 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "MachineMemoryConstraints" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [] + +[[fields]] +name = "memory_limit" +type = "size_t" diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 051ffcd190..10e999dc1a 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -5,11 +5,11 @@ namespace FlexFlow { CostEstimator::CostEstimator(std::shared_ptr implementation_ptr) : implementation_ptr(implementation_ptr) {} -float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +CostMetric CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->implementation_ptr->estimate_cost(k); } -float CostEstimator::estimate_cost(TensorSetMovement const &m) const { +CostMetric CostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->implementation_ptr->estimate_cost(m); } diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc new file mode 100644 index 0000000000..370afab406 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc @@ -0,0 +1,55 @@ +#include "compiler/cost_estimator/cost_metric.h" + +namespace FlexFlow { + +CostMetric zero_cost_metric() { + return CostMetric{ + /*runtime=*/0, + /*memory=*/0, + }; +} + +CostMetric combine_cost_metrics_inter_device(CostMetric const &c1, + CostMetric const &c2) { + return 
CostMetric{c1.runtime + c2.runtime, c1.memory + c2.memory}; +} + +CostMetric + combine_cost_metrics_inter_device(std::vector const &costs) { + CostMetric result = zero_cost_metric(); + for (CostMetric const &cost : costs) { + result = combine_cost_metrics_inter_device(result, cost); + } + return result; +} + +CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1, + CostMetric const &c2) { + return CostMetric{c1.runtime + c2.runtime, std::max(c1.memory, c2.memory)}; +} + +CostMetric combine_cost_metrics_intra_device_sequential( + std::vector const &costs) { + CostMetric result = zero_cost_metric(); + for (CostMetric const &cost : costs) { + result = combine_cost_metrics_intra_device_sequential(result, cost); + } + return result; +} + +CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1, + CostMetric const &c2) { + return CostMetric{std::max(c1.runtime, c2.runtime), + std::max(c1.memory, c2.memory)}; +} + +CostMetric combine_cost_metrics_intra_device_parallel( + std::vector const &costs) { + CostMetric result = zero_cost_metric(); + for (CostMetric const &cost : costs) { + result = combine_cost_metrics_intra_device_parallel(result, cost); + } + return result; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 10abd7ff90..a1a1595d98 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -29,12 +29,16 @@ MachineMappingResult MachineMappingContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, - MachineMappingConstraints const &constraints) { + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config) { MachineMappingState state = 
MachineMappingState{ problem_tree, resources, constraints, + memory_constraints, + config, }; { @@ -54,14 +58,18 @@ MachineMappingResult series_split, resources, constraints, - /*parallel_split_transformation=*/std::nullopt); + memory_constraints, + /*parallel_split_transformation=*/std::nullopt, + config); }, [&](auto const &decomp_tree_node) { return get_optimal_machine_mapping(result_cache, context, decomp_tree_node, resources, - constraints); + constraints, + memory_constraints, + config); }, }); @@ -75,8 +83,10 @@ MachineMappingResult MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, std::optional const - &parallel_split_transformation) { + &parallel_split_transformation, + MachineMappingConfig const &config) { auto get_boundary_machine_view_assignments = [&](std::unordered_set const &boundary_layers) @@ -110,7 +120,9 @@ MachineMappingResult context, series_split.get_left_child(), resources, - pre_candidate); + pre_candidate, + memory_constraints, + config); return pre_result; }; @@ -126,7 +138,9 @@ MachineMappingResult context, series_split.get_right_child(), resources, - post_candidate); + post_candidate, + memory_constraints, + config); return post_result; }; @@ -155,11 +169,13 @@ MachineMappingResult tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - float cost_across_split = + CostMetric cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, - series_combine(cost_across_split, + series_combine(config, + memory_constraints, + cost_across_split, pre_result, post_result, parallel_split_transformation)); @@ -174,7 +190,9 @@ MachineMappingResult get_optimal_machine_mapping( MachineMappingContext const &context, MMProblemTreeParallelSplit const &parallel_split, MachineSpecification const &resources, - 
MachineMappingConstraints 
const &constraints) { + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config) { MachineMappingProblemTree lhs = parallel_split.get_left_child(); MachineMappingProblemTree rhs = parallel_split.get_right_child(); @@ -191,7 +209,9 @@ MachineMappingResult get_optimal_machine_mapping( series_split, resources, constraints, - ParallelSplitTransformation::LthenR); + memory_constraints, + ParallelSplitTransformation::LthenR, + config); }(); MachineMappingConstraints left_constraints = @@ -203,15 +223,17 @@ MachineMappingResult get_optimal_machine_mapping( [&](std::pair const &resource_split) { MachineMappingResult left_result = get_optimal_machine_mapping( - result_cache, context, lhs, resource_split.first, left_constraints); + result_cache, context, lhs, resource_split.first, left_constraints, memory_constraints, config); MachineMappingResult right_result = get_optimal_machine_mapping(result_cache, context, rhs, resource_split.second, - right_constraints); + right_constraints, + memory_constraints, + config); - return parallel_combine(left_result, right_result); + return parallel_combine(config, memory_constraints, left_result, right_result); }; std::unordered_set parallel_results = transform( @@ -226,7 +248,9 @@ MachineMappingResult MachineMappingContext const &context, UnmappedOpCostEstimateKey const &leaf, MachineSpecification const &resource, - MachineMappingConstraints const &constraints) { + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config) { std::unordered_set candidates = [&] { std::optional machine_view = require_only_root(constraints); @@ -240,9 +264,9 @@ MachineMappingResult auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key(leaf, machine_view); - float cost = context.cost_estimator.estimate_cost(mapped); + CostMetric 
cost = context.cost_estimator.estimate_cost(mapped); - return make_singleton_machine_mapping_result(cost, machine_view); + return make_singleton_machine_mapping_result(config, memory_constraints, cost, machine_view); }; std::unordered_set candidate_results = diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..18e5049022 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -1,4 +1,5 @@ #include "compiler/machine_mapping/machine_mapping_result.h" +#include "compiler/cost_estimator/cost_metric.h" #include "compiler/machine_mapping/machine_mapping.h" #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h" #include "utils/containers/map_keys.h" @@ -32,7 +33,9 @@ FeasibleMachineMappingResult } MachineMappingResult - series_combine(float comm_cost, + series_combine(MachineMappingConfig const &config, + MachineMemoryConstraints const &memory_constraints, + CostMetric const &comm_cost, MachineMappingResult const &maybe_pre_result, MachineMappingResult const &maybe_post_result, std::optional const @@ -63,16 +66,25 @@ MachineMappingResult } }(); - return MachineMappingResult{ + MachineMappingResult result_without_memory_check = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/pre_result.runtime + comm_cost + post_result.runtime, + /*cost=*/combine_cost_metrics_inter_device( + {pre_result.cost, comm_cost, post_result.cost}), /*machine_mapping=*/mapping, }, }; + + if (config.enable_memory_optimization) { + return machine_mapping_memory_check(memory_constraints, result_without_memory_check); + } else { + return result_without_memory_check; + } } MachineMappingResult - parallel_combine(MachineMappingResult const &maybe_lhs_result, + parallel_combine(MachineMappingConfig const &config, + MachineMemoryConstraints const 
&memory_constraints, + MachineMappingResult const &maybe_lhs_result, MachineMappingResult const &maybe_rhs_result) { FeasibleMachineMappingResult lhs_result = ({ if (is_infeasible(maybe_lhs_result)) { @@ -88,14 +100,21 @@ MachineMappingResult require_feasible(maybe_rhs_result); }); - return MachineMappingResult{ + MachineMappingResult result_without_memory_check = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/std::max(lhs_result.runtime, rhs_result.runtime), + /*cost=*/combine_cost_metrics_intra_device_parallel(lhs_result.cost, + rhs_result.cost), /*machine_mapping=*/ binary_combine_mappings(/*lhs=*/lhs_result.machine_mapping, /*rhs=*/rhs_result.machine_mapping), }, }; + + if (config.enable_memory_optimization) { + return machine_mapping_memory_check(memory_constraints, result_without_memory_check); + } else { + return result_without_memory_check; + } } MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, @@ -114,25 +133,46 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, require_feasible(maybe_m2); }); - if (m2.runtime < m1.runtime) { + if (m2.cost.runtime < m1.cost.runtime) { return maybe_m2; } else { return maybe_m1; } } -MachineMappingResult - make_singleton_machine_mapping_result(float runtime, - MachineView const &machine_view) { - return MachineMappingResult{ +MachineMappingResult make_singleton_machine_mapping_result( + MachineMappingConfig const &config, + MachineMemoryConstraints const &memory_constraints, + CostMetric const &cost, + MachineView const &machine_view) { + MachineMappingResult result_without_memory_check = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/runtime, + /*cost=*/cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), machine_view}, }}, }, }; + + return machine_mapping_memory_check(memory_constraints, result_without_memory_check); +} + +MachineMappingResult + 
machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints, + MachineMappingResult const &result) { + FeasibleMachineMappingResult feasible_result = ({ + if (is_infeasible(result)) { + return infeasible_machine_mapping_result(); + } + require_feasible(result); + }); + + if (feasible_result.cost.memory > memory_constraints.memory_limit) { + return infeasible_machine_mapping_result(); + } else { + return result; + } } } // namespace FlexFlow From da857a5e1e2e888773772f09bcc7d003cd2d95d5 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 16 Oct 2024 00:39:56 -0400 Subject: [PATCH 02/16] fmt --- .../get_optimal_machine_mapping.h | 54 +++++++-------- .../machine_mapping/machine_mapping_result.h | 10 +-- .../compiler/cost_estimator/cost_metric.cc | 4 +- .../get_optimal_machine_mapping.cc | 68 +++++++++++-------- .../machine_mapping/machine_mapping_result.cc | 15 ++-- 5 files changed, 81 insertions(+), 70 deletions(-) diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index cd4896e260..e8b3771430 100644 --- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -14,25 +14,25 @@ namespace FlexFlow { -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - MachineMappingProblemTree const &problem_tree, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config); - -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - MMProblemTreeSeriesSplit const &series_split, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - 
MachineMemoryConstraints const &memory_constraints, - std::optional const - &parallel_split_transformation, - MachineMappingConfig const &config); +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config); + +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + std::optional const + &parallel_split_transformation, + MachineMappingConfig const &config); MachineMappingResult get_optimal_machine_mapping( MachineMappingCache &result_cache, @@ -43,14 +43,14 @@ MachineMappingResult get_optimal_machine_mapping( MachineMemoryConstraints const &memory_constraints, MachineMappingConfig const &config); -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config); +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h 
b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index 642d48ec02..c240d68f2b 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H +#include "compiler/machine_mapping/machine_mapping_config.dtg.h" #include "compiler/machine_mapping/machine_mapping_result.dtg.h" -#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h" -#include "compiler/machine_mapping/machine_mapping_config.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" namespace FlexFlow { @@ -39,9 +39,9 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); CostMetric const &cost, MachineView const &machine_view); -[[nodiscard]] MachineMappingResult - machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &result); +[[nodiscard]] MachineMappingResult machine_mapping_memory_check( + MachineMemoryConstraints const &memory_constraints, + MachineMappingResult const &result); } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc index 370afab406..dfaf0702c9 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc @@ -4,8 +4,8 @@ namespace FlexFlow { CostMetric zero_cost_metric() { return CostMetric{ - /*runtime=*/0, - /*memory=*/0, + /*runtime=*/0, + /*memory=*/0, }; } diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 
a1a1595d98..3321d53e98 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -24,14 +24,14 @@ namespace FlexFlow { -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - MachineMappingProblemTree const &problem_tree, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config) { +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config) { MachineMappingState state = MachineMappingState{ problem_tree, @@ -77,16 +77,16 @@ MachineMappingResult return result; } -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - MMProblemTreeSeriesSplit const &series_split, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - std::optional const - &parallel_split_transformation, - MachineMappingConfig const &config) { +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + std::optional const + &parallel_split_transformation, + MachineMappingConfig const &config) { auto get_boundary_machine_view_assignments = [&](std::unordered_set const &boundary_layers) @@ 
-222,8 +222,14 
@@ MachineMappingResult get_optimal_machine_mapping( auto evaluate_resource_split = [&](std::pair const &resource_split) { - MachineMappingResult left_result = get_optimal_machine_mapping( - result_cache, context, lhs, resource_split.first, left_constraints, memory_constraints, config); + MachineMappingResult left_result = + get_optimal_machine_mapping(result_cache, + context, + lhs, + resource_split.first, + left_constraints, + memory_constraints, + config); MachineMappingResult right_result = get_optimal_machine_mapping(result_cache, context, @@ -233,7 +239,8 @@ MachineMappingResult get_optimal_machine_mapping( memory_constraints, config); - return parallel_combine(config, memory_constraints, left_result, right_result); + return parallel_combine( + config, memory_constraints, left_result, right_result); }; std::unordered_set parallel_results = transform( @@ -243,14 +250,14 @@ MachineMappingResult get_optimal_machine_mapping( get_mapping_with_minimal_runtime(parallel_results)); } -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resource, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config) { +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints, + MachineMemoryConstraints const &memory_constraints, + MachineMappingConfig const &config) { std::unordered_set candidates = [&] { std::optional machine_view = require_only_root(constraints); @@ -266,7 +273,8 @@ MachineMappingResult map_unmapped_op_cost_estimate_key(leaf, machine_view); CostMetric cost = context.cost_estimator.estimate_cost(mapped); - return 
make_singleton_machine_mapping_result(config, memory_constraints, cost, machine_view); + return make_singleton_machine_mapping_result( + config, memory_constraints, cost, machine_view); }; std::unordered_set candidate_results = diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 18e5049022..fc9f747743 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -75,7 +75,8 @@ MachineMappingResult }; if (config.enable_memory_optimization) { - return machine_mapping_memory_check(memory_constraints, result_without_memory_check); + return machine_mapping_memory_check(memory_constraints, + result_without_memory_check); } else { return result_without_memory_check; } @@ -111,7 +112,8 @@ MachineMappingResult }; if (config.enable_memory_optimization) { - return machine_mapping_memory_check(memory_constraints, result_without_memory_check); + return machine_mapping_memory_check(memory_constraints, + result_without_memory_check); } else { return result_without_memory_check; } @@ -155,12 +157,13 @@ MachineMappingResult make_singleton_machine_mapping_result( }, }; - return machine_mapping_memory_check(memory_constraints, result_without_memory_check); + return machine_mapping_memory_check(memory_constraints, + result_without_memory_check); } -MachineMappingResult - machine_mapping_memory_check(MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &result) { +MachineMappingResult machine_mapping_memory_check( + MachineMemoryConstraints const &memory_constraints, + MachineMappingResult const &result) { FeasibleMachineMappingResult feasible_result = ({ if (is_infeasible(result)) { return infeasible_machine_mapping_result(); From ef8c5c2f2d6eafec7fc9ec95da417879368457cf Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 16 Oct 2024 18:18:14 -0400 Subject: [PATCH 
03/16] pass existing tests --- .../cost_estimator_for_test.cc | 18 +-- .../machine_mapping/cost_estimator_for_test.h | 18 +-- .../get_optimal_machine_mapping.cc | 82 +++++++++--- .../machine_mapping/machine_mapping_result.cc | 123 ++++++++++++++---- 4 files changed, 179 insertions(+), 62 deletions(-) diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index 9ee596af3e..7607132832 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -5,23 +5,25 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( - std::function const &get_operator_cost, - std::function const + std::function const + &get_operator_cost, + std::function const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} -float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +CostMetric TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); } -float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { +CostMetric TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, - std::function const + std::function const + &get_operator_cost, + std::function const &get_communication_cost) { return CostEstimator::create(get_operator_cost, @@ -29,8 +31,8 @@ CostEstimator make_fake_cost_estimator( } CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map) { + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, 
[comm_cost_map](TensorSetMovement const &m) { diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 7c1d06207a..1b2cc9e91e 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -11,27 +11,27 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { - std::function get_operator_cost; - std::function get_communication_cost; + std::function get_operator_cost; + std::function get_communication_cost; TestCostEstimator() = delete; TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, decltype(get_communication_cost) const &get_communication_cost); - float estimate_cost(OpCostEstimateKey const &) const override; - - float estimate_cost(TensorSetMovement const &) const override; + CostMetric estimate_cost(OpCostEstimateKey const &) const override; + CostMetric estimate_cost(TensorSetMovement const &) const override; }; CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, - std::function const + std::function const + &get_operator_cost, + std::function const &get_communication_cost); CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map); + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map); } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 0a874948e4..440e8506c4 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -118,22 +118,22 @@ TEST_SUITE(FF_TEST_SUITE) { }}; CostEstimator cost_estimator = 
make_fake_cost_estimator( - std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, - {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, - {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, - {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, + std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 1}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 2}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 3}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 3}}, }}, - std::unordered_map{{ - {TensorSetMovement{{}}, 0.0}, + std::unordered_map{{ + {TensorSetMovement{{}}, CostMetric{0.0, 0}}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - 0.1}, + CostMetric{0.1, 0}}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - 0.2}, + CostMetric{0.2, 0}}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - 0.3}, + CostMetric{0.3, 0}}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - 0.4}, + CostMetric{0.4, 0}}, }}); MachineMappingContext context = MachineMappingContext{ @@ -150,11 +150,25 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResult result = get_optimal_machine_mapping( - cache, context, problem_tree, full_machine_spec, constraints); + MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ + /*memory_limit=*/10, + }; + + MachineMappingConfig config = MachineMappingConfig{ + /*enable_memory_optimization=*/false, + }; + + MachineMappingResult result = + get_optimal_machine_mapping(cache, + context, + problem_tree, + full_machine_spec, + constraints, + memory_constraints, + config); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0, + /*cost=*/CostMetric{1.0, 1}, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, @@ -173,11 +187,25 @@ 
TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResult result = get_optimal_machine_mapping( - cache, context, problem_tree, full_machine_spec, constraints); + MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ + /*memory_limit=*/10, + }; + + MachineMappingConfig config = MachineMappingConfig{ + /*enable_memory_optimization=*/false, + }; + + MachineMappingResult result = + get_optimal_machine_mapping(cache, + context, + problem_tree, + full_machine_spec, + constraints, + memory_constraints, + config); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0 + 2.0 + 0.1, + /*cost=*/CostMetric{1.0 + 2.0 + 0.1, 1 + 2 + 0}, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -207,11 +235,25 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResult result = get_optimal_machine_mapping( - cache, context, problem_tree, full_machine_spec, constraints); + MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ + /*memory_limit=*/10, + }; + + MachineMappingConfig config = MachineMappingConfig{ + /*enable_memory_optimization=*/false, + }; + + MachineMappingResult result = + get_optimal_machine_mapping(cache, + context, + problem_tree, + full_machine_spec, + constraints, + memory_constraints, + config); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.5, + /*cost=*/CostMetric{2.5, 3}, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 254d6b2784..7665f929f2 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ 
b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -6,10 +6,20 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("series_combine") { + MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ + /*memory_limit=*/10, + }; + MachineMappingConfig config = MachineMappingConfig{ + /*enable_memory_optimization=*/false, + }; + MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); - float pre_cost = 2.0; + CostMetric pre_cost = CostMetric{ + /*runtime=*/2.0, + /*memory=*/2, + }; MachineMappingResult pre = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/pre_cost, @@ -31,7 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - float post_cost = 4.0; + CostMetric post_cost = CostMetric{ + /*runtime=*/4.0, + /*memory=*/1, + }; MachineMappingResult post = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/post_cost, @@ -47,19 +60,32 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - float comm_cost = 3.0; + CostMetric comm_cost = CostMetric{ + /*runtime=*/3.0, + /*memory=*/0, + }; SUBCASE("pre is infeasbile") { - MachineMappingResult result = series_combine( - comm_cost, infeasible, post, ParallelSplitTransformation::LthenR); + MachineMappingResult result = + series_combine(config, + memory_constraints, + comm_cost, + infeasible, + post, + ParallelSplitTransformation::LthenR); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("post is infeasbile") { - MachineMappingResult result = series_combine( - comm_cost, pre, infeasible, ParallelSplitTransformation::LthenR); + MachineMappingResult result = + series_combine(config, + memory_constraints, + comm_cost, + pre, + infeasible, + ParallelSplitTransformation::LthenR); MachineMappingResult correct = infeasible; CHECK(result == correct); @@ -67,7 +93,9 @@ 
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("both are infeasible") { MachineMappingResult result = - series_combine(comm_cost, + series_combine(config, + memory_constraints, + comm_cost, infeasible, infeasible, ParallelSplitTransformation::LthenR); @@ -77,9 +105,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("both are feasible") { + CostMetric no_parallel_split_transform_cost = CostMetric{ + /*runtime=*/pre_cost.runtime + post_cost.runtime + comm_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory, + }; MachineMappingResult no_parallel_split_transform = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/pre_cost + comm_cost + post_cost, + /*cost=*/no_parallel_split_transform_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -107,27 +139,42 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("parallel_split_transformation = std::nullopt") { - MachineMappingResult result = - series_combine(comm_cost, pre, post, std::nullopt); + MachineMappingResult result = series_combine( + config, memory_constraints, comm_cost, pre, post, std::nullopt); MachineMappingResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = LthenR") { - MachineMappingResult result = series_combine( - comm_cost, pre, post, ParallelSplitTransformation::LthenR); + MachineMappingResult result = + series_combine(config, + memory_constraints, + comm_cost, + pre, + post, + ParallelSplitTransformation::LthenR); MachineMappingResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = RthenL") { - MachineMappingResult result = series_combine( - comm_cost, pre, post, ParallelSplitTransformation::RthenL); + MachineMappingResult result = + series_combine(config, + memory_constraints, + comm_cost, + pre, + post, + ParallelSplitTransformation::RthenL); + CostMetric correct_cost = CostMetric{ + /*runtime=*/pre_cost.runtime + post_cost.runtime + + 
comm_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory, + }; MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/pre_cost + comm_cost + post_cost, + /*runtime=*/correct_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -160,12 +207,29 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("parallel_combine") { + MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ + /*memory_limit=*/10, + }; + MachineMappingConfig config = MachineMappingConfig{ + /*enable_memory_optimization=*/false, + }; + MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); + CostMetric lhs_cost = CostMetric{ + /*runtime=*/2.0, + /*memory=*/2, + }; + + CostMetric rhs_cost = CostMetric{ + /*runtime=*/4.0, + /*memory=*/1, + }; + MachineMappingResult lhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*cost=*/lhs_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -186,7 +250,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult rhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*cost=*/rhs_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -200,31 +264,40 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); SUBCASE("lhs is infeasbile") { - MachineMappingResult result = parallel_combine(infeasible, rhs); + MachineMappingResult result = + parallel_combine(config, memory_constraints, infeasible, rhs); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("rhs is infeasbile") { - MachineMappingResult result = parallel_combine(lhs, infeasible); + MachineMappingResult result = + parallel_combine(config, memory_constraints, lhs, infeasible); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("both 
are infeasible") { - MachineMappingResult result = parallel_combine(infeasible, infeasible); + MachineMappingResult result = + parallel_combine(config, memory_constraints, infeasible, infeasible); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("both are feasible") { - MachineMappingResult result = parallel_combine(lhs, rhs); + MachineMappingResult result = + parallel_combine(config, memory_constraints, lhs, rhs); + + CostMetric correct_cost = CostMetric{ + /*runtime=*/4.0, + /*memory=*/2, + }; MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*cost=*/correct_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -261,7 +334,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult faster = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*cost=*/CostMetric{2.0, 2}, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -282,7 +355,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult slower = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*cost=*/CostMetric{4.0, 1}, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { From 982f1f5a711c0d7e6708f2e574257ef02645fea6 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 30 Oct 2024 20:15:31 -0400 Subject: [PATCH 04/16] initialize memory algorithm --- .../compiler/cost_estimator/cost_estimator.h | 11 +- ...easible_machine_mapping_result.struct.toml | 5 +- .../get_optimal_machine_mapping.h | 55 ++-- .../machine_mapping/machine_mapping_result.h | 22 +- .../machine_mapping_state.struct.toml | 10 - .../get_optimal_machine_mapping_with_memory.h | 49 ++++ .../machine_mapping_cache_with_memory.h | 19 ++ ...hine_mapping_cache_with_memory.struct.toml | 22 ++ .../machine_mapping_result_with_memory.h | 40 +++ ...ine_mapping_result_with_memory.struct.toml | 20 ++ .../machine_memory_constraints.struct.toml | 0 .../single_machine_mapping.struct.toml | 20 ++ 
...lel_layer_guid_oblivious_machine_mapping.h | 1 + .../compiler/cost_estimator/cost_estimator.cc | 9 +- .../get_optimal_machine_mapping.cc | 100 +++---- .../machine_mapping/machine_mapping.cc | 1 - .../machine_mapping/machine_mapping_result.cc | 67 +---- ...get_optimal_machine_mapping_with_memory.cc | 264 ++++++++++++++++++ .../machine_mapping_cache_with_memory.cc | 32 +++ .../machine_mapping_result_with_memory.cc | 134 +++++++++ .../test/src/allowed_machine_views.cc | 104 +++++++ .../cost_estimator_for_test.cc | 18 +- .../machine_mapping/cost_estimator_for_test.h | 18 +- .../get_optimal_machine_mapping.cc | 113 ++++---- .../get_tensor_set_movement_across_split.cc | 63 ++++- .../machine_mapping/machine_mapping.cc | 84 +++++- .../machine_mapping/machine_mapping_result.cc | 216 +++++++------- 27 files changed, 1102 insertions(+), 395 deletions(-) create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml rename lib/compiler/include/compiler/machine_mapping/{machine_memory_constraints => memory_optimization}/machine_memory_constraints.struct.toml (100%) create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml create mode 100644 lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc create mode 100644 
lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc create mode 100644 lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc create mode 100644 lib/compiler/test/src/allowed_machine_views.cc diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index 55311af83b..828200cc6a 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -12,8 +12,10 @@ namespace FlexFlow { struct ICostEstimator { - virtual CostMetric estimate_cost(OpCostEstimateKey const &) const = 0; - virtual CostMetric estimate_cost(TensorSetMovement const &) const = 0; + virtual float estimate_cost(OpCostEstimateKey const &) const = 0; + virtual float estimate_cost(TensorSetMovement const &) const = 0; + virtual CostMetric + estimate_cost_with_memory(OpCostEstimateKey const &) const = 0; ICostEstimator() = default; ICostEstimator(ICostEstimator const &) = delete; @@ -24,8 +26,9 @@ struct ICostEstimator { CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { - CostMetric estimate_cost(OpCostEstimateKey const &k) const; - CostMetric estimate_cost(TensorSetMovement const &m) const; + float estimate_cost(OpCostEstimateKey const &k) const; + float estimate_cost(TensorSetMovement const &m) const; + CostMetric estimate_cost_with_memory(OpCostEstimateKey const &k) const; template static typename std::enable_if::value, diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml index 07dc30d2fc..e71cfc540f 100644 --- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml +++ 
b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml @@ -8,12 +8,11 @@ features = [ includes = [ "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", - "compiler/cost_estimator/cost_metric.dtg.h", ] [[fields]] -name = "cost" -type = "::FlexFlow::CostMetric" +name = "runtime" +type = "float" [[fields]] name = "machine_mapping" diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index e8b3771430..f69e6ab91b 100644 --- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -8,49 +8,40 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" -#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "pcg/machine_specification.dtg.h" namespace FlexFlow { -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &context, - MachineMappingProblemTree const &problem_tree, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config); - -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &context, - MMProblemTreeSeriesSplit const &series_split, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - 
std::optional const - ¶llel_split_transformation, - MachineMappingConfig const &config); +MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional const + ¶llel_split_transformation); MachineMappingResult get_optimal_machine_mapping( MachineMappingCache &result_cache, MachineMappingContext const &context, MMProblemTreeParallelSplit const ¶llel_split, MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config); - -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config); + MachineMappingConstraints const &constraints); + +MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index c240d68f2b..b21fea5f24 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ 
b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -1,9 +1,7 @@ #ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_RESULT_H -#include "compiler/machine_mapping/machine_mapping_config.dtg.h" #include "compiler/machine_mapping/machine_mapping_result.dtg.h" -#include "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" namespace FlexFlow { @@ -16,32 +14,22 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); std::unordered_set const &); [[nodiscard]] MachineMappingResult - series_combine(MachineMappingConfig const &config, - MachineMemoryConstraints const &memory_constraints, - CostMetric const &comm_cost, + series_combine(float comm_cost, MachineMappingResult const &pre_result, MachineMappingResult const &post_result, std::optional const ¶llel_split_transformation); [[nodiscard]] MachineMappingResult - parallel_combine(MachineMappingConfig const &config, - MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &lhs_result, + parallel_combine(MachineMappingResult const &lhs_result, MachineMappingResult const &rhs_result); [[nodiscard]] MachineMappingResult minimize_runtime(MachineMappingResult const &m1, MachineMappingResult const &m2); -[[nodiscard]] MachineMappingResult make_singleton_machine_mapping_result( - MachineMappingConfig const &config, - MachineMemoryConstraints const &memory_constraints, - CostMetric const &cost, - MachineView const &machine_view); - -[[nodiscard]] MachineMappingResult machine_mapping_memory_check( - MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &result); +[[nodiscard]] MachineMappingResult + make_singleton_machine_mapping_result(float runtime, + MachineView const &machine_view); } // namespace FlexFlow diff --git 
a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml index b4a6147b5a..1346f6ebe7 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_state.struct.toml @@ -9,9 +9,7 @@ features = [ includes = [ "pcg/machine_specification.dtg.h", "compiler/machine_mapping/machine_mapping_constraints.dtg.h", - "compiler/machine_mapping/machine_mapping_config.dtg.h", "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", - "compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.dtg.h", ] [[fields]] @@ -25,11 +23,3 @@ type = "::FlexFlow::MachineSpecification" [[fields]] name = "constraints" type = "::FlexFlow::MachineMappingConstraints" - -[[fields]] -name = "memory_constraints" -type = "::FlexFlow::MachineMemoryConstraints" - -[[fields]] -name = "config" -type = "::FlexFlow::MachineMappingConfig" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h new file mode 100644 index 0000000000..f8a2e4d75a --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H +#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H + +#include "compiler/machine_mapping/machine_mapping_cache.dtg.h" +#include "compiler/machine_mapping/machine_mapping_config.dtg.h" +#include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" +#include "compiler/machine_mapping/machine_mapping_context.dtg.h" +#include 
"compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include "pcg/machine_specification.dtg.h" + +namespace FlexFlow { + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional const + ¶llel_split_transformation); + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MMProblemTreeParallelSplit const ¶llel_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h 
b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h new file mode 100644 index 0000000000..2c45c04d3d --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H + +#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h" + +namespace FlexFlow { + +MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory(); +std::optional + machine_mapping_cache_with_memory_load( + MachineMappingCacheWithMemory const &, MachineMappingState const &); +void machine_mapping_cache_with_memory_save( + MachineMappingCacheWithMemory &, + MachineMappingState const &, + MachineMappingResultWithMemory const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml new file mode 100644 index 0000000000..e7afa26bb3 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "MachineMappingCacheWithMemory" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "", + "compiler/machine_mapping/machine_mapping_state.dtg.h", + "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h", +] + +src_includes = [ + "utils/fmt/unordered_map.h", + "utils/hash/unordered_map.h", +] + +[[fields]] +name = "raw_map" +type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingResultWithMemory>" 
diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h new file mode 100644 index 0000000000..6203b99e55 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h @@ -0,0 +1,40 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H +#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H + +#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" + +namespace FlexFlow { + +[[nodiscard]] MachineMappingResultWithMemory + empty_machine_mapping_result_with_memory(); +[[nodiscard]] bool is_empty(MachineMappingResultWithMemory const &); + +[[nodiscard]] MachineMappingResultWithMemory get_mapping_with_minimal_runtime( + std::unordered_set const &); + +[[nodiscard]] MachineMappingResultWithMemory + remove_non_dominating_machine_mapping_result( + MachineMappingResultWithMemory const &); + +[[nodiscard]] MachineMappingResultWithMemory + series_combine(float comm_cost, + MachineMappingResultWithMemory const &pre_result, + MachineMappingResultWithMemory const &post_result, + std::optional const + ¶llel_split_transformation); +[[nodiscard]] MachineMappingResultWithMemory + parallel_combine(MachineMappingResultWithMemory const &lhs_result, + MachineMappingResultWithMemory const &rhs_result); + +[[nodiscard]] MachineMappingResultWithMemory + minimize_runtime(MachineMappingResultWithMemory const &m1, + MachineMappingResultWithMemory const &m2); + +[[nodiscard]] MachineMappingResultWithMemory + make_singleton_machine_mapping_result_with_memory( + CostMetric cost, MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml new file mode 100644 index 0000000000..f3b2895b83 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "MachineMappingResultWithMemory" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/machine_mapping/memory_optimization/single_machine_mapping.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_set.h", + "utils/fmt/unordered_set.h", +] + +[[fields]] +name = "machine_mappings" +type = "std::unordered_set<::FlexFlow::SingleMachineMapping>" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml similarity index 100% rename from lib/compiler/include/compiler/machine_mapping/machine_memory_constraints/machine_memory_constraints.struct.toml rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_memory_constraints.struct.toml diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml new file mode 100644 index 0000000000..05a23e905a --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml @@ -0,0 +1,20 @@ +namespace = "FlexFlow" +name = "SingleMachineMapping" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", + "compiler/cost_estimator/cost_metric.dtg.h", +] + +[[fields]] +name = 
"cost" +type = "::FlexFlow::CostMetric" + +[[fields]] +name = "machine_mapping" +type = "::FlexFlow::ParallelLayerGuidObliviousMachineMapping" diff --git a/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h index accd96af4c..cb3af9c689 100644 --- a/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_PARALLEL_LAYER_GUID_OBLIVIOUS_MACHINE_MAPPING_H #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h" +#include namespace FlexFlow { diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 10e999dc1a..40a0f4e2a4 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -5,12 +5,17 @@ namespace FlexFlow { CostEstimator::CostEstimator(std::shared_ptr implementation_ptr) : implementation_ptr(implementation_ptr) {} -CostMetric CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->implementation_ptr->estimate_cost(k); } -CostMetric CostEstimator::estimate_cost(TensorSetMovement const &m) const { +float CostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->implementation_ptr->estimate_cost(m); } +CostMetric + CostEstimator::estimate_cost_with_memory(OpCostEstimateKey const &k) const { + return this->implementation_ptr->estimate_cost_with_memory(k); +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc 
b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 3321d53e98..10abd7ff90 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -24,21 +24,17 @@ namespace FlexFlow { -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &context, - MachineMappingProblemTree const &problem_tree, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config) { +MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints) { MachineMappingState state = MachineMappingState{ problem_tree, resources, constraints, - memory_constraints, - config, }; { @@ -58,18 +54,14 @@ MachineMappingResult get_optimal_machine_mapping( series_split, resources, constraints, - memory_constraints, - /*parallel_split_transformation=*/std::nullopt, - config); + /*parallel_split_transformation=*/std::nullopt); }, [&](auto const &decomp_tree_node) { return get_optimal_machine_mapping(result_cache, context, decomp_tree_node, resources, - constraints, - memory_constraints, - config); + constraints); }, }); @@ -77,16 +69,14 @@ MachineMappingResult get_optimal_machine_mapping( return result; } -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &context, - MMProblemTreeSeriesSplit const &series_split, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - std::optional const - ¶llel_split_transformation, - MachineMappingConfig const &config) { 
+MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional const + ¶llel_split_transformation) { auto get_boundary_machine_view_assignments = [&](std::unordered_set const &boundary_layers) @@ -120,9 +110,7 @@ MachineMappingResult get_optimal_machine_mapping( context, series_split.get_left_child(), resources, - pre_candidate, - memory_constraints, - config); + pre_candidate); return pre_result; }; @@ -138,9 +126,7 @@ MachineMappingResult get_optimal_machine_mapping( context, series_split.get_right_child(), resources, - post_candidate, - memory_constraints, - config); + post_candidate); return post_result; }; @@ -169,13 +155,11 @@ MachineMappingResult get_optimal_machine_mapping( tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - CostMetric cost_across_split = + float cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, - series_combine(config, - memory_constraints, - cost_across_split, + series_combine(cost_across_split, pre_result, post_result, parallel_split_transformation)); @@ -190,9 +174,7 @@ MachineMappingResult get_optimal_machine_mapping( MachineMappingContext const &context, MMProblemTreeParallelSplit const ¶llel_split, MachineSpecification const &resources, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config) { + MachineMappingConstraints const &constraints) { MachineMappingProblemTree lhs = parallel_split.get_left_child(); MachineMappingProblemTree rhs = parallel_split.get_right_child(); @@ -209,9 +191,7 @@ MachineMappingResult get_optimal_machine_mapping( series_split, resources, constraints, - memory_constraints, - 
ParallelSplitTransformation::LthenR, - config); + ParallelSplitTransformation::LthenR); }(); MachineMappingConstraints left_constraints = @@ -222,25 +202,16 @@ MachineMappingResult get_optimal_machine_mapping( auto evaluate_resource_split = [&](std::pair const &resource_split) { - MachineMappingResult left_result = - get_optimal_machine_mapping(result_cache, - context, - lhs, - resource_split.first, - left_constraints, - memory_constraints, - config); + MachineMappingResult left_result = get_optimal_machine_mapping( + result_cache, context, lhs, resource_split.first, left_constraints); MachineMappingResult right_result = get_optimal_machine_mapping(result_cache, context, rhs, resource_split.second, - right_constraints, - memory_constraints, - config); + right_constraints); - return parallel_combine( - config, memory_constraints, left_result, right_result); + return parallel_combine(left_result, right_result); }; std::unordered_set parallel_results = transform( @@ -250,14 +221,12 @@ MachineMappingResult get_optimal_machine_mapping( get_mapping_with_minimal_runtime(parallel_results)); } -MachineMappingResult get_optimal_machine_mapping( - MachineMappingCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resource, - MachineMappingConstraints const &constraints, - MachineMemoryConstraints const &memory_constraints, - MachineMappingConfig const &config) { +MachineMappingResult + get_optimal_machine_mapping(MachineMappingCache &result_cache, + MachineMappingContext const &context, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints) { std::unordered_set candidates = [&] { std::optional machine_view = require_only_root(constraints); @@ -271,10 +240,9 @@ MachineMappingResult get_optimal_machine_mapping( auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = 
map_unmapped_op_cost_estimate_key(leaf, machine_view); - CostMetric cost = context.cost_estimator.estimate_cost(mapped); + float cost = context.cost_estimator.estimate_cost(mapped); - return make_singleton_machine_mapping_result( - config, memory_constraints, cost, machine_view); + return make_singleton_machine_mapping_result(cost, machine_view); }; std::unordered_set candidate_results = diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index 6f350d8773..57e82684e9 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,5 +1,4 @@ #include "compiler/machine_mapping/machine_mapping.h" -#include "utils/containers.h" #include "utils/containers/are_disjoint.h" #include "utils/containers/keys.h" #include "utils/containers/merge_maps.h" diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index fc9f747743..3409f7f871 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -1,5 +1,4 @@ #include "compiler/machine_mapping/machine_mapping_result.h" -#include "compiler/cost_estimator/cost_metric.h" #include "compiler/machine_mapping/machine_mapping.h" #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h" #include "utils/containers/map_keys.h" @@ -33,9 +32,7 @@ FeasibleMachineMappingResult } MachineMappingResult - series_combine(MachineMappingConfig const &config, - MachineMemoryConstraints const &memory_constraints, - CostMetric const &comm_cost, + series_combine(float comm_cost, MachineMappingResult const &maybe_pre_result, MachineMappingResult const &maybe_post_result, std::optional const @@ -66,26 +63,16 @@ MachineMappingResult } }(); - MachineMappingResult 
result_without_memory_check = MachineMappingResult{ + return MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/combine_cost_metrics_inter_device( - {pre_result.cost, comm_cost, post_result.cost}), + /*runtime=*/pre_result.runtime + comm_cost + post_result.runtime, /*machine_mapping=*/mapping, }, }; - - if (config.enable_memory_optimization) { - return machine_mapping_memory_check(memory_constraints, - result_without_memory_check); - } else { - return result_without_memory_check; - } } MachineMappingResult - parallel_combine(MachineMappingConfig const &config, - MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &maybe_lhs_result, + parallel_combine(MachineMappingResult const &maybe_lhs_result, MachineMappingResult const &maybe_rhs_result) { FeasibleMachineMappingResult lhs_result = ({ if (is_infeasible(maybe_lhs_result)) { @@ -101,22 +88,14 @@ MachineMappingResult require_feasible(maybe_rhs_result); }); - MachineMappingResult result_without_memory_check = MachineMappingResult{ + return MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/combine_cost_metrics_intra_device_parallel(lhs_result.cost, - rhs_result.cost), + /*runtime=*/std::max(lhs_result.runtime, rhs_result.runtime), /*machine_mapping=*/ binary_combine_mappings(/*lhs=*/lhs_result.machine_mapping, /*rhs=*/rhs_result.machine_mapping), }, }; - - if (config.enable_memory_optimization) { - return machine_mapping_memory_check(memory_constraints, - result_without_memory_check); - } else { - return result_without_memory_check; - } } MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, @@ -135,47 +114,25 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, require_feasible(maybe_m2); }); - if (m2.cost.runtime < m1.cost.runtime) { + if (m2.runtime < m1.runtime) { return maybe_m2; } else { return maybe_m1; } } -MachineMappingResult make_singleton_machine_mapping_result( - MachineMappingConfig const &config, - 
MachineMemoryConstraints const &memory_constraints, - CostMetric const &cost, - MachineView const &machine_view) { - MachineMappingResult result_without_memory_check = MachineMappingResult{ +MachineMappingResult + make_singleton_machine_mapping_result(float runtime, + MachineView const &machine_view) { + return MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/cost, + /*runtime=*/runtime, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), machine_view}, }}, }, }; - - return machine_mapping_memory_check(memory_constraints, - result_without_memory_check); -} - -MachineMappingResult machine_mapping_memory_check( - MachineMemoryConstraints const &memory_constraints, - MachineMappingResult const &result) { - FeasibleMachineMappingResult feasible_result = ({ - if (is_infeasible(result)) { - return infeasible_machine_mapping_result(); - } - require_feasible(result); - }); - - if (feasible_result.cost.memory > memory_constraints.memory_limit) { - return infeasible_machine_mapping_result(); - } else { - return result; - } } } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc new file mode 100644 index 0000000000..676f3a6c8e --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -0,0 +1,264 @@ +#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "compiler/machine_mapping/get_machine_resource_splits.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include 
"compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "compiler/machine_mapping/transitive_reduced_pcg.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/contains.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/get_all_assignments.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/exception.h" +#include "utils/overload.h" + +namespace FlexFlow { + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MachineMappingProblemTree const &problem_tree, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints) { + + MachineMappingState state = MachineMappingState{ + problem_tree, + resources, + constraints, + }; + + { + std::optional cached_result = + machine_mapping_cache_with_memory_load(result_cache, state); + if (cached_result) { + return cached_result.value(); + } + } + + MachineMappingResultWithMemory result = + problem_tree.visit(overload{ + [&](MMProblemTreeSeriesSplit const &series_split) { + return get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split, + resources, + constraints, + /*parallel_split_transformation=*/std::nullopt); + }, + [&](auto const &decomp_tree_node) { + return get_optimal_machine_mapping_with_memory(result_cache, + context, + 
decomp_tree_node, + resources, + constraints); + }, + }); + + machine_mapping_cache_with_memory_save(result_cache, state, result); + return result; +} + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MMProblemTreeSeriesSplit const &series_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints, + std::optional const + ¶llel_split_transformation) { + + auto get_boundary_machine_view_assignments = + [&](std::unordered_set const &boundary_layers) + -> std::unordered_set { + std::unordered_map> + allowed = generate_map( + boundary_layers, + [&](BinaryTreePath const &l) -> std::unordered_set { + UnmappedOpCostEstimateKey leaf = + mm_problem_tree_get_subtree_at_path( + MachineMappingProblemTree{series_split}, l) + .value() + .get(); + return context.allowed_machine_views(leaf, resources); + }); + return transform( + get_all_assignments(allowed), + [](std::unordered_map const &m) { + return ParallelLayerGuidObliviousMachineMapping{m}; + }); + }; + + auto eval_pre_boundary_mapping = + [&](ParallelLayerGuidObliviousMachineMapping const + &assigned_pre_machine_views) { + MachineMappingConstraints pre_candidate = with_additional_constraints( + restrict_to_left_child(constraints), assigned_pre_machine_views); + + MachineMappingResultWithMemory pre_result = + get_optimal_machine_mapping_with_memory( + result_cache, + context, + series_split.get_left_child(), + resources, + pre_candidate); + + return pre_result; + }; + + auto eval_post_boundary_mapping = + [&](ParallelLayerGuidObliviousMachineMapping const + &assigned_post_machine_views) { + MachineMappingConstraints post_candidate = with_additional_constraints( + restrict_to_right_child(constraints), assigned_post_machine_views); + + MachineMappingResultWithMemory post_result = + get_optimal_machine_mapping_with_memory( + result_cache, + context, + 
series_split.get_right_child(), + resources, + post_candidate); + + return post_result; + }; + + MachineMappingResultWithMemory result = + empty_machine_mapping_result_with_memory(); + AbstractedTensorSetMovement tensor_movement = + series_split.tensor_set_movement; + + for (ParallelLayerGuidObliviousMachineMapping const + &assigned_pre_machine_views : + get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) { + + MachineMappingResultWithMemory pre_result = + eval_pre_boundary_mapping(assigned_pre_machine_views); + + for (ParallelLayerGuidObliviousMachineMapping const + &assigned_post_machine_views : + get_boundary_machine_view_assignments( + get_dst_layers(tensor_movement))) { + + MachineMappingResultWithMemory post_result = + eval_post_boundary_mapping(assigned_post_machine_views); + + TensorSetMovement comm_across_split = + concretize_abstracted_tensor_set_movement( + tensor_movement, + /*pre_mapping=*/assigned_pre_machine_views, + /*post_mapping=*/assigned_post_machine_views); + float cost_across_split = + context.cost_estimator.estimate_cost(comm_across_split); + + result = minimize_runtime(result, + series_combine(cost_across_split, + pre_result, + post_result, + parallel_split_transformation)); + } + } + + return result; +} + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + MMProblemTreeParallelSplit const ¶llel_split, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints) { + + MachineMappingProblemTree lhs = parallel_split.get_left_child(); + MachineMappingProblemTree rhs = parallel_split.get_right_child(); + + MachineMappingResultWithMemory series_result = [&] { + MMProblemTreeSeriesSplit series_split = MMProblemTreeSeriesSplit{ + /*tensor_set_movement=*/empty_abstracted_tensor_set_movement(), + /*left_child=*/lhs, + /*right_child=*/rhs, + }; + + return get_optimal_machine_mapping_with_memory( 
+ result_cache, + context, + series_split, + resources, + constraints, + ParallelSplitTransformation::LthenR); + }(); + + MachineMappingConstraints left_constraints = + restrict_to_left_child(constraints); + MachineMappingConstraints right_constraints = + restrict_to_right_child(constraints); + + auto evaluate_resource_split = + [&](std::pair const + &resource_split) { + MachineMappingResultWithMemory left_result = + get_optimal_machine_mapping_with_memory(result_cache, + context, + lhs, + resource_split.first, + left_constraints); + MachineMappingResultWithMemory right_result = + get_optimal_machine_mapping_with_memory(result_cache, + context, + rhs, + resource_split.second, + right_constraints); + + return parallel_combine(left_result, right_result); + }; + + std::unordered_set parallel_results = + transform(get_machine_resource_splits(resources), + evaluate_resource_split); + + return minimize_runtime(series_result, + get_mapping_with_minimal_runtime(parallel_results)); +} + +MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( + MachineMappingCacheWithMemory &result_cache, + MachineMappingContext const &context, + UnmappedOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints) { + + std::unordered_set candidates = [&] { + std::optional machine_view = require_only_root(constraints); + if (machine_view.has_value()) { + return std::unordered_set{machine_view.value()}; + } else { + return context.allowed_machine_views(leaf, resource); + } + }(); + + auto get_mapping_result = [&](MachineView const &machine_view) { + OpCostEstimateKey mapped = + map_unmapped_op_cost_estimate_key(leaf, machine_view); + CostMetric cost = context.cost_estimator.estimate_cost_with_memory(mapped); + + return make_singleton_machine_mapping_result_with_memory(cost, + machine_view); + }; + + std::unordered_set candidate_results = + transform(candidates, get_mapping_result); + + return 
get_mapping_with_minimal_runtime(candidate_results); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc new file mode 100644 index 0000000000..e74612250e --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc @@ -0,0 +1,32 @@ +#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/try_at.h" + +namespace FlexFlow { + +MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory() { + return MachineMappingCacheWithMemory{{}}; +} + +std::optional + machine_mapping_cache_with_memory_load( + MachineMappingCacheWithMemory const &cache, + MachineMappingState const &k) { + return try_at(cache.raw_map, k); +} + +void machine_mapping_cache_with_memory_save( + MachineMappingCacheWithMemory &cache, + MachineMappingState const &k, + MachineMappingResultWithMemory const &v) { + if (contains_key(cache.raw_map, k)) { + throw mk_runtime_error(fmt::format( + "machine_mapping_cache_with_memory_save expected key to not already " + "exist, but received existing key {}", + k)); + } + + cache.raw_map.emplace(k, v); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc new file mode 100644 index 0000000000..1c4f8e1142 --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -0,0 +1,134 @@ +#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h" 
+#include "utils/containers/set_union.h"
+#include "utils/full_binary_tree/binary_tree_path.h"
+#include <algorithm>
+#include <utility>
+
+namespace FlexFlow {
+
+/**
+ * @brief Returns a result that contains no machine mappings.
+ */
+MachineMappingResultWithMemory empty_machine_mapping_result_with_memory() {
+  return MachineMappingResultWithMemory{
+      {},
+  };
+}
+
+/**
+ * @brief Merges all candidate results into one pruned result.
+ *
+ * Starts from the empty result and folds every candidate in with
+ * minimize_runtime, i.e. the mappings of all candidates are unioned and
+ * dominated mappings are dropped.
+ */
+MachineMappingResultWithMemory get_mapping_with_minimal_runtime(
+    std::unordered_set<MachineMappingResultWithMemory> const &candidates) {
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+
+  for (MachineMappingResultWithMemory const &candidate : candidates) {
+    result = minimize_runtime(result, candidate);
+  }
+
+  return result;
+}
+
+/**
+ * @brief Drops every mapping that is strictly dominated by another mapping
+ * of the same result.
+ *
+ * A mapping is strictly dominated when some other mapping is no worse in
+ * both runtime and memory and strictly better in at least one of them.
+ * Mappings whose costs are exactly equal do not dominate each other, so all
+ * of them are kept. (Treating equal-cost mappings as mutually dominated
+ * would remove both and could leave the result empty.)
+ *
+ * @param result The set of candidate mappings to prune.
+ * @return A result holding only the mappings that are not strictly dominated.
+ */
+MachineMappingResultWithMemory remove_non_dominating_machine_mapping_result(
+    MachineMappingResultWithMemory const &result) {
+  std::unordered_set<SingleMachineMapping> pareto_optimal_mappings;
+  for (SingleMachineMapping const &mapping : result.machine_mappings) {
+    bool is_dominated = false;
+    for (SingleMachineMapping const &other_mapping : result.machine_mappings) {
+      bool const other_is_no_worse =
+          other_mapping.cost.runtime <= mapping.cost.runtime &&
+          other_mapping.cost.memory <= mapping.cost.memory;
+      bool const other_is_strictly_better =
+          other_mapping.cost.runtime < mapping.cost.runtime ||
+          other_mapping.cost.memory < mapping.cost.memory;
+      if (other_is_no_worse && other_is_strictly_better) {
+        is_dominated = true;
+        break;
+      }
+    }
+    if (!is_dominated) {
+      pareto_optimal_mappings.insert(mapping);
+    }
+  }
+  return MachineMappingResultWithMemory{std::move(pareto_optimal_mappings)};
+}
+
+/**
+ * @brief Combines the results of the two children of a series split.
+ *
+ * Every mapping of pre_result is paired with every mapping of post_result.
+ * The combined runtime is the sum of both runtimes plus comm_cost, and the
+ * combined memory is the sum of both memories. When
+ * parallel_split_transformation is RthenL the post mapping becomes the left
+ * child of the combined mapping, otherwise the pre mapping does. Dominated
+ * combinations are pruned before returning.
+ */
+MachineMappingResultWithMemory
+    series_combine(float comm_cost,
+                   MachineMappingResultWithMemory const &pre_result,
+                   MachineMappingResultWithMemory const &post_result,
+                   std::optional<ParallelSplitTransformation> const
+                       &parallel_split_transformation) {
+  auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm,
+                                     SingleMachineMapping const &post_mm) {
+    CostMetric cost = CostMetric{
+        pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
+        pre_mm.cost.memory + post_mm.cost.memory,
+    };
+
+    ParallelLayerGuidObliviousMachineMapping mapping = [&] {
+      if (parallel_split_transformation.has_value() &&
+          parallel_split_transformation.value() ==
+              ParallelSplitTransformation::RthenL) {
+        return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping,
+                                       /*rhs=*/pre_mm.machine_mapping);
+      } else {
+        return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping,
+                                       /*rhs=*/post_mm.machine_mapping);
+      }
+    }();
+
+    return SingleMachineMapping{cost, mapping};
+  };
+
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+  for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) {
+    for (SingleMachineMapping const &post_mm : post_result.machine_mappings) {
+      result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm));
+    }
+  }
+
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+/**
+ * @brief Combines the results of the two children of a parallel split.
+ *
+ * Every mapping of lhs_result is paired with every mapping of rhs_result.
+ * The combined runtime and the combined memory are each the maximum of the
+ * two sides. Dominated combinations are pruned before returning.
+ */
+MachineMappingResultWithMemory
+    parallel_combine(MachineMappingResultWithMemory const &lhs_result,
+                     MachineMappingResultWithMemory const &rhs_result) {
+  auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm,
+                                     SingleMachineMapping const &rhs_mm) {
+    CostMetric cost = CostMetric{
+        std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
+        std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
+    };
+
+    ParallelLayerGuidObliviousMachineMapping mapping =
+        binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping);
+
+    return SingleMachineMapping{cost, mapping};
+  };
+
+  MachineMappingResultWithMemory result =
+      empty_machine_mapping_result_with_memory();
+  for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) {
+    for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) {
+      result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm));
+    }
+  }
+
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+/**
+ * @brief Unions the mappings of m1 and m2 and prunes dominated mappings.
+ *
+ * Despite the name, the returned result may hold several mappings: every
+ * mapping that is not strictly dominated in runtime and memory is kept.
+ */
+MachineMappingResultWithMemory
+    minimize_runtime(MachineMappingResultWithMemory const &m1,
+                     MachineMappingResultWithMemory const &m2) {
+  MachineMappingResultWithMemory result = MachineMappingResultWithMemory{
+      set_union(m1.machine_mappings, m2.machine_mappings),
+  };
+  return remove_non_dominating_machine_mapping_result(result);
+}
+
+MachineMappingResultWithMemory + make_singleton_machine_mapping_result_with_memory( + CostMetric cost, MachineView const &machine_view) { + return MachineMappingResultWithMemory{{ + SingleMachineMapping{ + cost, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), machine_view}, + }}, + }, + }}; +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc new file mode 100644 index 0000000000..936894ad2d --- /dev/null +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -0,0 +1,104 @@ +#include "compiler/allowed_machine_views.h" +#include "doctest/doctest.h" +#include "utils/containers/extend.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/containers/zip.h" +#include "utils/fmt/unordered_set.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + + TEST_CASE("get_allowed_machine_views") { + + SUBCASE("1 degree of parallelism") { + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/1, + /*num_cpus_per_node=*/5, + /*num_gpus_per_node=*/5, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + + OperatorTaskSpace task = OperatorTaskSpace{{3}}; + + std::unordered_set correct = { + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, + {MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, + {MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU}, + {MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, + 
{MachineViewDimension{stride_t{2}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + + SUBCASE("2 degrees of parallelism") { + + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/3, + /*num_cpus_per_node=*/3, + /*num_gpus_per_node=*/3, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + OperatorTaskSpace task = OperatorTaskSpace{{2, 3}}; + + auto make_2d_view = [&](int start_node_idx, + int start_device_idx, + int stride1, + int stride2, + MachineSpecificationDimension m1, + MachineSpecificationDimension m2) { + return MachineView{ + MachineSpaceCoordinate{ + start_node_idx, start_device_idx, DeviceType::GPU}, + {MachineViewDimension{stride_t{stride1}, m1}, + MachineViewDimension{stride_t{stride2}, m2}}, + }; + }; + + auto intra = MachineSpecificationDimension::INTRA_NODE; + auto inter = MachineSpecificationDimension::INTER_NODE; + std::unordered_set correct = { + make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), + make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), + make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra), + + make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter), + make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter), + make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter), + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + } +} diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index 7607132832..9ee596af3e 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -5,25 +5,23 @@ namespace FlexFlow { 
TestCostEstimator::TestCostEstimator( - std::function const - &get_operator_cost, - std::function const + std::function const &get_operator_cost, + std::function const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} -CostMetric TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); } -CostMetric TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { +float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } CostEstimator make_fake_cost_estimator( - std::function const - &get_operator_cost, - std::function const + std::function const &get_operator_cost, + std::function const &get_communication_cost) { return CostEstimator::create(get_operator_cost, @@ -31,8 +29,8 @@ CostEstimator make_fake_cost_estimator( } CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map) { + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, [comm_cost_map](TensorSetMovement const &m) { diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 1b2cc9e91e..7c1d06207a 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -11,27 +11,27 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { - std::function get_operator_cost; - std::function get_communication_cost; + std::function get_operator_cost; + std::function get_communication_cost; TestCostEstimator() = delete; 
TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, decltype(get_communication_cost) const &get_communication_cost); - CostMetric estimate_cost(OpCostEstimateKey const &) const override; - CostMetric estimate_cost(TensorSetMovement const &) const override; + float estimate_cost(OpCostEstimateKey const &) const override; + + float estimate_cost(TensorSetMovement const &) const override; }; CostEstimator make_fake_cost_estimator( - std::function const - &get_operator_cost, - std::function const + std::function const &get_operator_cost, + std::function const &get_communication_cost); CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map); + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map); } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 440e8506c4..a0d06fe930 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -42,8 +42,35 @@ TEST_SUITE(FF_TEST_SUITE) { }; }; - MachineView mv1 = make_1d_machine_view(gpu_id_t(1), gpu_id_t(2)); - MachineView mv2 = make_1d_machine_view(gpu_id_t(1), gpu_id_t(3)); + MachineView mv1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView mv2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; MachineSpecification full_machine_spec = 
MachineSpecification{ /*num_nodes=*/2, @@ -118,22 +145,22 @@ TEST_SUITE(FF_TEST_SUITE) { }}; CostEstimator cost_estimator = make_fake_cost_estimator( - std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 1}}, - {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 2}}, - {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 3}}, - {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 3}}, + std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, + {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, + {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, + {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, }}, - std::unordered_map{{ - {TensorSetMovement{{}}, CostMetric{0.0, 0}}, + std::unordered_map{{ + {TensorSetMovement{{}}, 0.0}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - CostMetric{0.1, 0}}, + 0.1}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - CostMetric{0.2, 0}}, + 0.2}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - CostMetric{0.3, 0}}, + 0.3}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - CostMetric{0.4, 0}}, + 0.4}, }}); MachineMappingContext context = MachineMappingContext{ @@ -150,25 +177,11 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ - /*memory_limit=*/10, - }; - - MachineMappingConfig config = MachineMappingConfig{ - /*enable_memory_optimization=*/false, - }; - - MachineMappingResult result = - get_optimal_machine_mapping(cache, - context, - problem_tree, - full_machine_spec, - constraints, - memory_constraints, - config); + MachineMappingResult result = get_optimal_machine_mapping( + cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/CostMetric{1.0, 1}, + 
/*runtime=*/1.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, @@ -187,25 +200,11 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ - /*memory_limit=*/10, - }; - - MachineMappingConfig config = MachineMappingConfig{ - /*enable_memory_optimization=*/false, - }; - - MachineMappingResult result = - get_optimal_machine_mapping(cache, - context, - problem_tree, - full_machine_spec, - constraints, - memory_constraints, - config); + MachineMappingResult result = get_optimal_machine_mapping( + cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/CostMetric{1.0 + 2.0 + 0.1, 1 + 2 + 0}, + /*runtime=*/1.0 + 2.0 + 0.1, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -235,25 +234,11 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ - /*memory_limit=*/10, - }; - - MachineMappingConfig config = MachineMappingConfig{ - /*enable_memory_optimization=*/false, - }; - - MachineMappingResult result = - get_optimal_machine_mapping(cache, - context, - problem_tree, - full_machine_spec, - constraints, - memory_constraints, - config); + MachineMappingResult result = get_optimal_machine_mapping( + cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/CostMetric{2.5, 3}, + /*runtime=*/2.5, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 
c66d533d0f..e22f715d82 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -64,10 +64,65 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAddedResult relu_2 = add_parallel_layer( pcg, relu_attrs, {get_only(relu_1.outputs)}, {relu_output_attrs}); - MachineView pre_mv1 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1}); - MachineView pre_mv2 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{2}); - MachineView post_mv1 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{3}); - MachineView post_mv2 = make_1d_machine_view(gpu_id_t{0}, gpu_id_t{4}); + MachineView pre_mv1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView pre_mv2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView post_mv1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{3}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView post_mv2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{4}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; SUBCASE("single edge across split") { PCGBinarySeriesSplit split = PCGBinarySeriesSplit{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc 
b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index 6b16a54c1f..221cca3ae1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -8,33 +8,89 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("combine_disjoint_mappings(MachineMapping, MachineMappping)") { - MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); - MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + MachineMapping machine_mapping_0 = MachineMapping({ - {parallel_layer_guid_t(Node(0)), machine_view_0}, + {parallel_layer_guid_t{Node{0}}, machine_view_0}, }); MachineMapping machine_mapping_1 = MachineMapping({ - {parallel_layer_guid_t(Node(1)), machine_view_1}, - }); - MachineMapping correct = MachineMapping({ - {parallel_layer_guid_t(Node(0)), machine_view_0}, - {parallel_layer_guid_t(Node(1)), machine_view_1}, + {parallel_layer_guid_t{Node{1}}, machine_view_1}, }); + MachineMapping correct = MachineMapping{{ + {parallel_layer_guid_t{Node{0}}, machine_view_0}, + {parallel_layer_guid_t{Node{1}}, machine_view_1}, + }}; MachineMapping result = combine_disjoint_mappings(machine_mapping_0, machine_mapping_1); CHECK(result == correct); } TEST_CASE("nodes_are_disjoint(MachineMapping, MachineMappping)") { - MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), 
gpu_id_t(1)); - MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + MachineMapping machine_mapping_0 = MachineMapping({ - {parallel_layer_guid_t(Node(0)), machine_view_0}, + {parallel_layer_guid_t{Node{0}}, machine_view_0}, }); SUBCASE("nodes are disjoint") { MachineMapping machine_mapping_1 = MachineMapping({ - {parallel_layer_guid_t(Node(1)), machine_view_1}, + {parallel_layer_guid_t{Node{1}}, machine_view_1}, }); bool correct = true; @@ -44,8 +100,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("nodes are not disjoint") { MachineMapping machine_mapping_1 = MachineMapping({ - {parallel_layer_guid_t(Node(0)), machine_view_0}, - {parallel_layer_guid_t(Node(1)), machine_view_1}, + {parallel_layer_guid_t{Node{0}}, machine_view_0}, + {parallel_layer_guid_t{Node{1}}, machine_view_1}, }); bool correct = false; bool result = nodes_are_disjoint(machine_mapping_0, machine_mapping_1); diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 7665f929f2..73b921fc98 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -6,20 +6,37 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("series_combine") { - MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ 
- /*memory_limit=*/10, - }; - MachineMappingConfig config = MachineMappingConfig{ - /*enable_memory_optimization=*/false, + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, }; - MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); - MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); - - CostMetric pre_cost = CostMetric{ - /*runtime=*/2.0, - /*memory=*/2, + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, }; + + float pre_cost = 2.0; MachineMappingResult pre = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/pre_cost, @@ -41,10 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CostMetric post_cost = CostMetric{ - /*runtime=*/4.0, - /*memory=*/1, - }; + float post_cost = 4.0; MachineMappingResult post = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/post_cost, @@ -60,32 +74,19 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - CostMetric comm_cost = CostMetric{ - /*runtime=*/3.0, - /*memory=*/0, - }; + float comm_cost = 3.0; - SUBCASE("pre is infeasbile") { - MachineMappingResult result = - series_combine(config, - memory_constraints, - comm_cost, - infeasible, - post, - ParallelSplitTransformation::LthenR); + SUBCASE("pre is infeasible") { + MachineMappingResult result = series_combine( + comm_cost, infeasible, post, ParallelSplitTransformation::LthenR); MachineMappingResult correct = infeasible; CHECK(result == correct); } - SUBCASE("post is infeasbile") { - MachineMappingResult result = - 
series_combine(config, - memory_constraints, - comm_cost, - pre, - infeasible, - ParallelSplitTransformation::LthenR); + SUBCASE("post is infeasible") { + MachineMappingResult result = series_combine( + comm_cost, pre, infeasible, ParallelSplitTransformation::LthenR); MachineMappingResult correct = infeasible; CHECK(result == correct); @@ -93,9 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("both are infeasible") { MachineMappingResult result = - series_combine(config, - memory_constraints, - comm_cost, + series_combine(comm_cost, infeasible, infeasible, ParallelSplitTransformation::LthenR); @@ -105,13 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("both are feasible") { - CostMetric no_parallel_split_transform_cost = CostMetric{ - /*runtime=*/pre_cost.runtime + post_cost.runtime + comm_cost.runtime, - /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory, - }; MachineMappingResult no_parallel_split_transform = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/no_parallel_split_transform_cost, + /*runtime=*/pre_cost + comm_cost + post_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -139,42 +134,27 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("parallel_split_transformation = std::nullopt") { - MachineMappingResult result = series_combine( - config, memory_constraints, comm_cost, pre, post, std::nullopt); + MachineMappingResult result = + series_combine(comm_cost, pre, post, std::nullopt); MachineMappingResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = LthenR") { - MachineMappingResult result = - series_combine(config, - memory_constraints, - comm_cost, - pre, - post, - ParallelSplitTransformation::LthenR); + MachineMappingResult result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::LthenR); MachineMappingResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = RthenL") { - 
MachineMappingResult result = - series_combine(config, - memory_constraints, - comm_cost, - pre, - post, - ParallelSplitTransformation::RthenL); - CostMetric correct_cost = CostMetric{ - /*runtime=*/pre_cost.runtime + post_cost.runtime + - comm_cost.runtime, - /*memory=*/pre_cost.memory + post_cost.memory + comm_cost.memory, - }; + MachineMappingResult result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::RthenL); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/correct_cost, + /*runtime=*/pre_cost + comm_cost + post_cost, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -207,29 +187,39 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("parallel_combine") { - MachineMemoryConstraints memory_constraints = MachineMemoryConstraints{ - /*memory_limit=*/10, - }; - MachineMappingConfig config = MachineMappingConfig{ - /*enable_memory_optimization=*/false, - }; - - MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); - MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); - - CostMetric lhs_cost = CostMetric{ - /*runtime=*/2.0, - /*memory=*/2, + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, }; - CostMetric rhs_cost = CostMetric{ - /*runtime=*/4.0, - /*memory=*/1, + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, }; MachineMappingResult lhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/lhs_cost, + /*runtime=*/2.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ 
-250,7 +240,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult rhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/rhs_cost, + /*runtime=*/4.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -263,41 +253,32 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - SUBCASE("lhs is infeasbile") { - MachineMappingResult result = - parallel_combine(config, memory_constraints, infeasible, rhs); + SUBCASE("lhs is infeasible") { + MachineMappingResult result = parallel_combine(infeasible, rhs); MachineMappingResult correct = infeasible; CHECK(result == correct); } - SUBCASE("rhs is infeasbile") { - MachineMappingResult result = - parallel_combine(config, memory_constraints, lhs, infeasible); + SUBCASE("rhs is infeasible") { + MachineMappingResult result = parallel_combine(lhs, infeasible); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("both are infeasible") { - MachineMappingResult result = - parallel_combine(config, memory_constraints, infeasible, infeasible); + MachineMappingResult result = parallel_combine(infeasible, infeasible); MachineMappingResult correct = infeasible; CHECK(result == correct); } SUBCASE("both are feasible") { - MachineMappingResult result = - parallel_combine(config, memory_constraints, lhs, rhs); - - CostMetric correct_cost = CostMetric{ - /*runtime=*/4.0, - /*memory=*/2, - }; + MachineMappingResult result = parallel_combine(lhs, rhs); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/correct_cost, + /*runtime=*/4.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -329,12 +310,39 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("minimize_runtime") { - MachineView machine_view_0 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(1)); - MachineView machine_view_1 = make_1d_machine_view(gpu_id_t(0), gpu_id_t(2)); + MachineView machine_view_0 = MachineView{ + 
/*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; MachineMappingResult faster = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/CostMetric{2.0, 2}, + /*runtime=*/2.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -355,7 +363,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult slower = MachineMappingResult{ FeasibleMachineMappingResult{ - /*cost=*/CostMetric{4.0, 1}, + /*runtime=*/4.0, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -368,7 +376,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - SUBCASE("lhs is infeasbile") { + SUBCASE("lhs is infeasible") { MachineMappingResult result = minimize_runtime(infeasible, slower); MachineMappingResult correct = slower; From 964c885c5bf667ec1285eca6c0f7746b7c2e6edc Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 30 Oct 2024 20:35:08 -0400 Subject: [PATCH 05/16] fix tests & format --- .../machine_mapping_result_with_memory.h | 1 + .../cost_estimator_for_test.cc | 47 +++++++++++++++++-- .../machine_mapping/cost_estimator_for_test.h | 22 ++++++++- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h index 6203b99e55..d56d33f7ec 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h +++ 
b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h @@ -3,6 +3,7 @@ #include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include namespace FlexFlow { diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index 9ee596af3e..b55b4d283c 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -7,9 +7,12 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( std::function const &get_operator_cost, std::function const - &get_communication_cost) + &get_communication_cost, + std::function const + &get_operator_cost_with_memory) : get_operator_cost(get_operator_cost), - get_communication_cost(get_communication_cost) {} + get_communication_cost(get_communication_cost), + get_operator_cost_with_memory(get_operator_cost_with_memory) {} float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); @@ -19,13 +22,24 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } +CostMetric TestCostEstimator::estimate_cost_with_memory( + OpCostEstimateKey const &k) const { + return this->get_operator_cost_with_memory(k); +} + CostEstimator make_fake_cost_estimator( std::function const &get_operator_cost, std::function const &get_communication_cost) { + auto get_operator_cost_with_memory = [=](OpCostEstimateKey const &k) { + return CostMetric{ + get_operator_cost(k), + 0, + }; + }; - return CostEstimator::create(get_operator_cost, - get_communication_cost); + return make_fake_cost_estimator( + get_operator_cost, get_communication_cost, get_operator_cost_with_memory); } CostEstimator 
make_fake_cost_estimator( @@ -38,4 +52,29 @@ CostEstimator make_fake_cost_estimator( }); } +CostEstimator make_fake_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost, + std::function const + &get_operator_cost_with_memory) { + return CostEstimator::create( + get_operator_cost, get_communication_cost, get_operator_cost_with_memory); +} + +CostEstimator make_fake_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map, + std::unordered_map const + &op_cost_with_memory_map) { + return make_fake_cost_estimator( + [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, + [comm_cost_map](TensorSetMovement const &m) { + return comm_cost_map.at(m); + }, + [op_cost_with_memory_map](OpCostEstimateKey const &k) { + return op_cost_with_memory_map.at(k); + }); +} + } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 7c1d06207a..302421f873 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -13,15 +13,22 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { std::function get_operator_cost; std::function get_communication_cost; + std::function + get_operator_cost_with_memory; TestCostEstimator() = delete; TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, decltype(get_communication_cost) - const &get_communication_cost); + const &get_communication_cost, + decltype(get_operator_cost_with_memory) + const &get_operator_cost_with_memory); float estimate_cost(OpCostEstimateKey const &) const override; float estimate_cost(TensorSetMovement const &) const override; + + CostMetric + estimate_cost_with_memory(OpCostEstimateKey const &) const override; }; CostEstimator 
make_fake_cost_estimator( @@ -33,6 +40,19 @@ CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, std::unordered_map const &comm_cost_map); +CostEstimator make_fake_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost, + std::function const + &get_operator_cost_with_memory); + +CostEstimator make_fake_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map, + std::unordered_map const + &op_cost_with_memory_map); + } // namespace FlexFlow #endif From 0c0e7b042fe9bee725e4592259b62256dac17882 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 6 Nov 2024 20:56:36 -0500 Subject: [PATCH 06/16] minimum tests for memory algorithm --- ...get_optimal_machine_mapping_with_memory.cc | 293 +++++++++ .../machine_mapping_result_with_memory.cc | 585 ++++++++++++++++++ 2 files changed, 878 insertions(+) create mode 100644 lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc create mode 100644 lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc new file mode 100644 index 0000000000..566af800ea --- /dev/null +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -0,0 +1,293 @@ +#include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" +#include "../cost_estimator_for_test.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include 
"compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" +#include "pcg/machine_view.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "utils/containers/get_only.h" +#include "utils/full_binary_tree/binary_tree_path.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_optimal_machine_mapping_with_memory") { + auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { + return MachineMappingProblemTree{k}; + }; + + auto make_series_split = + [](AbstractedTensorSetMovement const &tensor_set_movement, + MachineMappingProblemTree const &lhs, + MachineMappingProblemTree const &rhs) { + return MachineMappingProblemTree{ + MMProblemTreeSeriesSplit{ + /*tensor_set_movement=*/tensor_set_movement, + /*left_child=*/lhs, + /*right_child=*/rhs, + }, + }; + }; + + auto make_parallel_split = [](MachineMappingProblemTree const &lhs, + MachineMappingProblemTree const &rhs) { + return MachineMappingProblemTree{ + MMProblemTreeParallelSplit{ + /*left_child=*/lhs, + /*right_child=*/rhs, + }, + }; + }; + + MachineView mv1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView mv2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/2, + /*num_cpus_per_node=*/1, + /*num_gpus_per_node=*/1, + 
/*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + MachineSpecification split_machine_spec = MachineSpecification{ + /*num_nodes=*/1, + /*num_cpus_per_node=*/1, + /*num_gpus_per_node=*/1, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set{mv1, mv2}; + } else { + return std::unordered_set{mv2}; + } + }; + + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; + + UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, + }}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; + + ParallelTensorShape tensor_shape1 = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ + AbstractedSingleTensorMovement{ + /*parallel_tensor_shape=*/tensor_shape1, + /*src_machine_views=*/{}, + /*dst_machine_views=*/{}, + }, + }}; + + ParallelLayerGuidObliviousMachineMapping mm1 = + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv1}, + }}; + ParallelLayerGuidObliviousMachineMapping mm2 = + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv2}, + }}; + + CostEstimator cost_estimator = make_fake_cost_estimator( + std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, + {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, + {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, + 
{map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, + }}, + std::unordered_map{{ + {TensorSetMovement{{}}, 0.0}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), + 0.1}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), + 0.2}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), + 0.3}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), + 0.4}, + }}, + std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 2}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 3}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 1}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 2}}, + }}); + + MachineMappingContext context = MachineMappingContext{ + cost_estimator, + allowed_machine_views1, + }; + + MachineMappingCacheWithMemory cache = + empty_machine_mapping_cache_with_memory(); + + SUBCASE("single layer") { + MachineMappingProblemTree problem_tree = make_leaf(k1); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingResultWithMemory result = + get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + SingleMachineMapping{ + CostMetric{1.0, 2}, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv1}, + }}, + }, + SingleMachineMapping{ + CostMetric{1.5, 1}, + ParallelLayerGuidObliviousMachineMapping{{ + {binary_tree_root_path(), mv2}, + }}, + }, + }}; + + CHECK(result == correct); + } + + SUBCASE("pair of layers in sequence") { + MachineMappingProblemTree problem_tree = + make_series_split(movement1, make_leaf(k1), make_leaf(k2)); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingResultWithMemory result = + 
get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + SingleMachineMapping{ + CostMetric{1.0 + 2.0 + 0.1, 2 + 3}, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + mv1, + }, + }}, + }, + SingleMachineMapping{ + CostMetric{1.5 + 2.5 + 0.1, 1 + 2}, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv2, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + mv2, + }, + }}, + }, + }}; + + CHECK(result == correct); + } + + SUBCASE("pair of layers in parallel") { + MachineMappingProblemTree problem_tree = + make_parallel_split(make_leaf(k1), make_leaf(k2)); + + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers( + get_all_leaf_paths(problem_tree)); + + MachineMappingResultWithMemory result = + get_optimal_machine_mapping_with_memory( + cache, context, problem_tree, full_machine_spec, constraints); + MachineMappingResultWithMemory correct = + MachineMappingResultWithMemory{{SingleMachineMapping{ + CostMetric{2.5, 2}, + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + mv2, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + mv2, + }, + }}, + + }}}; + + CHECK(result == correct); + } + } +} diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc new file mode 100644 index 0000000000..6ca551c436 --- /dev/null +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -0,0 +1,585 @@ +#include 
"compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "pcg/machine_view.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("remove_non_dominating_machine_mapping_result") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{4}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + CostMetric cost1 = CostMetric{ + 2.0, + 2, + }; + CostMetric cost2 = CostMetric{ + 4.0, + 1, + }; + CostMetric cost3 = CostMetric{ + 2.0, + 3, + }; + + SingleMachineMapping mm1 = SingleMachineMapping{ + cost1, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_0, + }, + }, + }, + }; + + SingleMachineMapping mm2 = SingleMachineMapping{ + cost2, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }; + + SingleMachineMapping mm3 = SingleMachineMapping{ + cost3, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_2, + }, + }, + }, + }; + + SUBCASE("empty") { + MachineMappingResultWithMemory to_remove = + empty_machine_mapping_result_with_memory(); + MachineMappingResultWithMemory result = + remove_non_dominating_machine_mapping_result(to_remove); + 
MachineMappingResultWithMemory correct = + empty_machine_mapping_result_with_memory(); + + CHECK(result == correct); + } + + SUBCASE("no non-dominating") { + MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{ + { + mm1, + mm2, + }, + }; + MachineMappingResultWithMemory result = + remove_non_dominating_machine_mapping_result(to_remove); + MachineMappingResultWithMemory correct = to_remove; + + CHECK(result == correct); + } + + SUBCASE("non-dominating") { + MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{ + { + mm1, + mm2, + mm3, + }, + }; + MachineMappingResultWithMemory result = + remove_non_dominating_machine_mapping_result(to_remove); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + { + mm1, + mm2, + }, + }; + + CHECK(result == correct); + } + } + + TEST_CASE("series_combine(memory)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + CostMetric pre_cost = CostMetric{ + 2.0, + 2, + }; + MachineMappingResultWithMemory pre = MachineMappingResultWithMemory{{ + SingleMachineMapping{ + pre_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{ + {BinaryTreePathEntry::LEFT_CHILD}, + }, + machine_view_0, + }, + { + BinaryTreePath{ + {BinaryTreePathEntry::RIGHT_CHILD}, + }, + machine_view_1, + }, + }, + }, + }, + }}; + + CostMetric post_cost = CostMetric{ + 4.0, + 1, + }; + + MachineMappingResultWithMemory post = MachineMappingResultWithMemory{{ + SingleMachineMapping{ 
+ post_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }, + }}; + + MachineMappingResultWithMemory empty = + empty_machine_mapping_result_with_memory(); + + float comm_cost = 3.0; + + SUBCASE("pre is empty") { + MachineMappingResultWithMemory result = series_combine( + comm_cost, empty, post, ParallelSplitTransformation::LthenR); + MachineMappingResultWithMemory correct = empty; + + CHECK(result == correct); + } + + SUBCASE("post is empty") { + MachineMappingResultWithMemory result = series_combine( + comm_cost, pre, empty, ParallelSplitTransformation::LthenR); + MachineMappingResultWithMemory correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both are nonempty") { + MachineMappingResultWithMemory no_parallel_split_transform = + MachineMappingResultWithMemory{ + { + SingleMachineMapping{ + /*cost=*/CostMetric{ + pre_cost.runtime + comm_cost + post_cost.runtime, + pre_cost.memory + post_cost.memory, + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_0, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + }}, + }, + }, + }; + + SUBCASE("parallel_split_transformation = std::nullopt") { + MachineMappingResultWithMemory result = + series_combine(comm_cost, pre, post, std::nullopt); + MachineMappingResultWithMemory correct = no_parallel_split_transform; + + CHECK(result == correct); + } + + SUBCASE("parallel_split_transformation = LthenR") { + MachineMappingResultWithMemory result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::LthenR); + MachineMappingResultWithMemory correct = no_parallel_split_transform; + + CHECK(result == correct); + } + + 
SUBCASE("parallel_split_transformation = RthenL") { + MachineMappingResultWithMemory result = series_combine( + comm_cost, pre, post, ParallelSplitTransformation::RthenL); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + { + SingleMachineMapping{ + /*cost=*/CostMetric{ + pre_cost.runtime + comm_cost + post_cost.runtime, + pre_cost.memory + post_cost.memory, + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{{ + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_0, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::RIGHT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD, + }}, + machine_view_1, + }, + { + BinaryTreePath{{ + BinaryTreePathEntry::LEFT_CHILD, + }}, + machine_view_1, + }, + }}, + }, + }, + }; + + CHECK(result == correct); + } + } + } + + TEST_CASE("parallel_combine(memory)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + CostMetric lhs_cost = CostMetric{ + 2.0, + 2, + }; + MachineMappingResultWithMemory lhs = MachineMappingResultWithMemory{{ + SingleMachineMapping{ + lhs_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{ + {BinaryTreePathEntry::LEFT_CHILD}, + }, + machine_view_0, + }, + { + BinaryTreePath{ + {BinaryTreePathEntry::RIGHT_CHILD}, + }, + machine_view_1, + }, + }, + }, + }, + }}; + + CostMetric rhs_cost = CostMetric{ + 4.0, + 1, + }; + MachineMappingResultWithMemory rhs = MachineMappingResultWithMemory{{ + 
SingleMachineMapping{ + rhs_cost, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }, + }}; + + MachineMappingResultWithMemory empty = + empty_machine_mapping_result_with_memory(); + + SUBCASE("lhs is empty") { + MachineMappingResultWithMemory result = parallel_combine(empty, rhs); + MachineMappingResultWithMemory correct = empty; + + CHECK(result == correct); + } + + SUBCASE("rhs is empty") { + MachineMappingResultWithMemory result = parallel_combine(lhs, empty); + MachineMappingResultWithMemory correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both are nonempty") { + MachineMappingResultWithMemory result = parallel_combine(lhs, rhs); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + SingleMachineMapping{ + /*cost=*/CostMetric{ + std::max(lhs_cost.runtime, rhs_cost.runtime), + std::max(lhs_cost.memory, rhs_cost.memory), + }, + /*machine_mapping=*/ + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::LEFT_CHILD}}, + machine_view_0, + }, + { + BinaryTreePath{{BinaryTreePathEntry::LEFT_CHILD, + BinaryTreePathEntry::RIGHT_CHILD}}, + machine_view_1, + }, + { + BinaryTreePath{{BinaryTreePathEntry::RIGHT_CHILD}}, + machine_view_1, + }, + }, + }, + }, + }}; + + CHECK(result == correct); + } + } + + TEST_CASE("minimize_runtime(memory)") { + MachineView machine_view_0 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{1}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + MachineView machine_view_1 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{2}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + 
MachineView machine_view_2 = MachineView{ + /*start=*/MachineSpaceCoordinate{ + /*node_idx=*/0, + /*device_idx=*/0, + /*device_type=*/DeviceType::GPU, + }, + /*dimensions=*/ + { + MachineViewDimension{ + stride_t{4}, + MachineSpecificationDimension::INTRA_NODE, + }, + }, + }; + + CostMetric cost1 = CostMetric{ + 2.0, + 2, + }; + CostMetric cost2 = CostMetric{ + 4.0, + 1, + }; + CostMetric cost3 = CostMetric{ + 2.0, + 3, + }; + + SingleMachineMapping mm1 = SingleMachineMapping{ + cost1, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_0, + }, + }, + }, + }; + + SingleMachineMapping mm2 = SingleMachineMapping{ + cost2, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_1, + }, + }, + }, + }; + + SingleMachineMapping mm3 = SingleMachineMapping{ + cost3, + ParallelLayerGuidObliviousMachineMapping{ + { + { + BinaryTreePath{{}}, + machine_view_2, + }, + }, + }, + }; + + MachineMappingResultWithMemory result1 = MachineMappingResultWithMemory{ + { + mm1, + mm2, + }, + }; + + MachineMappingResultWithMemory result2 = MachineMappingResultWithMemory{ + { + mm2, + mm3, + }, + }; + + MachineMappingResultWithMemory result = minimize_runtime(result1, result2); + MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + { + mm1, + mm2, + }, + }; + + CHECK(result == correct); + } +} From 77783771b873ba2f2abd0fa45d32a03db548138b Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 18 Dec 2024 00:38:17 -0500 Subject: [PATCH 07/16] renaming --- .../compiler/cost_estimator/cost_estimator.h | 9 +- .../compiler/cost_estimator/cost_metric.h | 28 ---- ...truct.toml => op_cost_metrics.struct.toml} | 2 +- .../get_optimal_machine_mapping.h | 1 - .../machine_mapping_config.struct.toml | 13 -- .../get_optimal_machine_mapping_with_memory.h | 19 ++- .../machine_mapping_result_with_memory.h | 41 ------ ....h => machine_mapping_with_memory_cache.h} | 16 +-- ...ine_mapping_with_memory_cache.struct.toml} | 6 +- 
.../machine_mapping_with_memory_result.h | 41 ++++++ ...ne_mapping_with_memory_result.struct.toml} | 2 +- .../single_machine_mapping.struct.toml | 4 +- .../compiler/cost_estimator/cost_estimator.cc | 7 +- .../compiler/cost_estimator/cost_metric.cc | 55 -------- .../get_optimal_machine_mapping.cc | 2 +- ...get_optimal_machine_mapping_with_memory.cc | 56 ++++---- ...c => machine_mapping_with_memory_cache.cc} | 20 +-- ... => machine_mapping_with_memory_result.cc} | 78 +++++------ .../cost_estimator_for_test.cc | 56 ++------ .../machine_mapping/cost_estimator_for_test.h | 30 +---- .../get_optimal_machine_mapping.cc | 41 +++++- ...get_optimal_machine_mapping_with_memory.cc | 46 +++---- .../machine_mapping_result_with_memory.cc | 122 +++++++++--------- 23 files changed, 276 insertions(+), 419 deletions(-) delete mode 100644 lib/compiler/include/compiler/cost_estimator/cost_metric.h rename lib/compiler/include/compiler/cost_estimator/{cost_metric.struct.toml => op_cost_metrics.struct.toml} (88%) delete mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml delete mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.h => machine_mapping_with_memory_cache.h} (51%) rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.struct.toml => machine_mapping_with_memory_cache.struct.toml} (78%) create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{machine_mapping_result_with_memory.struct.toml => machine_mapping_with_memory_result.struct.toml} (89%) delete mode 100644 lib/compiler/src/compiler/cost_estimator/cost_metric.cc rename 
lib/compiler/src/compiler/machine_mapping/memory_optimization/{machine_mapping_cache_with_memory.cc => machine_mapping_with_memory_cache.cc} (50%) rename lib/compiler/src/compiler/machine_mapping/memory_optimization/{machine_mapping_result_with_memory.cc => machine_mapping_with_memory_result.cc} (59%) diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index 828200cc6a..9b006f178a 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H -#include "compiler/cost_estimator/cost_metric.dtg.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" #include "compiler/cost_estimator/tensor_set_movement.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -12,10 +12,8 @@ namespace FlexFlow { struct ICostEstimator { - virtual float estimate_cost(OpCostEstimateKey const &) const = 0; + virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0; virtual float estimate_cost(TensorSetMovement const &) const = 0; - virtual CostMetric - estimate_cost_with_memory(OpCostEstimateKey const &) const = 0; ICostEstimator() = default; ICostEstimator(ICostEstimator const &) = delete; @@ -26,9 +24,8 @@ struct ICostEstimator { CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { - float estimate_cost(OpCostEstimateKey const &k) const; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const; float estimate_cost(TensorSetMovement const &m) const; - CostMetric estimate_cost_with_memory(OpCostEstimateKey const &k) const; template static typename std::enable_if::value, diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.h 
b/lib/compiler/include/compiler/cost_estimator/cost_metric.h deleted file mode 100644 index 98b0cb228d..0000000000 --- a/lib/compiler/include/compiler/cost_estimator/cost_metric.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H -#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_METRIC_H - -#include "compiler/cost_estimator/cost_metric.dtg.h" -#include - -namespace FlexFlow { - -CostMetric zero_cost_metric(); - -CostMetric combine_cost_metrics_inter_device(CostMetric const &c1, - CostMetric const &c2); -CostMetric - combine_cost_metrics_inter_device(std::vector const &costs); - -CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1, - CostMetric const &c2); -CostMetric combine_cost_metrics_intra_device_sequential( - std::vector const &costs); - -CostMetric combine_cost_metrics_intra_device_parallel(CostMetric const &c1, - CostMetric const &c2); -CostMetric combine_cost_metrics_intra_device_parallel( - std::vector const &costs); - -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml similarity index 88% rename from lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml rename to lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml index 0666bb9e11..f137935a4d 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_metric.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "CostMetric" +name = "OpCostMetrics" features = [ "eq", "fmt", diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index f69e6ab91b..62da90bfcb 100644 --- 
a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_GET_OPTIMAL_MACHINE_MAPPING_H #include "compiler/machine_mapping/machine_mapping_cache.dtg.h" -#include "compiler/machine_mapping/machine_mapping_config.dtg.h" #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" #include "compiler/machine_mapping/machine_mapping_context.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml deleted file mode 100644 index f4c0b61291..0000000000 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_config.struct.toml +++ /dev/null @@ -1,13 +0,0 @@ -namespace = "FlexFlow" -name = "MachineMappingConfig" -features = [ - "eq", - "hash", - "fmt", -] - -includes = [] - -[[fields]] -name = "enable_memory_optimization" -type = "bool" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h index f8a2e4d75a..d176d298db 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h @@ -2,27 +2,26 @@ #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_GET_OPTIMAL_MACHINE_MAPPING_WITH_MEMORY_H #include "compiler/machine_mapping/machine_mapping_cache.dtg.h" -#include "compiler/machine_mapping/machine_mapping_config.dtg.h" #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" #include
"compiler/machine_mapping/machine_mapping_context.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" -#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "pcg/machine_specification.dtg.h" namespace FlexFlow { -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, MachineMappingConstraints const &constraints); -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, @@ -30,15 +29,15 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( std::optional const &parallel_split_transformation); -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MMProblemTreeParallelSplit const &parallel_split, MachineSpecification const &resources, MachineMappingConstraints const
&constraints); -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &, UnmappedOpCostEstimateKey const &leaf, MachineSpecification const &resources, diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h deleted file mode 100644 index d56d33f7ec..0000000000 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H -#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H - -#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h" -#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" -#include - -namespace FlexFlow { - -[[nodiscard]] MachineMappingResultWithMemory - empty_machine_mapping_result_with_memory(); -[[nodiscard]] bool is_empty(MachineMappingResultWithMemory const &); - -[[nodiscard]] MachineMappingResultWithMemory get_mapping_with_minimal_runtime( - std::unordered_set const &); - -[[nodiscard]] MachineMappingResultWithMemory - remove_non_dominating_machine_mapping_result( - MachineMappingResultWithMemory const &); - -[[nodiscard]] MachineMappingResultWithMemory - series_combine(float comm_cost, - MachineMappingResultWithMemory const &pre_result, - MachineMappingResultWithMemory const &post_result, - std::optional const - &parallel_split_transformation); -[[nodiscard]] MachineMappingResultWithMemory - parallel_combine(MachineMappingResultWithMemory const &lhs_result, - MachineMappingResultWithMemory const
&rhs_result); - -[[nodiscard]] MachineMappingResultWithMemory - minimize_runtime(MachineMappingResultWithMemory const &m1, - MachineMappingResultWithMemory const &m2); - -[[nodiscard]] MachineMappingResultWithMemory - make_singleton_machine_mapping_result_with_memory( - CostMetric cost, MachineView const &machine_view); - -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h similarity index 51% rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h index 2c45c04d3d..b749235c89 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h @@ -1,18 +1,18 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_CACHE_WITH_MEMORY_H -#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" namespace FlexFlow { -MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory(); -std::optional - machine_mapping_cache_with_memory_load( - MachineMappingCacheWithMemory const &, MachineMappingState const &); -void machine_mapping_cache_with_memory_save( - MachineMappingCacheWithMemory &, +MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache(); +std::optional + machine_mapping_with_memory_cache_load( + MachineMappingWithMemoryCache 
const &, MachineMappingState const &); +void machine_mapping_with_memory_cache_save( + MachineMappingWithMemoryCache &, MachineMappingState const &, - MachineMappingResultWithMemory const &); + MachineMappingWithMemoryResult const &); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml similarity index 78% rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml index e7afa26bb3..c2fe393e99 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "MachineMappingCacheWithMemory" +name = "MachineMappingWithMemoryCache" features = [ "eq", "hash", @@ -9,7 +9,7 @@ features = [ includes = [ "", "compiler/machine_mapping/machine_mapping_state.dtg.h", - "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.dtg.h", + "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h", ] src_includes = [ @@ -19,4 +19,4 @@ src_includes = [ [[fields]] name = "raw_map" -type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingResultWithMemory>" +type = "std::unordered_map<::FlexFlow::MachineMappingState, ::FlexFlow::MachineMappingWithMemoryResult>" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h new file mode 
100644 index 0000000000..0383376116 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H +#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_MEMORY_OPTIMIZATION_MACHINE_MAPPING_RESULT_WITH_MEMORY_H + +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.dtg.h" +#include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include + +namespace FlexFlow { + +[[nodiscard]] MachineMappingWithMemoryResult + empty_machine_mapping_with_memory_result(); +[[nodiscard]] bool is_empty(MachineMappingWithMemoryResult const &); + +[[nodiscard]] MachineMappingWithMemoryResult get_mapping_with_minimal_runtime( + std::unordered_set const &); + +[[nodiscard]] MachineMappingWithMemoryResult + remove_non_pareto_optimal_machine_mapping_result( + MachineMappingWithMemoryResult const &); + +[[nodiscard]] MachineMappingWithMemoryResult + series_combine(float comm_cost, + MachineMappingWithMemoryResult const &pre_result, + MachineMappingWithMemoryResult const &post_result, + std::optional const + ¶llel_split_transformation); +[[nodiscard]] MachineMappingWithMemoryResult + parallel_combine(MachineMappingWithMemoryResult const &lhs_result, + MachineMappingWithMemoryResult const &rhs_result); + +[[nodiscard]] MachineMappingWithMemoryResult + minimize_runtime(MachineMappingWithMemoryResult const &m1, + MachineMappingWithMemoryResult const &m2); + +[[nodiscard]] MachineMappingWithMemoryResult + make_singleton_machine_mapping_with_memory_result( + OpCostMetrics cost, MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml 
b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml similarity index 89% rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml index f3b2895b83..50de145b36 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "MachineMappingResultWithMemory" +name = "MachineMappingWithMemoryResult" features = [ "eq", "hash", diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml index 05a23e905a..f33e320e3b 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml @@ -8,12 +8,12 @@ features = [ includes = [ "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", - "compiler/cost_estimator/cost_metric.dtg.h", + "compiler/cost_estimator/op_cost_metrics.dtg.h", ] [[fields]] name = "cost" -type = "::FlexFlow::CostMetric" +type = "::FlexFlow::OpCostMetrics" [[fields]] name = "machine_mapping" diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 40a0f4e2a4..6ac6e3a8d6 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -5,7 +5,7 @@ namespace FlexFlow { 
CostEstimator::CostEstimator(std::shared_ptr implementation_ptr) : implementation_ptr(implementation_ptr) {} -float CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +OpCostMetrics CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->implementation_ptr->estimate_cost(k); } @@ -13,9 +13,4 @@ float CostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->implementation_ptr->estimate_cost(m); } -CostMetric - CostEstimator::estimate_cost_with_memory(OpCostEstimateKey const &k) const { - return this->implementation_ptr->estimate_cost_with_memory(k); -} - } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc b/lib/compiler/src/compiler/cost_estimator/cost_metric.cc deleted file mode 100644 index dfaf0702c9..0000000000 --- a/lib/compiler/src/compiler/cost_estimator/cost_metric.cc +++ /dev/null @@ -1,55 +0,0 @@ -#include "compiler/cost_estimator/cost_metric.h" - -namespace FlexFlow { - -CostMetric zero_cost_metric() { - return CostMetric{ - /*runtime=*/0, - /*memory=*/0, - }; -} - -CostMetric combine_cost_metrics_inter_device(CostMetric const &c1, - CostMetric const &c2) { - return CostMetric{c1.runtime + c2.runtime, c1.memory + c2.memory}; -} - -CostMetric - combine_cost_metrics_inter_device(std::vector const &costs) { - CostMetric result = zero_cost_metric(); - for (CostMetric const &cost : costs) { - result = combine_cost_metrics_inter_device(result, cost); - } - return result; -} - -CostMetric combine_cost_metrics_intra_device_sequential(CostMetric const &c1, - CostMetric const &c2) { - return CostMetric{c1.runtime + c2.runtime, std::max(c1.memory, c2.memory)}; -} - -CostMetric combine_cost_metrics_intra_device_sequential( - std::vector const &costs) { - CostMetric result = zero_cost_metric(); - for (CostMetric const &cost : costs) { - result = combine_cost_metrics_intra_device_sequential(result, cost); - } - return result; -} - -CostMetric 
combine_cost_metrics_intra_device_parallel(CostMetric const &c1, - CostMetric const &c2) { - return CostMetric{std::max(c1.runtime, c2.runtime), - std::max(c1.memory, c2.memory)}; -} - -CostMetric combine_cost_metrics_intra_device_parallel( - std::vector const &costs) { - CostMetric result = zero_cost_metric(); - for (CostMetric const &cost : costs) { - result = combine_cost_metrics_intra_device_parallel(result, cost); - } - return result; -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 10abd7ff90..5bdd8645a5 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -240,7 +240,7 @@ MachineMappingResult auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key(leaf, machine_view); - float cost = context.cost_estimator.estimate_cost(mapped); + float cost = context.cost_estimator.estimate_cost(mapped).runtime; return make_singleton_machine_mapping_result(cost, machine_view); }; diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 676f3a6c8e..96a67afaab 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -4,8 +4,8 @@ #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" -#include 
"compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" -#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" @@ -24,8 +24,8 @@ namespace FlexFlow { -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, @@ -38,15 +38,15 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( }; { - std::optional cached_result = - machine_mapping_cache_with_memory_load(result_cache, state); + std::optional cached_result = + machine_mapping_with_memory_cache_load(result_cache, state); if (cached_result) { return cached_result.value(); } } - MachineMappingResultWithMemory result = - problem_tree.visit(overload{ + MachineMappingWithMemoryResult result = + problem_tree.visit(overload{ [&](MMProblemTreeSeriesSplit const &series_split) { return get_optimal_machine_mapping_with_memory( result_cache, @@ -65,12 +65,12 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( }, }); - machine_mapping_cache_with_memory_save(result_cache, state, result); + machine_mapping_with_memory_cache_save(result_cache, state, result); return result; } -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult 
get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, @@ -105,7 +105,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( MachineMappingConstraints pre_candidate = with_additional_constraints( restrict_to_left_child(constraints), assigned_pre_machine_views); - MachineMappingResultWithMemory pre_result = + MachineMappingWithMemoryResult pre_result = get_optimal_machine_mapping_with_memory( result_cache, context, @@ -122,7 +122,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( MachineMappingConstraints post_candidate = with_additional_constraints( restrict_to_right_child(constraints), assigned_post_machine_views); - MachineMappingResultWithMemory post_result = + MachineMappingWithMemoryResult post_result = get_optimal_machine_mapping_with_memory( result_cache, context, @@ -133,8 +133,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( return post_result; }; - MachineMappingResultWithMemory result = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); AbstractedTensorSetMovement tensor_movement = series_split.tensor_set_movement; @@ -142,7 +142,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( &assigned_pre_machine_views : get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) { - MachineMappingResultWithMemory pre_result = + MachineMappingWithMemoryResult pre_result = eval_pre_boundary_mapping(assigned_pre_machine_views); for (ParallelLayerGuidObliviousMachineMapping const @@ -150,7 +150,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( get_boundary_machine_view_assignments( get_dst_layers(tensor_movement))) { - MachineMappingResultWithMemory post_result = + MachineMappingWithMemoryResult 
post_result = eval_post_boundary_mapping(assigned_post_machine_views); TensorSetMovement comm_across_split = @@ -172,8 +172,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( return result; } -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, MMProblemTreeParallelSplit const ¶llel_split, MachineSpecification const &resources, @@ -182,7 +182,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( MachineMappingProblemTree lhs = parallel_split.get_left_child(); MachineMappingProblemTree rhs = parallel_split.get_right_child(); - MachineMappingResultWithMemory series_result = [&] { + MachineMappingWithMemoryResult series_result = [&] { MMProblemTreeSeriesSplit series_split = MMProblemTreeSeriesSplit{ /*tensor_set_movement=*/empty_abstracted_tensor_set_movement(), /*left_child=*/lhs, @@ -206,13 +206,13 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( auto evaluate_resource_split = [&](std::pair const &resource_split) { - MachineMappingResultWithMemory left_result = + MachineMappingWithMemoryResult left_result = get_optimal_machine_mapping_with_memory(result_cache, context, lhs, resource_split.first, left_constraints); - MachineMappingResultWithMemory right_result = + MachineMappingWithMemoryResult right_result = get_optimal_machine_mapping_with_memory(result_cache, context, rhs, @@ -222,7 +222,7 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( return parallel_combine(left_result, right_result); }; - std::unordered_set parallel_results = + std::unordered_set parallel_results = transform(get_machine_resource_splits(resources), evaluate_resource_split); @@ -230,8 +230,8 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( 
get_mapping_with_minimal_runtime(parallel_results)); } -MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( - MachineMappingCacheWithMemory &result_cache, +MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( + MachineMappingWithMemoryCache &result_cache, MachineMappingContext const &context, UnmappedOpCostEstimateKey const &leaf, MachineSpecification const &resource, @@ -249,13 +249,13 @@ MachineMappingResultWithMemory get_optimal_machine_mapping_with_memory( auto get_mapping_result = [&](MachineView const &machine_view) { OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key(leaf, machine_view); - CostMetric cost = context.cost_estimator.estimate_cost_with_memory(mapped); + OpCostMetrics cost = context.cost_estimator.estimate_cost(mapped); - return make_singleton_machine_mapping_result_with_memory(cost, + return make_singleton_machine_mapping_with_memory_result(cost, machine_view); }; - std::unordered_set candidate_results = + std::unordered_set candidate_results = transform(candidates, get_mapping_result); return get_mapping_with_minimal_runtime(candidate_results); diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc similarity index 50% rename from lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc rename to lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc index e74612250e..617ba682be 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.cc @@ -1,27 +1,27 @@ -#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" +#include 
"compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" #include "utils/containers/contains_key.h" #include "utils/containers/try_at.h" namespace FlexFlow { -MachineMappingCacheWithMemory empty_machine_mapping_cache_with_memory() { - return MachineMappingCacheWithMemory{{}}; +MachineMappingWithMemoryCache empty_machine_mapping_with_memory_cache() { + return MachineMappingWithMemoryCache{{}}; } -std::optional - machine_mapping_cache_with_memory_load( - MachineMappingCacheWithMemory const &cache, +std::optional + machine_mapping_with_memory_cache_load( + MachineMappingWithMemoryCache const &cache, MachineMappingState const &k) { return try_at(cache.raw_map, k); } -void machine_mapping_cache_with_memory_save( - MachineMappingCacheWithMemory &cache, +void machine_mapping_with_memory_cache_save( + MachineMappingWithMemoryCache &cache, MachineMappingState const &k, - MachineMappingResultWithMemory const &v) { + MachineMappingWithMemoryResult const &v) { if (contains_key(cache.raw_map, k)) { throw mk_runtime_error(fmt::format( - "machine_mapping_cache_with_memory_save expected key to not already " + "machine_mapping_with_memory_cache_save expected key to not already " "exist, but received existing key {}", k)); } diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc similarity index 59% rename from lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc rename to lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index 1c4f8e1142..d38e4a7b6a 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -1,57 +1,57 @@ -#include 
"compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.h" #include "utils/containers/set_union.h" #include "utils/full_binary_tree/binary_tree_path.h" namespace FlexFlow { -MachineMappingResultWithMemory empty_machine_mapping_result_with_memory() { - return MachineMappingResultWithMemory{ +MachineMappingWithMemoryResult empty_machine_mapping_with_memory_result() { + return MachineMappingWithMemoryResult{ {}, }; } -MachineMappingResultWithMemory get_mapping_with_minimal_runtime( - std::unordered_set const &candidates) { - MachineMappingResultWithMemory result = - empty_machine_mapping_result_with_memory(); +MachineMappingWithMemoryResult get_mapping_with_minimal_runtime( + std::unordered_set const &candidates) { + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); - for (MachineMappingResultWithMemory const &candidate : candidates) { + for (MachineMappingWithMemoryResult const &candidate : candidates) { result = minimize_runtime(result, candidate); } return result; } -MachineMappingResultWithMemory remove_non_dominating_machine_mapping_result( - MachineMappingResultWithMemory const &result) { - std::unordered_set non_dominating_mappings; +MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( + MachineMappingWithMemoryResult const &result) { + std::unordered_set non_pareto_optimal_mappings; for (SingleMachineMapping const &mapping : result.machine_mappings) { - bool is_dominating = true; + bool is_pareto_optimal = true; for (SingleMachineMapping const &other_mapping : result.machine_mappings) { if (mapping.cost.runtime >= other_mapping.cost.runtime && mapping.cost.memory >= other_mapping.cost.memory && mapping != other_mapping) { - is_dominating = false; + is_pareto_optimal = false; break; } } - if 
(is_dominating) { - non_dominating_mappings.insert(mapping); + if (is_pareto_optimal) { + non_pareto_optimal_mappings.insert(mapping); } } - return MachineMappingResultWithMemory{std::move(non_dominating_mappings)}; + return MachineMappingWithMemoryResult{std::move(non_pareto_optimal_mappings)}; } -MachineMappingResultWithMemory +MachineMappingWithMemoryResult series_combine(float comm_cost, - MachineMappingResultWithMemory const &pre_result, - MachineMappingResultWithMemory const &post_result, + MachineMappingWithMemoryResult const &pre_result, + MachineMappingWithMemoryResult const &post_result, std::optional const ¶llel_split_transformation) { auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm, SingleMachineMapping const &post_mm) { - CostMetric cost = CostMetric{ + OpCostMetrics cost = OpCostMetrics{ pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, pre_mm.cost.memory + post_mm.cost.memory, }; @@ -71,23 +71,23 @@ MachineMappingResultWithMemory return SingleMachineMapping{cost, mapping}; }; - MachineMappingResultWithMemory result = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) { for (SingleMachineMapping const &post_mm : post_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm)); } } - return remove_non_dominating_machine_mapping_result(result); + return remove_non_pareto_optimal_machine_mapping_result(result); } -MachineMappingResultWithMemory - parallel_combine(MachineMappingResultWithMemory const &lhs_result, - MachineMappingResultWithMemory const &rhs_result) { +MachineMappingWithMemoryResult + parallel_combine(MachineMappingWithMemoryResult const &lhs_result, + MachineMappingWithMemoryResult const &rhs_result) { auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm, SingleMachineMapping const &rhs_mm) { - CostMetric 
cost = CostMetric{ + OpCostMetrics cost = OpCostMetrics{ std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), }; @@ -98,30 +98,30 @@ MachineMappingResultWithMemory return SingleMachineMapping{cost, mapping}; }; - MachineMappingResultWithMemory result = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult result = + empty_machine_mapping_with_memory_result(); for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) { for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm)); } } - return remove_non_dominating_machine_mapping_result(result); + return remove_non_pareto_optimal_machine_mapping_result(result); } -MachineMappingResultWithMemory - minimize_runtime(MachineMappingResultWithMemory const &m1, - MachineMappingResultWithMemory const &m2) { - MachineMappingResultWithMemory result = MachineMappingResultWithMemory{ +MachineMappingWithMemoryResult + minimize_runtime(MachineMappingWithMemoryResult const &m1, + MachineMappingWithMemoryResult const &m2) { + MachineMappingWithMemoryResult result = MachineMappingWithMemoryResult{ set_union(m1.machine_mappings, m2.machine_mappings), }; - return remove_non_dominating_machine_mapping_result(result); + return remove_non_pareto_optimal_machine_mapping_result(result); } -MachineMappingResultWithMemory - make_singleton_machine_mapping_result_with_memory( - CostMetric cost, MachineView const &machine_view) { - return MachineMappingResultWithMemory{{ +MachineMappingWithMemoryResult + make_singleton_machine_mapping_with_memory_result( + OpCostMetrics cost, MachineView const &machine_view) { + return MachineMappingWithMemoryResult{{ SingleMachineMapping{ cost, ParallelLayerGuidObliviousMachineMapping{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc 
b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index b55b4d283c..6ebfc45a6f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -5,16 +5,13 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( - std::function const &get_operator_cost, + std::function const &get_operator_cost, std::function const - &get_communication_cost, - std::function const - &get_operator_cost_with_memory) + &get_communication_cost) : get_operator_cost(get_operator_cost), - get_communication_cost(get_communication_cost), - get_operator_cost_with_memory(get_operator_cost_with_memory) {} + get_communication_cost(get_communication_cost) {} -float TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +OpCostMetrics TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); } @@ -22,58 +19,21 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } -CostMetric TestCostEstimator::estimate_cost_with_memory( - OpCostEstimateKey const &k) const { - return this->get_operator_cost_with_memory(k); -} - CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, + std::function const &get_operator_cost, std::function const &get_communication_cost) { - auto get_operator_cost_with_memory = [=](OpCostEstimateKey const &k) { - return CostMetric{ - get_operator_cost(k), - 0, - }; - }; - - return make_fake_cost_estimator( - get_operator_cost, get_communication_cost, get_operator_cost_with_memory); -} - -CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map) { - return make_fake_cost_estimator( - [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, - [comm_cost_map](TensorSetMovement const &m) { - return comm_cost_map.at(m); - }); 
-} - -CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, - std::function const - &get_communication_cost, - std::function const - &get_operator_cost_with_memory) { return CostEstimator::create( - get_operator_cost, get_communication_cost, get_operator_cost_with_memory); + get_operator_cost, get_communication_cost); } CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map, - std::unordered_map const - &op_cost_with_memory_map) { + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, [comm_cost_map](TensorSetMovement const &m) { return comm_cost_map.at(m); - }, - [op_cost_with_memory_map](OpCostEstimateKey const &k) { - return op_cost_with_memory_map.at(k); }); } diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 302421f873..7fb4bcc6f8 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -11,48 +11,28 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { - std::function get_operator_cost; + std::function get_operator_cost; std::function get_communication_cost; - std::function - get_operator_cost_with_memory; TestCostEstimator() = delete; TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, decltype(get_communication_cost) - const &get_communication_cost, - decltype(get_operator_cost_with_memory) - const &get_operator_cost_with_memory); + const &get_communication_cost); - float estimate_cost(OpCostEstimateKey const &) const override; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; float estimate_cost(TensorSetMovement const &) const override; - - 
CostMetric - estimate_cost_with_memory(OpCostEstimateKey const &) const override; }; CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, + std::function const &get_operator_cost, std::function const &get_communication_cost); CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, + std::unordered_map const &op_cost_map, std::unordered_map const &comm_cost_map); -CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, - std::function const - &get_communication_cost, - std::function const - &get_operator_cost_with_memory); - -CostEstimator make_fake_cost_estimator( - std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map, - std::unordered_map const - &op_cost_with_memory_map); - } // namespace FlexFlow #endif diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index a0d06fe930..14a8b2e014 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -144,13 +144,19 @@ TEST_SUITE(FF_TEST_SUITE) { {binary_tree_root_path(), mv2}, }}; + printf("Before constructing cost_estimator\n"); + + auto map1 = std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, + {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, + {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, + {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, + }}; + + printf("After constructing map1\n"); + CostEstimator cost_estimator = make_fake_cost_estimator( - std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, - {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, - {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, - {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, - 
}}, + map1, std::unordered_map{{ {TensorSetMovement{{}}, 0.0}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), @@ -163,11 +169,34 @@ TEST_SUITE(FF_TEST_SUITE) { 0.4}, }}); + // CostEstimator cost_estimator = make_fake_cost_estimator( + // std::unordered_map{{ + // {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, + // {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, + // {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, + // {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, + // }}, + // std::unordered_map{{ + // {TensorSetMovement{{}}, 0.0}, + // {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), + // 0.1}, + // {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), + // 0.2}, + // {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), + // 0.3}, + // {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), + // 0.4}, + // }}); + + printf("After constructing cost_estimator\n"); + MachineMappingContext context = MachineMappingContext{ cost_estimator, allowed_machine_views1, }; + printf("After constructing context\n"); + MachineMappingCache cache = empty_machine_mapping_cache(); SUBCASE("single layer") { diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 566af800ea..440ebde343 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -4,7 +4,7 @@ #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include 
"compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" -#include "compiler/machine_mapping/memory_optimization/machine_mapping_cache_with_memory.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "utils/containers/get_only.h" @@ -145,11 +145,11 @@ TEST_SUITE(FF_TEST_SUITE) { }}; CostEstimator cost_estimator = make_fake_cost_estimator( - std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), 1.0}, - {map_unmapped_op_cost_estimate_key(k2, mv1), 2.0}, - {map_unmapped_op_cost_estimate_key(k1, mv2), 1.5}, - {map_unmapped_op_cost_estimate_key(k2, mv2), 2.5}, + std::unordered_map{{ + {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics{1.0, 2}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics{2.0, 3}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics{1.5, 1}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics{2.5, 2}}, }}, std::unordered_map{{ {TensorSetMovement{{}}, 0.0}, @@ -161,12 +161,6 @@ TEST_SUITE(FF_TEST_SUITE) { 0.3}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), 0.4}, - }}, - std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), CostMetric{1.0, 2}}, - {map_unmapped_op_cost_estimate_key(k2, mv1), CostMetric{2.0, 3}}, - {map_unmapped_op_cost_estimate_key(k1, mv2), CostMetric{1.5, 1}}, - {map_unmapped_op_cost_estimate_key(k2, mv2), CostMetric{2.5, 2}}, }}); MachineMappingContext context = MachineMappingContext{ @@ -174,8 +168,8 @@ TEST_SUITE(FF_TEST_SUITE) { allowed_machine_views1, }; - MachineMappingCacheWithMemory cache = - empty_machine_mapping_cache_with_memory(); + MachineMappingWithMemoryCache cache = + empty_machine_mapping_with_memory_cache(); SUBCASE("single layer") { MachineMappingProblemTree problem_tree = make_leaf(k1); @@ -184,18 +178,18 @@ TEST_SUITE(FF_TEST_SUITE) { 
get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResultWithMemory result = + MachineMappingWithMemoryResult result = get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ SingleMachineMapping{ - CostMetric{1.0, 2}, + OpCostMetrics{1.0, 2}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, }}, }, SingleMachineMapping{ - CostMetric{1.5, 1}, + OpCostMetrics{1.5, 1}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv2}, }}, @@ -213,12 +207,12 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResultWithMemory result = + MachineMappingWithMemoryResult result = get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ SingleMachineMapping{ - CostMetric{1.0 + 2.0 + 0.1, 2 + 3}, + OpCostMetrics{1.0 + 2.0 + 0.1, 2 + 3}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ @@ -235,7 +229,7 @@ TEST_SUITE(FF_TEST_SUITE) { }}, }, SingleMachineMapping{ - CostMetric{1.5 + 2.5 + 0.1, 1 + 2}, + OpCostMetrics{1.5 + 2.5 + 0.1, 1 + 2}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ @@ -264,12 +258,12 @@ TEST_SUITE(FF_TEST_SUITE) { get_unconstrained_solution_for_layers( get_all_leaf_paths(problem_tree)); - MachineMappingResultWithMemory result = + MachineMappingWithMemoryResult result = get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); - MachineMappingResultWithMemory correct = - MachineMappingResultWithMemory{{SingleMachineMapping{ - CostMetric{2.5, 2}, + 
MachineMappingWithMemoryResult correct = + MachineMappingWithMemoryResult{{SingleMachineMapping{ + OpCostMetrics{2.5, 2}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 6ca551c436..bdd58f8717 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -1,11 +1,11 @@ -#include "compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" #include "pcg/machine_view.h" #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("remove_non_dominating_machine_mapping_result") { + TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ /*node_idx=*/0, @@ -51,15 +51,15 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CostMetric cost1 = CostMetric{ + OpCostMetrics cost1 = OpCostMetrics{ 2.0, 2, }; - CostMetric cost2 = CostMetric{ + OpCostMetrics cost2 = OpCostMetrics{ 4.0, 1, }; - CostMetric cost3 = CostMetric{ + OpCostMetrics cost3 = OpCostMetrics{ 2.0, 3, }; @@ -101,41 +101,41 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("empty") { - MachineMappingResultWithMemory to_remove = - empty_machine_mapping_result_with_memory(); - MachineMappingResultWithMemory result = - remove_non_dominating_machine_mapping_result(to_remove); - MachineMappingResultWithMemory correct = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult to_remove = + empty_machine_mapping_with_memory_result(); + MachineMappingWithMemoryResult result = + 
remove_non_pareto_optimal_machine_mapping_result(to_remove); + MachineMappingWithMemoryResult correct = + empty_machine_mapping_with_memory_result(); CHECK(result == correct); } - SUBCASE("no non-dominating") { - MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{ + SUBCASE("no non-pareto_optimal") { + MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{ { mm1, mm2, }, }; - MachineMappingResultWithMemory result = - remove_non_dominating_machine_mapping_result(to_remove); - MachineMappingResultWithMemory correct = to_remove; + MachineMappingWithMemoryResult result = + remove_non_pareto_optimal_machine_mapping_result(to_remove); + MachineMappingWithMemoryResult correct = to_remove; CHECK(result == correct); } - SUBCASE("non-dominating") { - MachineMappingResultWithMemory to_remove = MachineMappingResultWithMemory{ + SUBCASE("non-pareto_optimal") { + MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{ { mm1, mm2, mm3, }, }; - MachineMappingResultWithMemory result = - remove_non_dominating_machine_mapping_result(to_remove); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult result = + remove_non_pareto_optimal_machine_mapping_result(to_remove); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ { mm1, mm2, @@ -177,11 +177,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CostMetric pre_cost = CostMetric{ + OpCostMetrics pre_cost = OpCostMetrics{ 2.0, 2, }; - MachineMappingResultWithMemory pre = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ SingleMachineMapping{ pre_cost, ParallelLayerGuidObliviousMachineMapping{ @@ -203,12 +203,12 @@ TEST_SUITE(FF_TEST_SUITE) { }, }}; - CostMetric post_cost = CostMetric{ + OpCostMetrics post_cost = OpCostMetrics{ 4.0, 1, }; - MachineMappingResultWithMemory post = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult post = 
MachineMappingWithMemoryResult{{ SingleMachineMapping{ post_cost, ParallelLayerGuidObliviousMachineMapping{ @@ -222,33 +222,33 @@ TEST_SUITE(FF_TEST_SUITE) { }, }}; - MachineMappingResultWithMemory empty = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult empty = + empty_machine_mapping_with_memory_result(); float comm_cost = 3.0; SUBCASE("pre is empty") { - MachineMappingResultWithMemory result = series_combine( + MachineMappingWithMemoryResult result = series_combine( comm_cost, empty, post, ParallelSplitTransformation::LthenR); - MachineMappingResultWithMemory correct = empty; + MachineMappingWithMemoryResult correct = empty; CHECK(result == correct); } SUBCASE("post is empty") { - MachineMappingResultWithMemory result = series_combine( + MachineMappingWithMemoryResult result = series_combine( comm_cost, pre, empty, ParallelSplitTransformation::LthenR); - MachineMappingResultWithMemory correct = empty; + MachineMappingWithMemoryResult correct = empty; CHECK(result == correct); } SUBCASE("both are nonempty") { - MachineMappingResultWithMemory no_parallel_split_transform = - MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult no_parallel_split_transform = + MachineMappingWithMemoryResult{ { SingleMachineMapping{ - /*cost=*/CostMetric{ + /*cost=*/OpCostMetrics{ pre_cost.runtime + comm_cost + post_cost.runtime, pre_cost.memory + post_cost.memory, }, @@ -280,28 +280,28 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("parallel_split_transformation = std::nullopt") { - MachineMappingResultWithMemory result = + MachineMappingWithMemoryResult result = series_combine(comm_cost, pre, post, std::nullopt); - MachineMappingResultWithMemory correct = no_parallel_split_transform; + MachineMappingWithMemoryResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = LthenR") { - MachineMappingResultWithMemory result = series_combine( + MachineMappingWithMemoryResult result = series_combine( 
comm_cost, pre, post, ParallelSplitTransformation::LthenR); - MachineMappingResultWithMemory correct = no_parallel_split_transform; + MachineMappingWithMemoryResult correct = no_parallel_split_transform; CHECK(result == correct); } SUBCASE("parallel_split_transformation = RthenL") { - MachineMappingResultWithMemory result = series_combine( + MachineMappingWithMemoryResult result = series_combine( comm_cost, pre, post, ParallelSplitTransformation::RthenL); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ { SingleMachineMapping{ - /*cost=*/CostMetric{ + /*cost=*/OpCostMetrics{ pre_cost.runtime + comm_cost + post_cost.runtime, pre_cost.memory + post_cost.memory, }, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CostMetric lhs_cost = CostMetric{ + OpCostMetrics lhs_cost = OpCostMetrics{ 2.0, 2, }; - MachineMappingResultWithMemory lhs = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ SingleMachineMapping{ lhs_cost, ParallelLayerGuidObliviousMachineMapping{ @@ -394,11 +394,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }}; - CostMetric rhs_cost = CostMetric{ + OpCostMetrics rhs_cost = OpCostMetrics{ 4.0, 1, }; - MachineMappingResultWithMemory rhs = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ SingleMachineMapping{ rhs_cost, ParallelLayerGuidObliviousMachineMapping{ @@ -412,28 +412,28 @@ TEST_SUITE(FF_TEST_SUITE) { }, }}; - MachineMappingResultWithMemory empty = - empty_machine_mapping_result_with_memory(); + MachineMappingWithMemoryResult empty = + empty_machine_mapping_with_memory_result(); SUBCASE("lhs is empty") { - MachineMappingResultWithMemory result = parallel_combine(empty, rhs); - MachineMappingResultWithMemory correct = empty; + MachineMappingWithMemoryResult result = parallel_combine(empty, rhs); + MachineMappingWithMemoryResult correct = empty; 
CHECK(result == correct); } SUBCASE("rhs is empty") { - MachineMappingResultWithMemory result = parallel_combine(lhs, empty); - MachineMappingResultWithMemory correct = empty; + MachineMappingWithMemoryResult result = parallel_combine(lhs, empty); + MachineMappingWithMemoryResult correct = empty; CHECK(result == correct); } SUBCASE("both are nonempty") { - MachineMappingResultWithMemory result = parallel_combine(lhs, rhs); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{{ + MachineMappingWithMemoryResult result = parallel_combine(lhs, rhs); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ SingleMachineMapping{ - /*cost=*/CostMetric{ + /*cost=*/OpCostMetrics{ std::max(lhs_cost.runtime, rhs_cost.runtime), std::max(lhs_cost.memory, rhs_cost.memory), }, @@ -509,15 +509,15 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CostMetric cost1 = CostMetric{ + OpCostMetrics cost1 = OpCostMetrics{ 2.0, 2, }; - CostMetric cost2 = CostMetric{ + OpCostMetrics cost2 = OpCostMetrics{ 4.0, 1, }; - CostMetric cost3 = CostMetric{ + OpCostMetrics cost3 = OpCostMetrics{ 2.0, 3, }; @@ -558,22 +558,22 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - MachineMappingResultWithMemory result1 = MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult result1 = MachineMappingWithMemoryResult{ { mm1, mm2, }, }; - MachineMappingResultWithMemory result2 = MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult result2 = MachineMappingWithMemoryResult{ { mm2, mm3, }, }; - MachineMappingResultWithMemory result = minimize_runtime(result1, result2); - MachineMappingResultWithMemory correct = MachineMappingResultWithMemory{ + MachineMappingWithMemoryResult result = minimize_runtime(result1, result2); + MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ { mm1, mm2, From 03151607792794f430b730a5c6605c4388f0a204 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 18 Dec 2024 10:13:54 -0500 Subject: [PATCH 08/16] fmt --- 
.../compiler/cost_estimator/cost_estimator.h | 2 +- ...get_optimal_machine_mapping_with_memory.cc | 2 +- .../cost_estimator_for_test.cc | 13 +++++++----- .../machine_mapping/cost_estimator_for_test.h | 3 ++- .../get_optimal_machine_mapping.cc | 20 ++++++++++--------- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index 9b006f178a..ecaffa337b 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_COST_ESTIMATOR_H -#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/cost_estimator/tensor_set_movement.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "op-attrs/pcg_operator_attrs.dtg.h" diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 96a67afaab..b67083e8cd 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -4,8 +4,8 @@ #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" -#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" #include 
"compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc index 6ebfc45a6f..0431104878 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc @@ -5,13 +5,15 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( - std::function const &get_operator_cost, + std::function const + &get_operator_cost, std::function const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} -OpCostMetrics TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { +OpCostMetrics + TestCostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->get_operator_cost(k); } @@ -20,11 +22,12 @@ float TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { } CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, + std::function const + &get_operator_cost, std::function const &get_communication_cost) { - return CostEstimator::create( - get_operator_cost, get_communication_cost); + return CostEstimator::create(get_operator_cost, + get_communication_cost); } CostEstimator make_fake_cost_estimator( diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h index 7fb4bcc6f8..16ea3a85bc 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h +++ 
b/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h @@ -25,7 +25,8 @@ struct TestCostEstimator : public ICostEstimator { }; CostEstimator make_fake_cost_estimator( - std::function const &get_operator_cost, + std::function const + &get_operator_cost, std::function const &get_communication_cost); diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 14a8b2e014..81665fbb94 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -147,11 +147,11 @@ TEST_SUITE(FF_TEST_SUITE) { printf("Before constructing cost_estimator\n"); auto map1 = std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, - {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, - {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, - {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, - }}; + {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, + {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, + {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, + {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, + }}; printf("After constructing map1\n"); @@ -171,10 +171,12 @@ TEST_SUITE(FF_TEST_SUITE) { // CostEstimator cost_estimator = make_fake_cost_estimator( // std::unordered_map{{ - // {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, - // {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, - // {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, - // {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, + // {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, + // 0)}, {map_unmapped_op_cost_estimate_key(k2, mv1), + 
// OpCostMetrics(2.0, 0)}, {map_unmapped_op_cost_estimate_key(k1, + // mv2), OpCostMetrics(1.5, 0)}, + // {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, + // 0)}, // }}, // std::unordered_map{{ // {TensorSetMovement{{}}, 0.0}, From 855a7d53716ae5f7b3eb4fb51508c58530d2a4a4 Mon Sep 17 00:00:00 2001 From: wmdi Date: Mon, 30 Dec 2024 01:06:56 -0500 Subject: [PATCH 09/16] fix --- .../get_optimal_machine_mapping.cc | 41 ++----- ...get_optimal_machine_mapping_with_memory.cc | 5 +- .../machine_mapping_result_with_memory.cc | 104 ++++++++++-------- 3 files changed, 68 insertions(+), 82 deletions(-) diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 81665fbb94..f5d5a5ee1b 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -144,17 +144,17 @@ TEST_SUITE(FF_TEST_SUITE) { {binary_tree_root_path(), mv2}, }}; - printf("Before constructing cost_estimator\n"); - auto map1 = std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, 0)}, - {map_unmapped_op_cost_estimate_key(k2, mv1), OpCostMetrics(2.0, 0)}, - {map_unmapped_op_cost_estimate_key(k1, mv2), OpCostMetrics(1.5, 0)}, - {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, 0)}, + {map_unmapped_op_cost_estimate_key(k1, mv1), + OpCostMetrics{/*runtime=*/1.0, /*memory=*/0}}, + {map_unmapped_op_cost_estimate_key(k2, mv1), + OpCostMetrics{/*runtime=*/2.0, /*memory=*/0}}, + {map_unmapped_op_cost_estimate_key(k1, mv2), + OpCostMetrics{/*runtime=*/1.5, /*memory=*/0}}, + {map_unmapped_op_cost_estimate_key(k2, mv2), + OpCostMetrics{/*runtime=*/2.5, /*memory=*/0}}, }}; - printf("After constructing map1\n"); - CostEstimator cost_estimator = make_fake_cost_estimator( map1, std::unordered_map{{ @@ -169,36 +169,11 @@ 
TEST_SUITE(FF_TEST_SUITE) { 0.4}, }}); - // CostEstimator cost_estimator = make_fake_cost_estimator( - // std::unordered_map{{ - // {map_unmapped_op_cost_estimate_key(k1, mv1), OpCostMetrics(1.0, - // 0)}, {map_unmapped_op_cost_estimate_key(k2, mv1), - // OpCostMetrics(2.0, 0)}, {map_unmapped_op_cost_estimate_key(k1, - // mv2), OpCostMetrics(1.5, 0)}, - // {map_unmapped_op_cost_estimate_key(k2, mv2), OpCostMetrics(2.5, - // 0)}, - // }}, - // std::unordered_map{{ - // {TensorSetMovement{{}}, 0.0}, - // {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - // 0.1}, - // {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - // 0.2}, - // {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - // 0.3}, - // {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - // 0.4}, - // }}); - - printf("After constructing cost_estimator\n"); - MachineMappingContext context = MachineMappingContext{ cost_estimator, allowed_machine_views1, }; - printf("After constructing context\n"); - MachineMappingCache cache = empty_machine_mapping_cache(); SUBCASE("single layer") { diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 440ebde343..063f6a9826 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -212,7 +212,10 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ SingleMachineMapping{ - OpCostMetrics{1.0 + 2.0 + 0.1, 2 + 3}, + OpCostMetrics{ + /*runtime=*/1.0 + 2.0 + 0.1, + /*memory=*/2 + 3, + }, ParallelLayerGuidObliviousMachineMapping{{ { 
BinaryTreePath{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index bdd58f8717..3a28576193 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -52,16 +52,16 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - 2.0, - 2, + /*runtime=*/2.0, + /*memory=*/2, }; OpCostMetrics cost2 = OpCostMetrics{ - 4.0, - 1, + /*runtime=*/4.0, + /*memory=*/1, }; OpCostMetrics cost3 = OpCostMetrics{ - 2.0, - 3, + /*runtime=*/2.0, + /*memory=*/3, }; SingleMachineMapping mm1 = SingleMachineMapping{ @@ -101,40 +101,42 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("empty") { - MachineMappingWithMemoryResult to_remove = + MachineMappingWithMemoryResult before_remove = empty_machine_mapping_with_memory_result(); MachineMappingWithMemoryResult result = - remove_non_pareto_optimal_machine_mapping_result(to_remove); + remove_non_pareto_optimal_machine_mapping_result(before_remove); MachineMappingWithMemoryResult correct = empty_machine_mapping_with_memory_result(); CHECK(result == correct); } - SUBCASE("no non-pareto_optimal") { - MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{ - { - mm1, - mm2, - }, - }; + SUBCASE("all solutions are pareto-optimal") { + MachineMappingWithMemoryResult before_remove = + MachineMappingWithMemoryResult{ + { + mm1, + mm2, + }, + }; MachineMappingWithMemoryResult result = - remove_non_pareto_optimal_machine_mapping_result(to_remove); - MachineMappingWithMemoryResult correct = to_remove; + remove_non_pareto_optimal_machine_mapping_result(before_remove); + MachineMappingWithMemoryResult correct = before_remove; CHECK(result == correct); } - SUBCASE("non-pareto_optimal") { 
- MachineMappingWithMemoryResult to_remove = MachineMappingWithMemoryResult{ - { - mm1, - mm2, - mm3, - }, - }; + SUBCASE("there exists a non-pareto-optimal solution") { + MachineMappingWithMemoryResult before_remove = + MachineMappingWithMemoryResult{ + { + mm1, + mm2, + mm3, + }, + }; MachineMappingWithMemoryResult result = - remove_non_pareto_optimal_machine_mapping_result(to_remove); + remove_non_pareto_optimal_machine_mapping_result(before_remove); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ { mm1, @@ -146,7 +148,9 @@ TEST_SUITE(FF_TEST_SUITE) { } } - TEST_CASE("series_combine(memory)") { + TEST_CASE("series_combine(float, MachineMappingWithMemoryResult const &, " + "MachineMappingWithMemoryResult const &, " + "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ /*node_idx=*/0, @@ -178,8 +182,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics pre_cost = OpCostMetrics{ - 2.0, - 2, + /*runtime=*/2.0, + /*memory=*/2, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ SingleMachineMapping{ @@ -204,8 +208,8 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics post_cost = OpCostMetrics{ - 4.0, - 1, + /*runtime=*/4.0, + /*memory=*/1, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -249,8 +253,9 @@ TEST_SUITE(FF_TEST_SUITE) { { SingleMachineMapping{ /*cost=*/OpCostMetrics{ - pre_cost.runtime + comm_cost + post_cost.runtime, - pre_cost.memory + post_cost.memory, + /*runtime=*/pre_cost.runtime + comm_cost + + post_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory, }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -302,8 +307,9 @@ TEST_SUITE(FF_TEST_SUITE) { { SingleMachineMapping{ /*cost=*/OpCostMetrics{ - pre_cost.runtime + comm_cost + post_cost.runtime, - pre_cost.memory + post_cost.memory, + /*runtime=*/pre_cost.runtime + comm_cost + + post_cost.runtime, + /*memory=*/pre_cost.memory + post_cost.memory, }, 
/*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -337,7 +343,9 @@ TEST_SUITE(FF_TEST_SUITE) { } } - TEST_CASE("parallel_combine(memory)") { + TEST_CASE("parallel_combine(float, MachineMappingWithMemoryResult const &, " + "MachineMappingWithMemoryResult const &, " + "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ /*node_idx=*/0, @@ -369,8 +377,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics lhs_cost = OpCostMetrics{ - 2.0, - 2, + /*runtime=*/2.0, + /*memory=*/2, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ SingleMachineMapping{ @@ -395,8 +403,8 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics rhs_cost = OpCostMetrics{ - 4.0, - 1, + /*runtime=*/4.0, + /*memory=*/1, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ SingleMachineMapping{ @@ -434,8 +442,8 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ SingleMachineMapping{ /*cost=*/OpCostMetrics{ - std::max(lhs_cost.runtime, rhs_cost.runtime), - std::max(lhs_cost.memory, rhs_cost.memory), + /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime), + /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{ @@ -510,16 +518,16 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - 2.0, - 2, + /*runtime=*/2.0, + /*memory=*/2, }; OpCostMetrics cost2 = OpCostMetrics{ - 4.0, - 1, + /*runtime=*/4.0, + /*memory=*/1, }; OpCostMetrics cost3 = OpCostMetrics{ - 2.0, - 3, + /*runtime=*/2.0, + /*memory=*/3, }; SingleMachineMapping mm1 = SingleMachineMapping{ From 2b4e127e92e99b4f9a89e18056474129aeea69cd Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 8 Jan 2025 20:39:30 -0500 Subject: [PATCH 10/16] rename single machine mapping --- ...hine_mapping_for_single_layer.struct.toml} | 2 +- ...ine_mapping_with_memory_result.struct.toml | 4 +-- 
.../machine_mapping_with_memory_result.cc | 28 +++++++++---------- ...get_optimal_machine_mapping_with_memory.cc | 10 +++---- .../machine_mapping_result_with_memory.cc | 26 ++++++++--------- 5 files changed, 35 insertions(+), 35 deletions(-) rename lib/compiler/include/compiler/machine_mapping/memory_optimization/{single_machine_mapping.struct.toml => machine_mapping_for_single_layer.struct.toml} (90%) diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml similarity index 90% rename from lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml rename to lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml index f33e320e3b..b61dd134c0 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/single_machine_mapping.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "SingleMachineMapping" +name = "MachineMappingForSingleLayer" features = [ "eq", "hash", diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml index 50de145b36..c1e1ee1cac 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "compiler/machine_mapping/memory_optimization/single_machine_mapping.dtg.h", + 
"compiler/machine_mapping/memory_optimization/machine_mapping_for_single_layer.dtg.h", ] src_includes = [ @@ -17,4 +17,4 @@ src_includes = [ [[fields]] name = "machine_mappings" -type = "std::unordered_set<::FlexFlow::SingleMachineMapping>" +type = "std::unordered_set<::FlexFlow::MachineMappingForSingleLayer>" diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index d38e4a7b6a..2f443e4fc5 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -25,10 +25,10 @@ MachineMappingWithMemoryResult get_mapping_with_minimal_runtime( MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( MachineMappingWithMemoryResult const &result) { - std::unordered_set non_pareto_optimal_mappings; - for (SingleMachineMapping const &mapping : result.machine_mappings) { + std::unordered_set non_pareto_optimal_mappings; + for (MachineMappingForSingleLayer const &mapping : result.machine_mappings) { bool is_pareto_optimal = true; - for (SingleMachineMapping const &other_mapping : result.machine_mappings) { + for (MachineMappingForSingleLayer const &other_mapping : result.machine_mappings) { if (mapping.cost.runtime >= other_mapping.cost.runtime && mapping.cost.memory >= other_mapping.cost.memory && mapping != other_mapping) { @@ -49,8 +49,8 @@ MachineMappingWithMemoryResult MachineMappingWithMemoryResult const &post_result, std::optional const ¶llel_split_transformation) { - auto combine_machine_mapping = [&](SingleMachineMapping const &pre_mm, - SingleMachineMapping const &post_mm) { + auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &pre_mm, + MachineMappingForSingleLayer const &post_mm) { OpCostMetrics cost = 
OpCostMetrics{ pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, pre_mm.cost.memory + post_mm.cost.memory, @@ -68,13 +68,13 @@ MachineMappingWithMemoryResult } }(); - return SingleMachineMapping{cost, mapping}; + return MachineMappingForSingleLayer{cost, mapping}; }; MachineMappingWithMemoryResult result = empty_machine_mapping_with_memory_result(); - for (SingleMachineMapping const &pre_mm : pre_result.machine_mappings) { - for (SingleMachineMapping const &post_mm : post_result.machine_mappings) { + for (MachineMappingForSingleLayer const &pre_mm : pre_result.machine_mappings) { + for (MachineMappingForSingleLayer const &post_mm : post_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm)); } } @@ -85,8 +85,8 @@ MachineMappingWithMemoryResult MachineMappingWithMemoryResult parallel_combine(MachineMappingWithMemoryResult const &lhs_result, MachineMappingWithMemoryResult const &rhs_result) { - auto combine_machine_mapping = [&](SingleMachineMapping const &lhs_mm, - SingleMachineMapping const &rhs_mm) { + auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &lhs_mm, + MachineMappingForSingleLayer const &rhs_mm) { OpCostMetrics cost = OpCostMetrics{ std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), @@ -95,13 +95,13 @@ MachineMappingWithMemoryResult ParallelLayerGuidObliviousMachineMapping mapping = binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping); - return SingleMachineMapping{cost, mapping}; + return MachineMappingForSingleLayer{cost, mapping}; }; MachineMappingWithMemoryResult result = empty_machine_mapping_with_memory_result(); - for (SingleMachineMapping const &lhs_mm : lhs_result.machine_mappings) { - for (SingleMachineMapping const &rhs_mm : rhs_result.machine_mappings) { + for (MachineMappingForSingleLayer const &lhs_mm : lhs_result.machine_mappings) { + for (MachineMappingForSingleLayer const &rhs_mm : 
rhs_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm)); } } @@ -122,7 +122,7 @@ MachineMappingWithMemoryResult make_singleton_machine_mapping_with_memory_result( OpCostMetrics cost, MachineView const &machine_view) { return MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ cost, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), machine_view}, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 063f6a9826..8761116be2 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -182,13 +182,13 @@ TEST_SUITE(FF_TEST_SUITE) { get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ OpCostMetrics{1.0, 2}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, }}, }, - SingleMachineMapping{ + MachineMappingForSingleLayer{ OpCostMetrics{1.5, 1}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv2}, @@ -211,7 +211,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ OpCostMetrics{ /*runtime=*/1.0 + 2.0 + 0.1, /*memory=*/2 + 3, @@ -231,7 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }}, }, - SingleMachineMapping{ + MachineMappingForSingleLayer{ OpCostMetrics{1.5 + 2.5 + 0.1, 1 + 2}, 
ParallelLayerGuidObliviousMachineMapping{{ { @@ -265,7 +265,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_optimal_machine_mapping_with_memory( cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = - MachineMappingWithMemoryResult{{SingleMachineMapping{ + MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{ OpCostMetrics{2.5, 2}, ParallelLayerGuidObliviousMachineMapping{{ { diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 3a28576193..a47d8713e9 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -64,7 +64,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*memory=*/3, }; - SingleMachineMapping mm1 = SingleMachineMapping{ + MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ cost1, ParallelLayerGuidObliviousMachineMapping{ { @@ -76,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - SingleMachineMapping mm2 = SingleMachineMapping{ + MachineMappingForSingleLayer mm2 = MachineMappingForSingleLayer{ cost2, ParallelLayerGuidObliviousMachineMapping{ { @@ -88,7 +88,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - SingleMachineMapping mm3 = SingleMachineMapping{ + MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{ cost3, ParallelLayerGuidObliviousMachineMapping{ { @@ -186,7 +186,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*memory=*/2, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ pre_cost, ParallelLayerGuidObliviousMachineMapping{ { @@ -213,7 +213,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + 
MachineMappingForSingleLayer{ post_cost, ParallelLayerGuidObliviousMachineMapping{ { @@ -251,7 +251,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult no_parallel_split_transform = MachineMappingWithMemoryResult{ { - SingleMachineMapping{ + MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ /*runtime=*/pre_cost.runtime + comm_cost + post_cost.runtime, @@ -305,7 +305,7 @@ TEST_SUITE(FF_TEST_SUITE) { comm_cost, pre, post, ParallelSplitTransformation::RthenL); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{ { - SingleMachineMapping{ + MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ /*runtime=*/pre_cost.runtime + comm_cost + post_cost.runtime, @@ -381,7 +381,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*memory=*/2, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ lhs_cost, ParallelLayerGuidObliviousMachineMapping{ { @@ -407,7 +407,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*memory=*/1, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ rhs_cost, ParallelLayerGuidObliviousMachineMapping{ { @@ -440,7 +440,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("both are nonempty") { MachineMappingWithMemoryResult result = parallel_combine(lhs, rhs); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ - SingleMachineMapping{ + MachineMappingForSingleLayer{ /*cost=*/OpCostMetrics{ /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime), /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), @@ -530,7 +530,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*memory=*/3, }; - SingleMachineMapping mm1 = SingleMachineMapping{ + MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ cost1, ParallelLayerGuidObliviousMachineMapping{ { @@ -542,7 +542,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - SingleMachineMapping mm2 = SingleMachineMapping{ + MachineMappingForSingleLayer mm2 = 
MachineMappingForSingleLayer{ cost2, ParallelLayerGuidObliviousMachineMapping{ { @@ -554,7 +554,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - SingleMachineMapping mm3 = SingleMachineMapping{ + MachineMappingForSingleLayer mm3 = MachineMappingForSingleLayer{ cost3, ParallelLayerGuidObliviousMachineMapping{ { From 50bae937f6acd0e77cb77733bfbd23da167747db Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 8 Jan 2025 21:08:47 -0500 Subject: [PATCH 11/16] format --- .../machine_mapping_with_memory_result.cc | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index 2f443e4fc5..a6c2d1ed04 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -28,7 +28,8 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( std::unordered_set non_pareto_optimal_mappings; for (MachineMappingForSingleLayer const &mapping : result.machine_mappings) { bool is_pareto_optimal = true; - for (MachineMappingForSingleLayer const &other_mapping : result.machine_mappings) { + for (MachineMappingForSingleLayer const &other_mapping : + result.machine_mappings) { if (mapping.cost.runtime >= other_mapping.cost.runtime && mapping.cost.memory >= other_mapping.cost.memory && mapping != other_mapping) { @@ -49,32 +50,35 @@ MachineMappingWithMemoryResult MachineMappingWithMemoryResult const &post_result, std::optional const ¶llel_split_transformation) { - auto combine_machine_mapping = [&](MachineMappingForSingleLayer const &pre_mm, - MachineMappingForSingleLayer const &post_mm) { - OpCostMetrics cost = OpCostMetrics{ - pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, - 
pre_mm.cost.memory + post_mm.cost.memory, - }; - - ParallelLayerGuidObliviousMachineMapping mapping = [&] { - if (parallel_split_transformation.has_value() && - parallel_split_transformation.value() == - ParallelSplitTransformation::RthenL) { - return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping, - /*rhs=*/pre_mm.machine_mapping); - } else { - return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping, - /*rhs=*/post_mm.machine_mapping); - } - }(); - - return MachineMappingForSingleLayer{cost, mapping}; - }; + auto combine_machine_mapping = + [&](MachineMappingForSingleLayer const &pre_mm, + MachineMappingForSingleLayer const &post_mm) { + OpCostMetrics cost = OpCostMetrics{ + pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime, + pre_mm.cost.memory + post_mm.cost.memory, + }; + + ParallelLayerGuidObliviousMachineMapping mapping = [&] { + if (parallel_split_transformation.has_value() && + parallel_split_transformation.value() == + ParallelSplitTransformation::RthenL) { + return binary_combine_mappings(/*lhs=*/post_mm.machine_mapping, + /*rhs=*/pre_mm.machine_mapping); + } else { + return binary_combine_mappings(/*lhs=*/pre_mm.machine_mapping, + /*rhs=*/post_mm.machine_mapping); + } + }(); + + return MachineMappingForSingleLayer{cost, mapping}; + }; MachineMappingWithMemoryResult result = empty_machine_mapping_with_memory_result(); - for (MachineMappingForSingleLayer const &pre_mm : pre_result.machine_mappings) { - for (MachineMappingForSingleLayer const &post_mm : post_result.machine_mappings) { + for (MachineMappingForSingleLayer const &pre_mm : + pre_result.machine_mappings) { + for (MachineMappingForSingleLayer const &post_mm : + post_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(pre_mm, post_mm)); } } @@ -85,23 +89,27 @@ MachineMappingWithMemoryResult MachineMappingWithMemoryResult parallel_combine(MachineMappingWithMemoryResult const &lhs_result, MachineMappingWithMemoryResult const &rhs_result) { - auto 
combine_machine_mapping = [&](MachineMappingForSingleLayer const &lhs_mm, - MachineMappingForSingleLayer const &rhs_mm) { - OpCostMetrics cost = OpCostMetrics{ - std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), - std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), - }; + auto combine_machine_mapping = + [&](MachineMappingForSingleLayer const &lhs_mm, + MachineMappingForSingleLayer const &rhs_mm) { + OpCostMetrics cost = OpCostMetrics{ + std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime), + std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), + }; - ParallelLayerGuidObliviousMachineMapping mapping = - binary_combine_mappings(lhs_mm.machine_mapping, rhs_mm.machine_mapping); + ParallelLayerGuidObliviousMachineMapping mapping = + binary_combine_mappings(lhs_mm.machine_mapping, + rhs_mm.machine_mapping); - return MachineMappingForSingleLayer{cost, mapping}; - }; + return MachineMappingForSingleLayer{cost, mapping}; + }; MachineMappingWithMemoryResult result = empty_machine_mapping_with_memory_result(); - for (MachineMappingForSingleLayer const &lhs_mm : lhs_result.machine_mappings) { - for (MachineMappingForSingleLayer const &rhs_mm : rhs_result.machine_mappings) { + for (MachineMappingForSingleLayer const &lhs_mm : + lhs_result.machine_mappings) { + for (MachineMappingForSingleLayer const &rhs_mm : + rhs_result.machine_mappings) { result.machine_mappings.insert(combine_machine_mapping(lhs_mm, rhs_mm)); } } From d96b678a9fbf883fb228377df898b4c8d1aab2b7 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 15 Jan 2025 17:07:28 -0500 Subject: [PATCH 12/16] top-level loop for compiler --- .../compiler/algorithm_config.variant.toml | 18 ++ lib/compiler/include/compiler/compiler.h | 32 +--- .../data_parallelism_config.struct.toml | 14 ++ .../graph_optimize_result.struct.toml | 16 -- .../machine_mapping/machine_mapping.h | 5 + ...ne_mapping_problem_tree_result.struct.toml | 21 +++ .../compiler/search_result.struct.toml | 17 ++ ...get_pcg_balanced_binary_sp_decomposition.h | 2 + 
.../include/compiler/unity_algorithm.h | 24 --- .../allowed_machine_views.h | 0 .../graph_optimize_state.h | 10 +- .../unity_algorithm/unity_algorithm.h | 21 +++ .../unity_search_config.struct.toml} | 2 +- lib/compiler/src/compiler/compiler.cc | 32 ++++ .../src/compiler/graph_optimize_state.cc | 85 ---------- .../machine_mapping/machine_mapping.cc | 39 +++++ .../allowed_machine_views.cc | 2 +- .../unity_algorithm/graph_optimize_state.cc | 49 ++++++ .../unity_algorithm/unity_algorithm.cc | 157 ++++++++++++++++++ lib/compiler/src/unity_algorithm.cc | 93 ----------- .../test/src/allowed_machine_views.cc | 2 +- lib/compiler/test/src/graph_optimize_state.cc | 129 +++++++------- lib/compiler/test/src/unity_algorithm.cc | 3 +- lib/pcg/include/pcg/operator_task_space.h | 5 + .../parallel_computation_graph.h | 4 + lib/pcg/src/pcg/operator_task_space.cc | 5 + .../parallel_computation_graph.cc | 5 + .../binary_sp_decomposition_tree.h | 6 + .../binary_sp_decomposition_tree.cc | 7 + 29 files changed, 489 insertions(+), 316 deletions(-) create mode 100644 lib/compiler/include/compiler/algorithm_config.variant.toml create mode 100644 lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml delete mode 100644 lib/compiler/include/compiler/graph_optimize_result.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml create mode 100644 lib/compiler/include/compiler/search_result.struct.toml delete mode 100644 lib/compiler/include/compiler/unity_algorithm.h rename lib/compiler/include/compiler/{ => unity_algorithm}/allowed_machine_views.h (100%) rename lib/compiler/include/compiler/{ => unity_algorithm}/graph_optimize_state.h (67%) create mode 100644 lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h rename lib/compiler/include/compiler/{optimizer_config.struct.toml => unity_algorithm/unity_search_config.struct.toml} (90%) create mode 100644 
lib/compiler/src/compiler/compiler.cc delete mode 100644 lib/compiler/src/compiler/graph_optimize_state.cc rename lib/compiler/src/compiler/{ => unity_algorithm}/allowed_machine_views.cc (98%) create mode 100644 lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc create mode 100644 lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc delete mode 100644 lib/compiler/src/unity_algorithm.cc diff --git a/lib/compiler/include/compiler/algorithm_config.variant.toml b/lib/compiler/include/compiler/algorithm_config.variant.toml new file mode 100644 index 0000000000..4e58104875 --- /dev/null +++ b/lib/compiler/include/compiler/algorithm_config.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "AlgorithmConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/data_parallelism/data_parallelism_config.dtg.h", + "compiler/unity_algorithm/unity_search_config.dtg.h", +] + +[[values]] +type = "::FlexFlow::DataParallelismConfig" + +[[values]] +type = "::FlexFlow::UnitySearchConfig" diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h index 178ab19a53..3faacd8f16 100644 --- a/lib/compiler/include/compiler/compiler.h +++ b/lib/compiler/include/compiler/compiler.h @@ -1,42 +1,24 @@ #ifndef _FLEXFLOW_COMPILER_COMPILER_H #define _FLEXFLOW_COMPILER_COMPILER_H -#include "pcg/cost_values.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" -#include "pcg/tensor_mapping.h" +#include "compiler/algorithm_config.dtg.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "pcg/machine_specification.dtg.h" namespace FlexFlow { enum class SearchAlgorithm { DATA_PARALLEL, -}; - -using SearchAlgorithmConfig = std::variant<>; -using SearchSolution = std::variant<>; - -struct SearchResult { - ParallelComputationGraph pcg; - TensorMapping tensor_mapping; - SearchSolution solution; - CostValues 
cost_values; + UNITY, }; SearchResult optimize(ComputationGraph const &, MachineSpecification const &, CostEstimator const &, SearchAlgorithm, - optional const &); - -// struct SearchSolution { -// LabelledMultiDiGraph optimized_pcg; -// std::unordered_map device_assignments; -// /* std::unordered_map> tensor_mappings; */ -// }; -// -// SearchSolution run_data_parallelize(ComputationGraph const &, -// MachineSpecification const &); + AlgorithmConfig const &, + DeviceType); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml new file mode 100644 index 0000000000..68512fa473 --- /dev/null +++ b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml @@ -0,0 +1,14 @@ +namespace = "FlexFlow" +name = "DataParallelismConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ +] + +[[fields]] +name = "degree" +type = "int" diff --git a/lib/compiler/include/compiler/graph_optimize_result.struct.toml b/lib/compiler/include/compiler/graph_optimize_result.struct.toml deleted file mode 100644 index 22f29cbd59..0000000000 --- a/lib/compiler/include/compiler/graph_optimize_result.struct.toml +++ /dev/null @@ -1,16 +0,0 @@ -namespace = "FlexFlow" -name = "GraphOptimizeResult" -features = [ ] - -includes = [ - "compiler/machine_mapping/machine_mapping.dtg.h", - "pcg/parallel_computation_graph/parallel_computation_graph.h" -] - -[[fields]] -name = "pcg" -type = "::FlexFlow::ParallelComputationGraph" - -[[fields]] -name = "machine_mapping" -type = "::FlexFlow::MachineMapping" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h index 06cbbf942d..8f9fe23c1c 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h @@ -2,6 
+2,8 @@ #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H #include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/machine_mapping/machine_mapping_result.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" namespace FlexFlow { @@ -10,6 +12,9 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &, bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2); +MachineMapping get_machine_mapping_from_machine_mapping_result( + PCGBinarySPDecomposition const &, MachineMappingResult const &); + } // namespace FlexFlow #endif diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml new file mode 100644 index 0000000000..252cd88276 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "GetMachineMappingProblemTreeResult" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", + "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h", + "utils/bidict/bidict.h" +] + +[[fields]] +type = "::FlexFlow::MachineMappingProblemTree" +name = "mm_problem_tree" + +[[fields]] +type = "::FlexFlow::bidict<::FlexFlow::UnmappedOpCostEstimateKey, ::FlexFlow::parallel_layer_guid_t>" +name = "mapping" diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml new file mode 100644 index 0000000000..3776ec5568 --- /dev/null +++ b/lib/compiler/include/compiler/search_result.struct.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "SearchResult" +features = [ +] + +includes = [ + 
"pcg/parallel_computation_graph/parallel_computation_graph.h", + "machine_mapping/machine_mapping.h", +] + +[[fields]] +name = "pcg" +type = "::FlexFlow::ParallelComputationGraph" + +[[fields]] +name = "machine_mapping" +type = "::FlexFlow::MachineMapping" diff --git a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h index d43edaa79d..bb7459c767 100644 --- a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" + namespace FlexFlow { std::optional diff --git a/lib/compiler/include/compiler/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm.h deleted file mode 100644 index 232f2b9563..0000000000 --- a/lib/compiler/include/compiler/unity_algorithm.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H -#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H - -#include "compiler/cost_estimator/cost_estimator.h" -#include "compiler/graph_optimize_result.dtg.h" -#include "optimizer_config.dtg.h" -#include "pcg/computation_graph.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/sub_parallel_computation_graph.h" - -namespace FlexFlow { - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config); - -} // namespace 
FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/allowed_machine_views.h b/lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h similarity index 100% rename from lib/compiler/include/compiler/allowed_machine_views.h rename to lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h similarity index 67% rename from lib/compiler/include/compiler/graph_optimize_state.h rename to lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h index 2de2321ba6..3a2823c46d 100644 --- a/lib/compiler/include/compiler/graph_optimize_state.h +++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h @@ -1,16 +1,16 @@ #ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H #define _FLEXFLOW_COMPILER_MCMC_STATE_H -#include "compiler/graph_optimize_result.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" namespace FlexFlow { struct GraphOptimizeState { - GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result, - float runtime); + GraphOptimizeState(ParallelComputationGraph const &pcg, + float runtime_with_optimal_mm); - GraphOptimizeResult graph_optimize_result; - float runtime; + ParallelComputationGraph pcg; + float runtime_with_optimal_mm; bool operator==(GraphOptimizeState const &other) const; bool operator!=(GraphOptimizeState const &other) const; diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h new file mode 100644 index 0000000000..4396bef734 --- /dev/null +++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include 
"compiler/unity_algorithm/unity_search_config.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.h" + +namespace FlexFlow { + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + std::vector const &substitutions, + UnitySearchConfig const &search_config, + DeviceType device_type); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/optimizer_config.struct.toml b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml similarity index 90% rename from lib/compiler/include/compiler/optimizer_config.struct.toml rename to lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml index b7f4f71e9c..9ec22cf916 100644 --- a/lib/compiler/include/compiler/optimizer_config.struct.toml +++ b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "OptimizerConfig" +name = "UnitySearchConfig" features = [ "eq", "hash", diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc new file mode 100644 index 0000000000..a428c51abc --- /dev/null +++ b/lib/compiler/src/compiler/compiler.cc @@ -0,0 +1,32 @@ +#include "compiler/compiler.h" +#include "compiler/unity_algorithm/unity_algorithm.h" + +namespace FlexFlow { + +SearchResult optimize(ComputationGraph const &computation_graph, + MachineSpecification const &machine_specification, + CostEstimator const &cost_estimator, + SearchAlgorithm search_algorithm, + UnitySearchConfig const &search_config, + DeviceType device_type) { + switch (search_algorithm) { + case SearchAlgorithm::DATA_PARALLEL: + throw std::runtime_error( + "Data parallel search algorithm is not implemented yet"); + case SearchAlgorithm::UNITY: { + ParallelComputationGraph pcg = + parallel_computation_graph_from_computation_graph(computation_graph); + std::vector substitutions; 
// TODO: Implement this + return graph_optimize(pcg, + cost_estimator, + machine_specification, + substitutions, + search_config, + device_type); + } + default: + throw std::runtime_error("Unknown search algorithm"); + } +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc deleted file mode 100644 index 4b4f323ea4..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_state.cc +++ /dev/null @@ -1,85 +0,0 @@ -#include "compiler/graph_optimize_state.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" - -namespace FlexFlow { - -GraphOptimizeState::GraphOptimizeState( - GraphOptimizeResult const &graph_optimize_result, float runtime) - : graph_optimize_result(graph_optimize_result), runtime(runtime) {} - -bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { - // Note(@wmdi): This is a hack to implement a partially correct homomorphism - // check. Switch to the homomorphism check used in substitutions right after - // https://github.com/flexflow/FlexFlow/pull/1471 is merged. 
- auto layers1 = topological_ordering(graph_optimize_result.pcg); - auto layers2 = topological_ordering(other.graph_optimize_result.pcg); - if (layers1.size() != layers2.size()) { - return false; - } - std::unordered_map mapping; - for (size_t i = 0; i < layers1.size(); ++i) { - if (get_parallel_layer_attrs(graph_optimize_result.pcg, layers1[i]) != - get_parallel_layer_attrs(other.graph_optimize_result.pcg, layers2[i])) { - return false; - } - auto inputs1 = get_incoming_tensors(graph_optimize_result.pcg, layers1[i]); - auto inputs2 = - get_incoming_tensors(other.graph_optimize_result.pcg, layers2[i]); - if (inputs1.size() != inputs2.size()) { - return false; - } - for (size_t j = 0; j < inputs1.size(); ++j) { - if (inputs1[j] != mapping.at(inputs2[j])) { - return false; - } - } - auto outputs1 = get_layer_outputs(graph_optimize_result.pcg, layers1[i]); - auto outputs2 = - get_layer_outputs(other.graph_optimize_result.pcg, layers2[i]); - if (outputs1.size() != outputs2.size()) { - return false; - } - for (size_t j = 0; j < outputs1.size(); ++j) { - mapping.emplace(outputs2[j], outputs1[j]); - } - } - return true; -} - -bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { - return !(*this == other); -} - -bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { - return runtime < other.runtime; -} - -} // namespace FlexFlow - -namespace std { - -size_t hash<::FlexFlow::GraphOptimizeState>::operator()( - ::FlexFlow::GraphOptimizeState const &state) const { - // TODO(@wmdi): Eventually it might be good to use a proper graph hash like - // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash - size_t seed = 0; - auto layers = topological_ordering(state.graph_optimize_result.pcg); - ::FlexFlow::hash_combine(seed, layers.size()); - for (auto layer : layers) { - 
::FlexFlow::hash_combine( - seed, get_parallel_layer_attrs(state.graph_optimize_result.pcg, layer)); - auto inputs = get_incoming_tensors(state.graph_optimize_result.pcg, layer); - ::FlexFlow::hash_combine(seed, inputs.size()); - for (auto input : inputs) { - for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers[i]) { - ::FlexFlow::hash_combine(seed, i); - break; - } - } - } - } - return seed; -} - -} // namespace std diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index 57e82684e9..39222b91ac 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,7 +1,9 @@ #include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "utils/containers/are_disjoint.h" #include "utils/containers/keys.h" #include "utils/containers/merge_maps.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" namespace FlexFlow { @@ -14,4 +16,41 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { return are_disjoint(keys(m1.machine_views), keys(m2.machine_views)); } +MachineMapping get_machine_mapping_from_machine_mapping_result( + PCGBinarySPDecomposition const &sp_decomposition, + MachineMappingResult const &mm_result) { + + BinarySPDecompositionTree sp_tree = + binary_sp_tree_from_pcg_sp_tree(sp_decomposition); + + auto get_layer_from_path = + [&](BinaryTreePath const &path) -> parallel_layer_guid_t { + std::optional subtree_optional = + binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path); + if (!subtree_optional.has_value()) { + throw std::runtime_error("Invalid tree path"); + } + BinarySPDecompositionTree subtree = subtree_optional.value(); + if (!subtree.is_node()) { + throw std::runtime_error("Invalid tree path to a leaf"); 
+ } + return parallel_layer_guid_t{ + subtree.get(), + }; + }; + + std::unordered_map mm; + + if (mm_result.raw_result) { + FeasibleMachineMappingResult const &feasible_mm_result = + mm_result.raw_result.value(); + for (auto const &[path, mv] : + feasible_mm_result.machine_mapping.raw_mapping) { + mm.insert({get_layer_from_path(path), mv}); + } + } + + return MachineMapping{mm}; +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc similarity index 98% rename from lib/compiler/src/compiler/allowed_machine_views.cc rename to lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc index 1c226f79b0..d6fca79403 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc @@ -1,4 +1,4 @@ -#include "compiler/allowed_machine_views.h" +#include "compiler/unity_algorithm/allowed_machine_views.h" #include "pcg/machine_specification.h" #include "pcg/machine_view.h" #include "pcg/multi_dimensional_stride.dtg.h" diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc new file mode 100644 index 0000000000..bf8f089cc0 --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -0,0 +1,49 @@ +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" + +namespace FlexFlow { + +GraphOptimizeState::GraphOptimizeState(ParallelComputationGraph const &pcg, + float runtime_with_optimal_mm) + : pcg(pcg), runtime_with_optimal_mm(runtime_with_optimal_mm) {} + +bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { + return pcgs_are_isomorphic(pcg, other.pcg); +} + +bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { + return !(*this == other); +} + +bool 
GraphOptimizeState::operator<(GraphOptimizeState const &other) const { + return runtime_with_optimal_mm < other.runtime_with_optimal_mm; +} + +} // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::GraphOptimizeState>::operator()( + ::FlexFlow::GraphOptimizeState const &state) const { + // TODO(@wmdi): Eventually it might be good to use a proper graph hash like + // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash + size_t seed = 0; + auto layers = topological_ordering(state.pcg); + ::FlexFlow::hash_combine(seed, layers.size()); + for (auto layer : layers) { + ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer)); + auto inputs = get_incoming_tensors(state.pcg, layer); + ::FlexFlow::hash_combine(seed, inputs.size()); + for (auto input : inputs) { + for (size_t i = 0; i < layers.size(); ++i) { + if (get_source_layer(input) == layers[i]) { + ::FlexFlow::hash_combine(seed, i); + break; + } + } + } + } + return seed; +} + +} // namespace std diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..2f2caf11ec --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,157 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "compiler/machine_mapping/get_optimal_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" +#include "compiler/unity_algorithm/allowed_machine_views.h" +#include 
"compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/operator_task_space.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/graph/node/algorithms.h" + +namespace FlexFlow { + +/* + * Applies a substitution to all possible positions in PCG + */ +std::vector + all_pcgs_obtained_by_applying_a_substitution( + ParallelComputationGraph const &pcg, + std::vector const &substitutions) { + std::vector results; + SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(pcg); + for (Substitution const &substitution : substitutions) { + for (PCGPatternMatch const &pattern_match : + find_pattern_matches(substitution.pcg_pattern, subpcg)) { + SubParallelComputationGraph subpcg_from_substitution = + apply_substitution(subpcg, substitution, pattern_match); + results.push_back( + pcg_from_sub_pcg_by_dropping_inputs(subpcg_from_substitution)); + } + } + return results; +} + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + std::vector const &substitutions, + UnitySearchConfig const &search_config, + DeviceType device_type) { + + // NOTE(@wmdi): This mapping is only used for allowed_machine_views + std::unordered_map + mapping_from_unmapped_op_cost_estimate_key_parallel_layer = [&] { + std::unordered_map + mapping; + for (parallel_layer_guid_t layer : get_parallel_layers(pcg)) { + // NOTE(@wmdi): Assume layers with the same key have the same allowed + // machine views + mapping.insert( + {get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), layer}); + } + return mapping; + }(); + + MachineMappingCache cached_subgraph_costs = MachineMappingCache{ + {}, + }; + DeduplicatedPriorityQueue candidates; + + MachineMappingContext context = MachineMappingContext{ + 
/*cost_estimator=*/cost_estimator, + /*allowed_machine_views=*/ + [&](UnmappedOpCostEstimateKey const &key, + MachineSpecification const &resources) + -> std::unordered_set { + return get_allowed_machine_views( + resources, + get_operator_task_space( + pcg, + mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at( + key)), + device_type); + }, + }; + + auto optimize_pcg = [&](ParallelComputationGraph const &pcg) + -> std::pair { + std::optional maybe_sp_decomp = + get_pcg_balanced_binary_sp_decomposition(pcg); + + if (!maybe_sp_decomp.has_value()) { + throw std::runtime_error("Fail to SP-ize PCG"); + } + + PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value(); + + MachineMappingConstraints constraints = MachineMappingConstraints{ + /*machine_views=*/{}, + }; + + MachineMappingResult mm_result = get_optimal_machine_mapping( + cached_subgraph_costs, + context, + get_machine_mapping_problem_tree(pcg, sp_decomp), + resources, + constraints); + + float runtime_with_optimal_mm; + if (mm_result.raw_result == std::nullopt) { + runtime_with_optimal_mm = std::numeric_limits::infinity(); + } else { + runtime_with_optimal_mm = mm_result.raw_result.value().runtime; + } + return { + GraphOptimizeState{ + /*pcg=*/pcg, + /*runtime_with_optimal_mm=*/runtime_with_optimal_mm, + }, + get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result), + }; + }; + + GraphOptimizeState best_state = optimize_pcg(pcg).first; + candidates.push(best_state); + + for (int iteration = 0; + !candidates.empty() && iteration < search_config.budget; + ++iteration) { + GraphOptimizeState current_state = candidates.top(); + candidates.pop(); + + if (current_state < best_state) { + best_state = current_state; + } else if (current_state.runtime_with_optimal_mm > + best_state.runtime_with_optimal_mm * search_config.alpha) { + continue; + } + + for (ParallelComputationGraph const &new_pcg : + all_pcgs_obtained_by_applying_a_substitution(current_state.pcg, + substitutions)) { + 
std::optional new_pcg_optimize_result = + optimize_pcg(new_pcg).first; + if (new_pcg_optimize_result == std::nullopt) { + continue; + } + GraphOptimizeState new_state = new_pcg_optimize_result.value(); + if (new_state.runtime_with_optimal_mm <= search_config.threshold && + get_nodes(new_pcg.raw_graph).size() <= search_config.max_num_ops) { + candidates.push(new_state); + } + } + } + + return SearchResult{ + /*pcg=*/best_state.pcg, + /*machine_mapping=*/optimize_pcg(best_state.pcg).second, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/unity_algorithm.cc b/lib/compiler/src/unity_algorithm.cc deleted file mode 100644 index 86a211c535..0000000000 --- a/lib/compiler/src/unity_algorithm.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "compiler/unity_algorithm.h" -#include "compiler/graph_optimize_state.h" -#include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/substitution.h" -#include "utils/deduplicated_priority_queue.h" -#include "utils/graph/node/algorithms.h" -namespace FlexFlow { - -/* - * Gets all substitutions applicable to a PCG - */ -std::vector - get_all_applicable_substitutions(ParallelComputationGraph const &pcg) { - NOT_IMPLEMENTED(); -} - -/* - * Applies a substitution to all possible positions in PCG - */ -std::vector - apply_substitution(ParallelComputationGraph const &pcg, - Substitution const &) { - NOT_IMPLEMENTED(); -} - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config) { - NOT_IMPLEMENTED(); - - // std::vector substitutions = - // get_all_applicable_substitutions(pcg); - // - // MachineMappingCache cached_subgraph_costs; - // DeduplicatedPriorityQueue candidates; - // - // MachineMappingResult original_pcg_cost = 
- // get_optimal_machine_mapping(pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // - // GraphOptimizeState initial_state = { - // GraphOptimizeResult(pcg, original_pcg_cost.machine_mapping), - // original_pcg_cost.runtime}; - // - // GraphOptimizeState best_state = initial_state; - // candidates.push(initial_state); - // - // for (int iteration = 0; !candidates.empty() && iteration < - // opt_config.budget; - // ++iteration) { - // GraphOptimizeState current_state = candidates.top(); - // candidates.pop(); - // - // if (current_state.runtime < best_state.runtime) { - // best_state = current_state; - // } else if (current_state.runtime > best_state.runtime * opt_config.alpha) - // { - // continue; - // } - // - // for (Substitution const &substitution : substitutions) { - // for (ParallelComputationGraph const &new_pcg : apply_substitution( - // current_state.graph_optimize_result.pcg, substitution)) { - // MachineMappingResult new_pcg_cost = - // get_optimal_machine_mapping(new_pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // GraphOptimizeState new_state{ - // GraphOptimizeResult(new_pcg, new_pcg_cost.machine_mapping), - // new_pcg_cost.runtime}; - // if (new_pcg_cost.runtime <= opt_config.threshold && - // get_nodes(new_pcg.raw_graph).size() <= opt_config.max_num_ops) { - // candidates.push(new_state); - // } - // } - // } - // } - - // return best_state.graph_optimize_result; -} - -} // namespace FlexFlow diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 936894ad2d..b885f4f8ea 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -1,4 +1,4 @@ -#include "compiler/allowed_machine_views.h" +#include "compiler/unity_algorithm/allowed_machine_views.h" #include "doctest/doctest.h" #include "utils/containers/extend.h" #include 
"utils/containers/range.h" diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 46177ad420..3bc9893f18 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -1,80 +1,81 @@ -#include "compiler/graph_optimize_state.h" +#include "compiler/unity_algorithm/graph_optimize_state.h" #include "doctest/doctest.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("GraphOptimizeState::operator==") { - ParallelComputationGraphBuilder builder; + // TODO(@wmdi): to be udpated +// TEST_CASE("GraphOptimizeState::operator==") { +// ParallelComputationGraphBuilder builder; - ParallelTensorShape input_shape = - ParallelTensorShape{ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{32, 2}, - ShardParallelDim{16, 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT}; +// ParallelTensorShape input_shape = +// ParallelTensorShape{ParallelTensorDims{ +// FFOrdered{ +// ShardParallelDim{32, 2}, +// ShardParallelDim{16, 1}, +// }, +// ReplicaParallelDimSet{ +// SumDegree{1}, +// DiscardCopyDegree{1}, +// }, +// }, +// DataType::FLOAT}; - parallel_tensor_guid_t input0 = - builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0 = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); +// parallel_tensor_guid_t input0 = +// builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); +// parallel_tensor_guid_t dense0 = builder.dense(input0, +// 8, +// Activation::RELU, +// true, +// DataType::FLOAT, +// std::nullopt, +// std::nullopt, +// "dense0"); - parallel_tensor_guid_t dense1 = builder.dense(dense0, - 4, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense1"); +// 
parallel_tensor_guid_t dense1 = builder.dense(dense0, +// 4, +// Activation::RELU, +// true, +// DataType::FLOAT, +// std::nullopt, +// std::nullopt, +// "dense1"); - ParallelComputationGraph pcg = builder.pcg; +// ParallelComputationGraph pcg = builder.pcg; - // `machine_mapping` is determined by the PCG and the device mapping - // algorithm, and `runtime` is determined by the PCG and the device mapping, - // so their values here do not matter. - std::unordered_map empty_machine_views; - MachineMapping empty_machine_mapping(empty_machine_views); - bool result1 = - GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), - 0) == - GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0); - bool correct1 = true; - CHECK(result1 == correct1); +// // `machine_mapping` is determined by the PCG and the device mapping +// // algorithm, and `runtime` is determined by the PCG and the device mapping, +// // so their values here do not matter. +// std::unordered_map empty_machine_views; +// MachineMapping empty_machine_mapping(empty_machine_views); +// bool result1 = +// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), +// 0) == +// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0); +// bool correct1 = true; +// CHECK(result1 == correct1); - ParallelComputationGraphBuilder builder_; +// ParallelComputationGraphBuilder builder_; - parallel_tensor_guid_t input0_ = - builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0_ = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); +// parallel_tensor_guid_t input0_ = +// builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); +// parallel_tensor_guid_t dense0_ = builder.dense(input0, +// 8, +// Activation::RELU, +// true, +// DataType::FLOAT, +// std::nullopt, +// std::nullopt, +// "dense0"); - ParallelComputationGraph pcg_ = builder.pcg; +// 
ParallelComputationGraph pcg_ = builder.pcg; - bool result2 = - GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), - 0) == - GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0); - bool correct2 = false; - CHECK(result2 == correct2); - } +// bool result2 = +// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), +// 0) == +// GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0); +// bool correct2 = false; +// CHECK(result2 == correct2); +// } } diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc index 8ff0978ea5..d8523f6659 100644 --- a/lib/compiler/test/src/unity_algorithm.cc +++ b/lib/compiler/test/src/unity_algorithm.cc @@ -1,7 +1,8 @@ -#include "compiler/unity_algorithm.h" +#include "compiler/unity_algorithm/unity_algorithm.h" #include "doctest/doctest.h" TEST_SUITE(FF_TEST_SUITE) { + // TODO: to be updated // Rapidcheck does not work for now // TEST_CASE("graph_optimize") { // RC_SUBCASE([](ComputationGraph const &g, diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 61cab4eff1..1a19397c72 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -2,6 +2,8 @@ #define _FLEXFLOW_PCG_INCLUDE_OPERATOR_TASK_SPACE_H #include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "pcg/task_space_coordinate.dtg.h" #include #include @@ -17,6 +19,9 @@ TaskSpaceCoordinate size_t num_dims(OperatorTaskSpace const &task); size_t num_tasks(OperatorTaskSpace const &task); +OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h index c740e1ffd2..3cbd1f1977 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H #define _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H +#include "pcg/computation_graph.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h" @@ -66,6 +67,9 @@ ParallelComputationGraph without_layer_names(ParallelComputationGraph const &); bool pcgs_are_isomorphic(ParallelComputationGraph const &, ParallelComputationGraph const &); +ParallelComputationGraph + parallel_computation_graph_from_computation_graph(ComputationGraph const &); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 2538cb4ea0..d50cce2af3 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -36,4 +36,9 @@ size_t num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); } +OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer) { + NOT_IMPLEMENTED(); +} + } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index 781c44640c..704f1fa48b 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -203,4 +203,9 @@ bool pcgs_are_isomorphic(ParallelComputationGraph const &lhs, .has_value(); } +ParallelComputationGraph 
parallel_computation_graph_from_computation_graph( + ComputationGraph const &) { + NOT_IMPLEMENTED(); +} + } // namespace FlexFlow diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h index de48cd17e9..9b4ea6cd20 100644 --- a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h +++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h @@ -1,11 +1,13 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H +#include "utils/full_binary_tree/binary_tree_path.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_parallel_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_series_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h" +#include #include namespace FlexFlow { @@ -23,6 +25,10 @@ std::unordered_multiset get_leaves(BinarySPDecompositionTree const &); SPDecompositionTreeNodeType get_node_type(BinarySPDecompositionTree const &); +std::optional + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &, BinaryTreePath const &); + } // namespace FlexFlow #endif diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc 
b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc index 62489ff75f..3e4bc13289 100644 --- a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc +++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc @@ -1,5 +1,6 @@ #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_left_associative.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_right_associative.h" @@ -82,4 +83,10 @@ SPDecompositionTreeNodeType }); } +std::optional + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &tree, BinaryTreePath const &path) { + return get_subtree_at_path(tree, generic_impl_for_binary_sp_tree(), path); +} + } // namespace FlexFlow From c16bcf605f824b6292e89247ac40d6ed1acb0d13 Mon Sep 17 00:00:00 2001 From: wmdi Date: Tue, 21 Jan 2025 17:40:59 -0500 Subject: [PATCH 13/16] fixes --- lib/compiler/include/compiler/compiler.h | 4 +-- ...ne_mapping_problem_tree_result.struct.toml | 21 -------------- .../compiler/search_result.struct.toml | 2 +- lib/compiler/src/compiler/compiler.cc | 23 +++++++-------- .../machine_mapping/machine_mapping.cc | 12 ++++---- .../unity_algorithm/graph_optimize_state.cc | 10 +++---- .../unity_algorithm/unity_algorithm.cc | 29 ++++++++++--------- lib/pcg/src/pcg/operator_task_space.cc | 17 ++++++++++- 8 files changed, 55 insertions(+), 63 deletions(-) delete mode 100644 
lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h index 3faacd8f16..8697c06beb 100644 --- a/lib/compiler/include/compiler/compiler.h +++ b/lib/compiler/include/compiler/compiler.h @@ -16,9 +16,7 @@ enum class SearchAlgorithm { SearchResult optimize(ComputationGraph const &, MachineSpecification const &, CostEstimator const &, - SearchAlgorithm, - AlgorithmConfig const &, - DeviceType); + AlgorithmConfig const &); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml deleted file mode 100644 index 252cd88276..0000000000 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree_result.struct.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "GetMachineMappingProblemTreeResult" -features = [ - "eq", - "hash", - "fmt", -] - -includes = [ - "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", - "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h", - "utils/bidict/bidict.h" -] - -[[fields]] -type = "::FlexFlow::MachineMappingProblemTree" -name = "mm_problem_tree" - -[[fields]] -type = "::FlexFlow::bidict<::FlexFlow::UnmappedOpCostEstimateKey, ::FlexFlow::parallel_layer_guid_t>" -name = "mapping" diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml index 3776ec5568..120d182c75 100644 --- a/lib/compiler/include/compiler/search_result.struct.toml +++ b/lib/compiler/include/compiler/search_result.struct.toml @@ -5,7 +5,7 @@ features = [ includes = [ 
"pcg/parallel_computation_graph/parallel_computation_graph.h", - "machine_mapping/machine_mapping.h", + "compiler/machine_mapping/machine_mapping.h", ] [[fields]] diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc index a428c51abc..f2ff32b944 100644 --- a/lib/compiler/src/compiler/compiler.cc +++ b/lib/compiler/src/compiler/compiler.cc @@ -1,19 +1,19 @@ #include "compiler/compiler.h" #include "compiler/unity_algorithm/unity_algorithm.h" +#include "utils/overload.h" namespace FlexFlow { SearchResult optimize(ComputationGraph const &computation_graph, MachineSpecification const &machine_specification, CostEstimator const &cost_estimator, - SearchAlgorithm search_algorithm, - UnitySearchConfig const &search_config, - DeviceType device_type) { - switch (search_algorithm) { - case SearchAlgorithm::DATA_PARALLEL: + AlgorithmConfig const &search_config) { + return search_config.visit(overload{ + [&](DataParallelismConfig const &config) -> SearchResult { throw std::runtime_error( "Data parallel search algorithm is not implemented yet"); - case SearchAlgorithm::UNITY: { + }, + [&](UnitySearchConfig const &config) { ParallelComputationGraph pcg = parallel_computation_graph_from_computation_graph(computation_graph); std::vector substitutions; // TODO: Implement this @@ -21,12 +21,11 @@ SearchResult optimize(ComputationGraph const &computation_graph, cost_estimator, machine_specification, substitutions, - search_config, - device_type); - } - default: - throw std::runtime_error("Unknown search algorithm"); - } + config, + DeviceType::GPU); + + }, + }); } } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index 39222b91ac..33a8f686f5 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -3,6 +3,7 @@ #include "utils/containers/are_disjoint.h" 
#include "utils/containers/keys.h" #include "utils/containers/merge_maps.h" +#include "utils/containers/map_keys.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" namespace FlexFlow { @@ -28,14 +29,14 @@ MachineMapping get_machine_mapping_from_machine_mapping_result( std::optional subtree_optional = binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path); if (!subtree_optional.has_value()) { - throw std::runtime_error("Invalid tree path"); + throw std::runtime_error(fmt::format("Invalid tree path {}", path)); } BinarySPDecompositionTree subtree = subtree_optional.value(); if (!subtree.is_node()) { - throw std::runtime_error("Invalid tree path to a leaf"); + throw std::runtime_error(fmt::format("Invalid tree path to a leaf: found {} instead", subtree)); } return parallel_layer_guid_t{ - subtree.get(), + subtree.require_node(), }; }; @@ -44,10 +45,7 @@ MachineMapping get_machine_mapping_from_machine_mapping_result( if (mm_result.raw_result) { FeasibleMachineMappingResult const &feasible_mm_result = mm_result.raw_result.value(); - for (auto const &[path, mv] : - feasible_mm_result.machine_mapping.raw_mapping) { - mm.insert({get_layer_from_path(path), mv}); - } + mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, get_layer_from_path); } return MachineMapping{mm}; diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc index bf8f089cc0..a8fa303ff6 100644 --- a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc +++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -28,15 +28,15 @@ size_t hash<::FlexFlow::GraphOptimizeState>::operator()( // TODO(@wmdi): Eventually it might be good to use a proper graph hash like // 
https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash size_t seed = 0; - auto layers = topological_ordering(state.pcg); + std::vector<::FlexFlow::parallel_layer_guid_t> layers = topological_ordering(state.pcg); ::FlexFlow::hash_combine(seed, layers.size()); - for (auto layer : layers) { + for (::FlexFlow::parallel_layer_guid_t const & layer : layers) { ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer)); - auto inputs = get_incoming_tensors(state.pcg, layer); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = get_incoming_tensors(state.pcg, layer); ::FlexFlow::hash_combine(seed, inputs.size()); - for (auto input : inputs) { + for (::FlexFlow::parallel_tensor_guid_t input : inputs) { for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers[i]) { + if (get_source_layer(input) == layers.at(i)) { ::FlexFlow::hash_combine(seed, i); break; } diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc index 2f2caf11ec..e7df440f7b 100644 --- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -13,6 +13,9 @@ #include "substitutions/substitution.h" #include "utils/deduplicated_priority_queue.h" #include "utils/graph/node/algorithms.h" +#include "compiler/machine_mapping/machine_mapping_cache.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" namespace FlexFlow { @@ -58,9 +61,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, return mapping; }(); - MachineMappingCache cached_subgraph_costs = MachineMappingCache{ - {}, - }; + MachineMappingCache cached_subgraph_costs = 
empty_machine_mapping_cache(); DeduplicatedPriorityQueue candidates; MachineMappingContext context = MachineMappingContext{ @@ -79,6 +80,14 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, }, }; + auto get_runtime_cost = [](MachineMappingResult const &mm_result) { + if (mm_result.raw_result == std::nullopt) { + return std::numeric_limits::infinity(); + } else { + return mm_result.raw_result.value().runtime; + } + }; + auto optimize_pcg = [&](ParallelComputationGraph const &pcg) -> std::pair { std::optional maybe_sp_decomp = @@ -90,9 +99,9 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value(); - MachineMappingConstraints constraints = MachineMappingConstraints{ - /*machine_views=*/{}, - }; + MachineMappingProblemTree problem_tree = get_machine_mapping_problem_tree(pcg, sp_decomp); + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree)); MachineMappingResult mm_result = get_optimal_machine_mapping( cached_subgraph_costs, @@ -101,16 +110,10 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, resources, constraints); - float runtime_with_optimal_mm; - if (mm_result.raw_result == std::nullopt) { - runtime_with_optimal_mm = std::numeric_limits::infinity(); - } else { - runtime_with_optimal_mm = mm_result.raw_result.value().runtime; - } return { GraphOptimizeState{ /*pcg=*/pcg, - /*runtime_with_optimal_mm=*/runtime_with_optimal_mm, + /*runtime_with_optimal_mm=*/get_runtime_cost(mm_result), }, get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result), }; diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index d50cce2af3..046571855e 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -1,10 +1,18 @@ #include "pcg/operator_task_space.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" #include "utils/containers/cartesian_product.h" +#include "utils/containers/extend.h" #include "utils/containers/maximum.h" #include "utils/containers/product.h" #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/containers/unordered_set_of.h" +#include "utils/containers/vector_of.h" #include "utils/fmt/unordered_set.h" namespace FlexFlow { @@ -38,7 +46,14 @@ size_t num_tasks(OperatorTaskSpace const &task) { OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer) { - NOT_IMPLEMENTED(); + parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); + ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor); + + std::vector degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; } } // namespace FlexFlow From 62389ad919b0ead5de247b527eaec2f077cd0dbc Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 22 Jan 2025 18:02:37 -0500 Subject: [PATCH 14/16] upd --- .../allowed_machine_views.h | 0 .../machine_mapping_problem_tree.h | 1 + .../unity_algorithm/unity_algorithm.h | 3 +- lib/compiler/src/compiler/compiler.cc | 28 ++-- .../allowed_machine_views.cc | 2 +- .../machine_mapping/machine_mapping.cc | 8 +- .../unity_algorithm/graph_optimize_state.cc | 8 +- .../unity_algorithm/unity_algorithm.cc | 35 +++-- .../test/src/allowed_machine_views.cc | 2 +- lib/compiler/test/src/graph_optimize_state.cc | 133 +++++++++--------- lib/utils/include/utils/optional.h | 5 + 11 files changed, 116 insertions(+), 109 deletions(-) rename 
lib/compiler/include/compiler/{unity_algorithm => machine_mapping}/allowed_machine_views.h (100%) rename lib/compiler/src/compiler/{unity_algorithm => machine_mapping}/allowed_machine_views.cc (98%) diff --git a/lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h b/lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h similarity index 100% rename from lib/compiler/include/compiler/unity_algorithm/allowed_machine_views.h rename to lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h index 29e9e7c90b..2976a55bf1 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h @@ -4,6 +4,7 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" #include "utils/full_binary_tree/binary_tree_path.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h" diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h index 4396bef734..223c4961eb 100644 --- a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h +++ 
b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h @@ -13,8 +13,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, std::vector const &substitutions, - UnitySearchConfig const &search_config, - DeviceType device_type); + UnitySearchConfig const &search_config); } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc index f2ff32b944..1c56e58796 100644 --- a/lib/compiler/src/compiler/compiler.cc +++ b/lib/compiler/src/compiler/compiler.cc @@ -9,22 +9,18 @@ SearchResult optimize(ComputationGraph const &computation_graph, CostEstimator const &cost_estimator, AlgorithmConfig const &search_config) { return search_config.visit(overload{ - [&](DataParallelismConfig const &config) -> SearchResult { - throw std::runtime_error( - "Data parallel search algorithm is not implemented yet"); - }, - [&](UnitySearchConfig const &config) { - ParallelComputationGraph pcg = - parallel_computation_graph_from_computation_graph(computation_graph); - std::vector substitutions; // TODO: Implement this - return graph_optimize(pcg, - cost_estimator, - machine_specification, - substitutions, - config, - DeviceType::GPU); - - }, + [&](DataParallelismConfig const &config) -> SearchResult { + throw std::runtime_error( + "Data parallel search algorithm is not implemented yet"); + }, + [&](UnitySearchConfig const &config) { + ParallelComputationGraph pcg = + parallel_computation_graph_from_computation_graph( + computation_graph); + std::vector substitutions; // TODO: Implement this + return graph_optimize( + pcg, cost_estimator, machine_specification, substitutions, config); + }, }); } diff --git a/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc similarity index 98% rename from lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc rename 
to lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc index d6fca79403..bcd8a63f84 100644 --- a/lib/compiler/src/compiler/unity_algorithm/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc @@ -1,4 +1,4 @@ -#include "compiler/unity_algorithm/allowed_machine_views.h" +#include "compiler/machine_mapping/allowed_machine_views.h" #include "pcg/machine_specification.h" #include "pcg/machine_view.h" #include "pcg/multi_dimensional_stride.dtg.h" diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index 33a8f686f5..e54ed925de 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -2,8 +2,8 @@ #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "utils/containers/are_disjoint.h" #include "utils/containers/keys.h" -#include "utils/containers/merge_maps.h" #include "utils/containers/map_keys.h" +#include "utils/containers/merge_maps.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" namespace FlexFlow { @@ -33,7 +33,8 @@ MachineMapping get_machine_mapping_from_machine_mapping_result( } BinarySPDecompositionTree subtree = subtree_optional.value(); if (!subtree.is_node()) { - throw std::runtime_error(fmt::format("Invalid tree path to a leaf: found {} instead", subtree)); + throw std::runtime_error(fmt::format( + "Invalid tree path to a leaf: found {} instead", subtree)); } return parallel_layer_guid_t{ subtree.require_node(), @@ -45,7 +46,8 @@ MachineMapping get_machine_mapping_from_machine_mapping_result( if (mm_result.raw_result) { FeasibleMachineMappingResult const &feasible_mm_result = mm_result.raw_result.value(); - mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, get_layer_from_path); + mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, + 
get_layer_from_path); } return MachineMapping{mm}; diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc index a8fa303ff6..1aa7f05655 100644 --- a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc +++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -28,11 +28,13 @@ size_t hash<::FlexFlow::GraphOptimizeState>::operator()( // TODO(@wmdi): Eventually it might be good to use a proper graph hash like // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash size_t seed = 0; - std::vector<::FlexFlow::parallel_layer_guid_t> layers = topological_ordering(state.pcg); + std::vector<::FlexFlow::parallel_layer_guid_t> layers = + topological_ordering(state.pcg); ::FlexFlow::hash_combine(seed, layers.size()); - for (::FlexFlow::parallel_layer_guid_t const & layer : layers) { + for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer)); - std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = get_incoming_tensors(state.pcg, layer); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = + get_incoming_tensors(state.pcg, layer); ::FlexFlow::hash_combine(seed, inputs.size()); for (::FlexFlow::parallel_tensor_guid_t input : inputs) { for (size_t i = 0; i < layers.size(); ++i) { diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc index e7df440f7b..01c9c645a6 100644 --- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -1,21 +1,23 @@ #include "compiler/unity_algorithm/unity_algorithm.h" +#include "compiler/machine_mapping/allowed_machine_views.h" #include 
"compiler/machine_mapping/get_optimal_machine_mapping.h" #include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_cache.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" #include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" -#include "compiler/unity_algorithm/allowed_machine_views.h" #include "compiler/unity_algorithm/graph_optimize_state.h" #include "pcg/machine_specification.dtg.h" #include "pcg/operator_task_space.h" #include "substitutions/pcg_pattern.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/substitution.h" +#include "utils/containers/generate_map.h" #include "utils/deduplicated_priority_queue.h" #include "utils/graph/node/algorithms.h" -#include "compiler/machine_mapping/machine_mapping_cache.h" -#include "compiler/machine_mapping/machine_mapping_constraints.h" -#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "utils/optional.h" namespace FlexFlow { @@ -44,8 +46,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, std::vector const &substitutions, - UnitySearchConfig const &search_config, - DeviceType device_type) { + UnitySearchConfig const &search_config) { // NOTE(@wmdi): This mapping is only used for allowed_machine_views std::unordered_map @@ -76,7 +77,7 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, pcg, mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at( key)), - device_type); + DeviceType::GPU); }, }; @@ -90,18 +91,14 @@ SearchResult 
graph_optimize(ParallelComputationGraph &pcg, auto optimize_pcg = [&](ParallelComputationGraph const &pcg) -> std::pair { - std::optional maybe_sp_decomp = - get_pcg_balanced_binary_sp_decomposition(pcg); - - if (!maybe_sp_decomp.has_value()) { - throw std::runtime_error("Fail to SP-ize PCG"); - } - - PCGBinarySPDecomposition sp_decomp = maybe_sp_decomp.value(); - - MachineMappingProblemTree problem_tree = get_machine_mapping_problem_tree(pcg, sp_decomp); - MachineMappingConstraints constraints = - get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree)); + PCGBinarySPDecomposition sp_decomp = + expect(get_pcg_balanced_binary_sp_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + + MachineMappingProblemTree problem_tree = + get_machine_mapping_problem_tree(pcg, sp_decomp); + MachineMappingConstraints constraints = + get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree)); MachineMappingResult mm_result = get_optimal_machine_mapping( cached_subgraph_costs, diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index b885f4f8ea..2481d84283 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -1,4 +1,4 @@ -#include "compiler/unity_algorithm/allowed_machine_views.h" +#include "compiler/machine_mapping/allowed_machine_views.h" #include "doctest/doctest.h" #include "utils/containers/extend.h" #include "utils/containers/range.h" diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 3bc9893f18..0be6d0a048 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -5,77 +5,82 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - // TODO(@wmdi): to be udpated -// TEST_CASE("GraphOptimizeState::operator==") { -// ParallelComputationGraphBuilder builder; + // TODO(@wmdi): to be udpated + // 
TEST_CASE("GraphOptimizeState::operator==") { + // ParallelComputationGraphBuilder builder; -// ParallelTensorShape input_shape = -// ParallelTensorShape{ParallelTensorDims{ -// FFOrdered{ -// ShardParallelDim{32, 2}, -// ShardParallelDim{16, 1}, -// }, -// ReplicaParallelDimSet{ -// SumDegree{1}, -// DiscardCopyDegree{1}, -// }, -// }, -// DataType::FLOAT}; + // ParallelTensorShape input_shape = + // ParallelTensorShape{ParallelTensorDims{ + // FFOrdered{ + // ShardParallelDim{32, 2}, + // ShardParallelDim{16, 1}, + // }, + // ReplicaParallelDimSet{ + // SumDegree{1}, + // DiscardCopyDegree{1}, + // }, + // }, + // DataType::FLOAT}; -// parallel_tensor_guid_t input0 = -// builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); -// parallel_tensor_guid_t dense0 = builder.dense(input0, -// 8, -// Activation::RELU, -// true, -// DataType::FLOAT, -// std::nullopt, -// std::nullopt, -// "dense0"); + // parallel_tensor_guid_t input0 = + // builder.create_input_tensor(input_shape, CreateGrad::YES, + // "input0"); + // parallel_tensor_guid_t dense0 = builder.dense(input0, + // 8, + // Activation::RELU, + // true, + // DataType::FLOAT, + // std::nullopt, + // std::nullopt, + // "dense0"); -// parallel_tensor_guid_t dense1 = builder.dense(dense0, -// 4, -// Activation::RELU, -// true, -// DataType::FLOAT, -// std::nullopt, -// std::nullopt, -// "dense1"); + // parallel_tensor_guid_t dense1 = builder.dense(dense0, + // 4, + // Activation::RELU, + // true, + // DataType::FLOAT, + // std::nullopt, + // std::nullopt, + // "dense1"); -// ParallelComputationGraph pcg = builder.pcg; + // ParallelComputationGraph pcg = builder.pcg; -// // `machine_mapping` is determined by the PCG and the device mapping -// // algorithm, and `runtime` is determined by the PCG and the device mapping, -// // so their values here do not matter. 
-// std::unordered_map empty_machine_views; -// MachineMapping empty_machine_mapping(empty_machine_views); -// bool result1 = -// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), -// 0) == -// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), 0); -// bool correct1 = true; -// CHECK(result1 == correct1); + // // `machine_mapping` is determined by the PCG and the device mapping + // // algorithm, and `runtime` is determined by the PCG and the device + // mapping, + // // so their values here do not matter. + // std::unordered_map + // empty_machine_views; MachineMapping + // empty_machine_mapping(empty_machine_views); bool result1 = + // GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), + // 0) == + // GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), + // 0); + // bool correct1 = true; + // CHECK(result1 == correct1); -// ParallelComputationGraphBuilder builder_; + // ParallelComputationGraphBuilder builder_; -// parallel_tensor_guid_t input0_ = -// builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); -// parallel_tensor_guid_t dense0_ = builder.dense(input0, -// 8, -// Activation::RELU, -// true, -// DataType::FLOAT, -// std::nullopt, -// std::nullopt, -// "dense0"); + // parallel_tensor_guid_t input0_ = + // builder.create_input_tensor(input_shape, CreateGrad::YES, + // "input0"); + // parallel_tensor_guid_t dense0_ = builder.dense(input0, + // 8, + // Activation::RELU, + // true, + // DataType::FLOAT, + // std::nullopt, + // std::nullopt, + // "dense0"); -// ParallelComputationGraph pcg_ = builder.pcg; + // ParallelComputationGraph pcg_ = builder.pcg; -// bool result2 = -// GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), -// 0) == -// GraphOptimizeState(GraphOptimizeResult(pcg_, empty_machine_mapping), 0); -// bool correct2 = false; -// CHECK(result2 == correct2); -// } + // bool result2 = + // GraphOptimizeState(GraphOptimizeResult(pcg, 
empty_machine_mapping), + // 0) == + // GraphOptimizeState(GraphOptimizeResult(pcg_, + // empty_machine_mapping), 0); + // bool correct2 = false; + // CHECK(result2 == correct2); + // } } diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h index 377561d70c..8673264d36 100644 --- a/lib/utils/include/utils/optional.h +++ b/lib/utils/include/utils/optional.h @@ -32,6 +32,11 @@ T const &assert_unwrap(std::optional const &o) { return o.value(); } +template +T expect(std::optional const &x, std::string const &err) { + return unwrap(x, [&]() { throw mk_runtime_error(err); }); +} + } // namespace FlexFlow #endif From 6d2fe50d4c9b59063949f3f10dead54c102646f1 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 29 Jan 2025 14:58:49 -0500 Subject: [PATCH 15/16] fixes --- .../machine_mapping/machine_mapping.h | 6 +- .../unmapped_op_cost_estimate_key.struct.toml | 4 ++ .../machine_mapping/machine_mapping_result.h | 2 + .../machine_mapping/machine_mapping.cc | 64 +++++++++---------- .../unmapped_op_cost_estimate_key.cc | 3 + .../machine_mapping/machine_mapping_result.cc | 8 +++ .../unity_algorithm/unity_algorithm.cc | 41 ++++-------- 7 files changed, 65 insertions(+), 63 deletions(-) diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h index 8f9fe23c1c..f17e921f2b 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h @@ -12,7 +12,11 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &, bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2); -MachineMapping get_machine_mapping_from_machine_mapping_result( +parallel_layer_guid_t + get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition, + BinaryTreePath const &path); + +std::optional get_machine_mapping_from_machine_mapping_result( PCGBinarySPDecomposition const &, 
MachineMappingResult const &); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml index fe76683eb7..7493c68387 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml @@ -11,6 +11,7 @@ includes = [ "op-attrs/parallel_tensor_shape.dtg.h", "", "pcg/machine_view.dtg.h", + "pcg/operator_task_space.dtg.h", ] src_includes = [ @@ -34,3 +35,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>" name = "output_shapes" type = "std::vector<::FlexFlow::ParallelTensorShape>" +[[fields]] +name = "op_task_space" +type = "::FlexFlow::OperatorTaskSpace" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index b21fea5f24..db2f4e6f0d 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -31,6 +31,8 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); make_singleton_machine_mapping_result(float runtime, MachineView const &machine_view); +[[nodiscard]] float get_runtime_cost(MachineMappingResult const &mm_result); + } // namespace FlexFlow #endif diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index e54ed925de..5bcab18930 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -4,7 +4,8 @@ #include "utils/containers/keys.h" #include 
"utils/containers/map_keys.h" #include "utils/containers/merge_maps.h" -#include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" +#include "utils/containers/transform.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" namespace FlexFlow { @@ -17,40 +18,39 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { return are_disjoint(keys(m1.machine_views), keys(m2.machine_views)); } -MachineMapping get_machine_mapping_from_machine_mapping_result( - PCGBinarySPDecomposition const &sp_decomposition, - MachineMappingResult const &mm_result) { +parallel_layer_guid_t + get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition, + BinaryTreePath const &path) { + std::optional subtree_optional = + get_subtree_at_path( + sp_decomposition, generic_impl_for_pcg_sp_tree(), path); + + if (!subtree_optional.has_value()) { + throw std::runtime_error(fmt::format("Invalid tree path {}", path)); + } - BinarySPDecompositionTree sp_tree = - binary_sp_tree_from_pcg_sp_tree(sp_decomposition); - - auto get_layer_from_path = - [&](BinaryTreePath const &path) -> parallel_layer_guid_t { - std::optional subtree_optional = - binary_sp_decomposition_tree_get_subtree_at_path(sp_tree, path); - if (!subtree_optional.has_value()) { - throw std::runtime_error(fmt::format("Invalid tree path {}", path)); - } - BinarySPDecompositionTree subtree = subtree_optional.value(); - if (!subtree.is_node()) { - throw std::runtime_error(fmt::format( - "Invalid tree path to a leaf: found {} instead", subtree)); - } - return parallel_layer_guid_t{ - subtree.require_node(), - }; - }; - - std::unordered_map mm; - - if (mm_result.raw_result) { - FeasibleMachineMappingResult const &feasible_mm_result = - mm_result.raw_result.value(); - mm = map_keys(feasible_mm_result.machine_mapping.raw_mapping, - get_layer_from_path); + PCGBinarySPDecomposition subtree = 
subtree_optional.value(); + if (!subtree.is_leaf()) { + throw std::runtime_error( + fmt::format("Invalid tree path to a leaf: found {} instead", subtree)); } + return subtree.require_leaf(); +} + +std::optional get_machine_mapping_from_machine_mapping_result( + PCGBinarySPDecomposition const &sp_decomposition, + MachineMappingResult const &mm_result) { - return MachineMapping{mm}; + return transform( + mm_result.raw_result, + [&](FeasibleMachineMappingResult const &feasible_mm_result) { + return MachineMapping{ + map_keys(feasible_mm_result.machine_mapping.raw_mapping, + [&](BinaryTreePath const &path) { + return get_layer_from_path(sp_decomposition, path); + }), + }; + }); } } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc index 990b287f8b..b6d701cb98 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc @@ -1,4 +1,5 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "pcg/operator_task_space.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" @@ -18,6 +19,8 @@ UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( transform(get_incoming_weights(pcg, layer), get_tensor_shape), /*output_shapes=*/ transform(get_layer_outputs(pcg, layer), get_tensor_shape), + /*op_task_space=*/ + get_operator_task_space(pcg, layer), }; } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..031b7f7fc5 100644 --- 
a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -135,4 +135,12 @@ MachineMappingResult }; } +float get_runtime_cost(MachineMappingResult const &mm_result) { + if (mm_result.raw_result == std::nullopt) { + return std::numeric_limits::infinity(); + } else { + return mm_result.raw_result.value().runtime; + } +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc index 01c9c645a6..3e2b2188b4 100644 --- a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -7,6 +7,7 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_result.h" #include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" #include "compiler/unity_algorithm/graph_optimize_state.h" #include "pcg/machine_specification.dtg.h" @@ -48,20 +49,6 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, std::vector const &substitutions, UnitySearchConfig const &search_config) { - // NOTE(@wmdi): This mapping is only used for allowed_machine_views - std::unordered_map - mapping_from_unmapped_op_cost_estimate_key_parallel_layer = [&] { - std::unordered_map - mapping; - for (parallel_layer_guid_t layer : get_parallel_layers(pcg)) { - // NOTE(@wmdi): Assume layers with the same key have the same allowed - // machine views - mapping.insert( - {get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), layer}); - } - return mapping; - }(); - MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache(); 
DeduplicatedPriorityQueue candidates; @@ -72,25 +59,12 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, MachineSpecification const &resources) -> std::unordered_set { return get_allowed_machine_views( - resources, - get_operator_task_space( - pcg, - mapping_from_unmapped_op_cost_estimate_key_parallel_layer.at( - key)), - DeviceType::GPU); + resources, key.op_task_space, DeviceType::GPU); }, }; - auto get_runtime_cost = [](MachineMappingResult const &mm_result) { - if (mm_result.raw_result == std::nullopt) { - return std::numeric_limits::infinity(); - } else { - return mm_result.raw_result.value().runtime; - } - }; - auto optimize_pcg = [&](ParallelComputationGraph const &pcg) - -> std::pair { + -> std::pair> { PCGBinarySPDecomposition sp_decomp = expect(get_pcg_balanced_binary_sp_decomposition(pcg), "Failed to get SP decomposition of PCG"); @@ -148,9 +122,16 @@ SearchResult graph_optimize(ParallelComputationGraph &pcg, } } + std::optional best_mapping = + optimize_pcg(best_state.pcg).second; + + if (best_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + return SearchResult{ /*pcg=*/best_state.pcg, - /*machine_mapping=*/optimize_pcg(best_state.pcg).second, + /*machine_mapping=*/best_mapping.value(), }; } From 45a931c997e696f577e9a97599b95c2db95a66a8 Mon Sep 17 00:00:00 2001 From: wmdi Date: Wed, 29 Jan 2025 21:01:00 -0500 Subject: [PATCH 16/16] fix --- .../machine_mapping}/allowed_machine_views.cc | 0 .../get_optimal_machine_mapping.cc | 5 + .../get_machine_mapping_problem_tree.cc | 18 +++ ...get_optimal_machine_mapping_with_memory.cc | 4 + .../unity_algorithm/graph_optimize_state.cc | 92 ++++++++++++++ .../unity_algorithm/unity_algorithm.cc | 115 ++++++++++++++++++ lib/compiler/test/src/graph_optimize_state.cc | 86 ------------- lib/compiler/test/src/unity_algorithm.cc | 27 ---- 8 files changed, 234 insertions(+), 113 deletions(-) rename lib/compiler/test/src/{ => 
compiler/machine_mapping}/allowed_machine_views.cc (100%) create mode 100644 lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc create mode 100644 lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc delete mode 100644 lib/compiler/test/src/graph_optimize_state.cc delete mode 100644 lib/compiler/test/src/unity_algorithm.cc diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc similarity index 100% rename from lib/compiler/test/src/allowed_machine_views.cc rename to lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index f5d5a5ee1b..4b77b3eebd 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -97,11 +97,15 @@ TEST_SUITE(FF_TEST_SUITE) { } }; + // Operator task spaces are not used in this test. Just make a placeholder. 
+ OperatorTaskSpace fake_op_task_space = OperatorTaskSpace {{}}; + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -114,6 +118,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; ParallelTensorShape tensor_shape1 = ParallelTensorShape{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 06ab1e5b8c..ee71222fe3 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,7 +1,11 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/containers/get_only.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/extend.h" +#include "pcg/operator_task_space.h" #include using namespace ::FlexFlow; @@ -93,6 +97,14 @@ TEST_SUITE(FF_TEST_SUITE) { PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{}}; + auto make_operator_task_space = [&](ParallelTensorShape const &shape) { + std::vector degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; + }; + auto 
make_input_key = [&](ParallelTensorShape const &parallel_tensor_shape) { return UnmappedOpCostEstimateKey{ @@ -100,6 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{parallel_tensor_shape}, + /*op_task_space=*/make_operator_task_space(parallel_tensor_shape), }; }; @@ -149,11 +162,14 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); + OperatorTaskSpace relu_task_space = get_operator_task_space(pcg, relu_layer); + UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/relu_attrs, /*input_shapes=*/{input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{relu_output_shape}, + /*op_task_space=*/relu_task_space, }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series( @@ -234,11 +250,13 @@ TEST_SUITE(FF_TEST_SUITE) { {input1_tensor, input2_tensor}, {make_output_attrs(ew_op_output_shape)}); parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer; + OperatorTaskSpace ew_op_task_space = get_operator_task_space(pcg, ew_op_layer); UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/ew_op_attrs, /*input_shapes=*/{input_shape, input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{ew_op_output_shape}, + /*op_task_space=*/ew_op_task_space, }; PCGBinarySPDecomposition sp_decomposition = diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8761116be2..cc1a1043cf 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -96,12 +96,15 @@ TEST_SUITE(FF_TEST_SUITE) { return std::unordered_set{mv2}; } }; +
+ OperatorTaskSpace fake_op_task_space = OperatorTaskSpace {{}}; UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -114,6 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; ParallelTensorShape tensor_shape1 = ParallelTensorShape{ diff --git a/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc new file mode 100644 index 0000000000..0d28cecac7 --- /dev/null +++ b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -0,0 +1,92 @@ +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("GraphOptimizeState::operator== and operator!=") { + ParallelComputationGraph pcg1 = empty_parallel_computation_graph(); + ParallelComputationGraph pcg2 = empty_parallel_computation_graph(); + + + ParallelTensorShape input_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{10, 1}, + }, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + auto make_output_attrs = [](ParallelTensorShape const &shape) { + return ParallelTensorAttrs{ + /*shape=*/shape, + /*sync_type=*/std::nullopt, + /*initializer=*/std::nullopt, + /*create_gradients=*/CreateGrad::YES, + }; + }; + + auto make_layer_attrs = [](PCGOperatorAttrs const &op_attrs) { + return ParallelLayerAttrs{ + /*op_attrs=*/op_attrs, + /*name=*/std::nullopt, + }; + }; + + PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{}}; + + add_parallel_layer( + pcg2, + /*layer_attrs=*/make_layer_attrs(input_attrs), + /*inputs=*/{}, + 
/*output_labels=*/{make_output_attrs(input_shape)}); + + SUBCASE("same pcgs") { + GraphOptimizeState state1 = GraphOptimizeState(pcg1, 0.0); + GraphOptimizeState state2 = GraphOptimizeState(pcg1, 0.0); + bool result_eq = state1 == state2; + bool expected_eq = true; + CHECK(result_eq == expected_eq); + bool result_neq = state1 != state2; + bool expected_neq = false; + CHECK(result_neq == expected_neq); + } + + SUBCASE("different pcgs with the same runtime") { + GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0); + GraphOptimizeState state2 = GraphOptimizeState(pcg2, 1.0); + bool result_eq = state1 == state2; + bool expected_eq = false; + CHECK(result_eq == expected_eq); + bool result_neq = state1 != state2; + bool expected_neq = true; + CHECK(result_neq == expected_neq); + } + + SUBCASE("different pcgs with different runtime") { + GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0); + GraphOptimizeState state2 = GraphOptimizeState(pcg2, 2.0); + bool result_eq = state1 == state2; + bool expected_eq = false; + CHECK(result_eq == expected_eq); + bool result_neq = state1 != state2; + bool expected_neq = true; + CHECK(result_neq == expected_neq); + } + } + + TEST_CASE("GraphOptimizeState::operator<") { + ParallelComputationGraph pcg1 = empty_parallel_computation_graph(); + ParallelComputationGraph pcg2 = empty_parallel_computation_graph(); + GraphOptimizeState state1 = GraphOptimizeState(pcg1, 1.0); + GraphOptimizeState state2 = GraphOptimizeState(pcg2, 2.0); + bool result = state1 < state2; + bool expected = true; + CHECK(result == expected); + } +} diff --git a/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..447a6a04b2 --- /dev/null +++ b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,115 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "pcg/computation_graph_builder.h" +#include 
"../machine_mapping/cost_estimator_for_test.h" +#include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "op-attrs/replica_type.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "utils/integer_conversions.h" +#include "doctest/doctest.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("graph_optimize") { + // TODO: recover this by implementing parallel_computation_graph_from_computation_graph + // ComputationGraph cg = [&] { + // ComputationGraphBuilder b; + // TensorShape input_tensor_shape = TensorShape{ + // TensorDims{ + // FFOrdered {32, 64}, + // }, + // DataType::FLOAT, + // }; + // tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + // t = b.dense(t, + // /*outDim=*/16, + // /*activation=*/std::nullopt); + // t = b.gelu(t); + // t = b.dense(t, + // /*outDim=*/12, + // /*activation=*/std::nullopt, + // /*use_bias=*/false, + // /*data_type=*/DataType::FLOAT, + // /*kernel_initializer=*/std::nullopt, + // /*bias_initializer=*/std::nullopt); + // t = b.relu(t); + // t = b.dense(t, + // /*outDim=*/8, + // /*activation=*/Activation::RELU); + // return b.computation_graph; + // }(); + + // ParallelComputationGraph pcg = parallel_computation_graph_from_computation_graph(cg); + + ParallelComputationGraph pcg = [&] { + ParallelComputationGraphBuilder b; + int in_channels = 24; + int batch_size = 4; + int batch_degree = 2; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{size_t_from_int(batch_size), batch_degree}, + ShardParallelDim{size_t_from_int(in_channels), 1}, + }, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12, + /*activation=*/std::nullopt, + 
/*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/8, + /*activation=*/Activation::RELU); + + return b.pcg; + }(); + + CostEstimator cost_estimator = make_fake_cost_estimator([](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*runtime=*/1.0, + /*memory=*/1, + }; + }, + [](TensorSetMovement const &) { + return 1.0; + }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/2, + /*num_cpus_per_node=*/1, + /*num_gpus_per_node=*/1, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + // TODO: set up substitutions + std::vector substitutions = {}; + + UnitySearchConfig search_config = UnitySearchConfig{ + /*alpha=*/1.0, + /*budget=*/20, + /*threshold=*/1000.0, + /*max_num_ops=*/100, + }; + + // SearchResult result = graph_optimize(pcg, cost_estimator, full_machine_spec, substitutions, search_config); + + // TODO: check the result + } +} diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc deleted file mode 100644 index 0be6d0a048..0000000000 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ /dev/null @@ -1,86 +0,0 @@ -#include "compiler/unity_algorithm/graph_optimize_state.h" -#include "doctest/doctest.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" - -using namespace FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - // TODO(@wmdi): to be udpated - // TEST_CASE("GraphOptimizeState::operator==") { - // ParallelComputationGraphBuilder builder; - - // ParallelTensorShape input_shape = - // ParallelTensorShape{ParallelTensorDims{ - // FFOrdered{ - // ShardParallelDim{32, 2}, - // ShardParallelDim{16, 1}, - // }, - // ReplicaParallelDimSet{ - // SumDegree{1}, - // DiscardCopyDegree{1}, - // }, - // }, - // DataType::FLOAT}; - - // parallel_tensor_guid_t input0 = - // builder.create_input_tensor(input_shape, 
CreateGrad::YES, - // "input0"); - // parallel_tensor_guid_t dense0 = builder.dense(input0, - // 8, - // Activation::RELU, - // true, - // DataType::FLOAT, - // std::nullopt, - // std::nullopt, - // "dense0"); - - // parallel_tensor_guid_t dense1 = builder.dense(dense0, - // 4, - // Activation::RELU, - // true, - // DataType::FLOAT, - // std::nullopt, - // std::nullopt, - // "dense1"); - - // ParallelComputationGraph pcg = builder.pcg; - - // // `machine_mapping` is determined by the PCG and the device mapping - // // algorithm, and `runtime` is determined by the PCG and the device - // mapping, - // // so their values here do not matter. - // std::unordered_map - // empty_machine_views; MachineMapping - // empty_machine_mapping(empty_machine_views); bool result1 = - // GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), - // 0) == - // GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), - // 0); - // bool correct1 = true; - // CHECK(result1 == correct1); - - // ParallelComputationGraphBuilder builder_; - - // parallel_tensor_guid_t input0_ = - // builder.create_input_tensor(input_shape, CreateGrad::YES, - // "input0"); - // parallel_tensor_guid_t dense0_ = builder.dense(input0, - // 8, - // Activation::RELU, - // true, - // DataType::FLOAT, - // std::nullopt, - // std::nullopt, - // "dense0"); - - // ParallelComputationGraph pcg_ = builder.pcg; - - // bool result2 = - // GraphOptimizeState(GraphOptimizeResult(pcg, empty_machine_mapping), - // 0) == - // GraphOptimizeState(GraphOptimizeResult(pcg_, - // empty_machine_mapping), 0); - // bool correct2 = false; - // CHECK(result2 == correct2); - // } -} diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc deleted file mode 100644 index d8523f6659..0000000000 --- a/lib/compiler/test/src/unity_algorithm.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "compiler/unity_algorithm/unity_algorithm.h" -#include "doctest/doctest.h" - 
-TEST_SUITE(FF_TEST_SUITE) { - // TODO: to be udpated - // Rapidcheck does not work for now - // TEST_CASE("graph_optimize") { - // RC_SUBCASE([](ComputationGraph const &g, - // float alpha, - // int budget, - // float threshold, - // int max_num_ops) { - // Strategy s = graph_optimize( - // g, - // TestCostEstimator{}, - // MachineSpecification{1, 1, 4, 0.1, 0.2}, - // [](Operator const &, MachineSpecification const &) { - // return std::unordered_set{make_1d_machine_view(0, 1, - // 1)}; - // }, - // OptimizerConfig{alpha, budget, threshold, max_num_ops}); - // RC_ASSERT(get_nodes(s.pcg).size() > 0); - // RC_ASSERT(s.machine_mapping.runtime > 0); - // RC_ASSERT(keys(s.machine_mapping.machine_views) == get_nodes(s.pcg)); - // }); - // } -}