diff --git a/docs/algorithms/mapper.rst b/docs/algorithms/mapper.rst
index 816d4f6b9..14360efad 100644
--- a/docs/algorithms/mapper.rst
+++ b/docs/algorithms/mapper.rst
@@ -1,3 +1,87 @@
+Extended technology mapping
+---------------------------
+
+**Header:** ``mockturtle/algorithms/emap.hpp``
+
+The command `emap` stands for extended mapper. It supports large
+library cells, of more than 6 inputs, and can perform matching using 3
+different methods: Boolean, pattern, or hybrid. The current version
+can map to 2-output gates, such as full adders and half adders,
+and provides a 2x speedup in mapping time compared to command `map`
+for similar or better quality. Similarly to `map`, the implementation
+is independent of the underlying graph representation.
+Additionally, `emap` supports "don't touch" white boxes (gates).
+
+Command `emap` can return the mapped network in two formats.
+Command `emap` returns a `cell_view<block_network>` that supports
+multi-output cells. Command `emap_klut` returns a `binding_view<klut_network>`
+similarly to command `map`.
+
+The following example shows how to perform delay-oriented technology mapping
+from an and-inverter graph using large cells up to 9 inputs:
+
+.. code-block:: c++
+
+   aig_network aig = ...;
+
+   /* read cell library in genlib format */
+   std::vector<gate> gates;
+   std::ifstream in( ... );
+   lorina::read_genlib( in, genlib_reader( gates ) );
+   tech_library<9> tech_lib( gates );
+
+   /* perform technology mapping */
+   cell_view<block_network> res = emap<9>( aig, tech_lib );
+
+The next example performs area-oriented graph mapping using multi-output cells:
+
+.. code-block:: c++
+
+   aig_network aig = ...;
+
+   /* read cell library in genlib format */
+   std::vector<gate> gates;
+   std::ifstream in( ... );
+   lorina::read_genlib( in, genlib_reader( gates ) );
+   tech_library tech_lib( gates );
+
+   /* perform technology mapping */
+   emap_params ps;
+   ps.area_oriented_mapping = true;
+   ps.map_multioutput = true;
+   cell_view<block_network> res = emap( aig, tech_lib, ps );
+
+In this case, `emap` is used to return a `block_network`, which can represent multi-output
+cells as single nodes. Alternatively, `emap_klut` can also be used, but multi-output cells
+would be represented by single-output nodes.
+
+The maximum number of cuts stored for each node is limited to 20.
+To increase this limit, change `max_cut_num` in `emap`.
+
+You can set the input arrival times and output required times using the parameters `arrival_times`
+and `required_times`. Moreover, it is possible to ask for a required time relaxation. For instance,
+if we want to map a network with an increase of 10% over its minimal delay, we can set
+`relax_required` to 10.
+
+For further details and usage scenarios of `emap`, such as white boxes, please check the
+related tests.
+
+**Parameters and statistics**
+
+.. doxygenstruct:: mockturtle::emap_params
+   :members:
+
+.. doxygenstruct:: mockturtle::emap_stats
+   :members:
+
+**Algorithm**
+
+.. doxygenfunction:: mockturtle::emap(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_klut(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_node_map(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_load_mapping(Ntk&)
+
+
 Technology mapping and network conversion
 -----------------------------------------
 
@@ -136,84 +220,4 @@ To increase this limit, change `max_cut_num` in `fast_network_cuts`.
 **Algorithm**
 
 .. doxygenfunction:: mockturtle::map(Ntk const&, tech_library<NInputs, Configuration> const&, map_params const&, map_stats*)
-.. doxygenfunction:: mockturtle::map(Ntk&, exact_library<NtkDest, RewritingFn, NInputs> const&, map_params const&, map_stats*)
-
-
-
-Extended technology mapping
----------------------------
-
-**Header:** ``mockturtle/algorithms/emap.hpp``
-
-The command `emap` stands for extended mapper. It supports large
-library cells, of more than 6 inputs, and can perform matching using 3
-different methods: Boolean, pattern, or hybrid. The current version
-can map to 2-output gates, such as full adders and half adders,
-and provides a 2x speedup in mapping time compared to command `map`
-for similar or better quality. Similarly, to `map`, the implementation
-is independent of the underlying graph representation.
-Additionally, `emap` supports "don't touch" white boxes (gates).
-
-Command `emap` can return the mapped network in two formats.
-Command `emap` returns a `cell_view<block_network>` that supports
-multi-output cells. Command `emap_klut` returns a `binding_view<klut_network>`
-similarly as command `map`.
-
-The following example shows how to perform delay-oriented technology mapping
-from an and-inverter graph using large cells up to 9 inputs:
-
-.. code-block:: c++
-
-   aig_network aig = ...;
-
-   /* read cell library in genlib format */
-   std::vector<gate> gates;
-   std::ifstream in( ... );
-   lorina::read_genlib( in, genlib_reader( gates ) )
-   tech_library<9> tech_lib( gates );
-
-   /* perform technology mapping */
-   cell_view<block_network> res = emap<9>( aig, tech_lib );
-
-The next example performs area-oriented graph mapping using multi-output cells:
-
-.. code-block:: c++
-
-   aig_network aig = ...;
-
-   /* read cell library in genlib format */
-   std::vector<gate> gates;
-   std::ifstream in( ... );
-   lorina::read_genlib( in, genlib_reader( gates ) )
-   tech_library tech_lib( gates );
-
-   /* perform technology mapping */
-   emap_params ps;
-   ps.area_oriented_mapping = true;
-   ps.map_multioutput = true;
-   cell_view<block_network> res = emap( aig, tech_lib, ps );
-
-In this case, `emap` is used to return a `block_network`, which can respresent multi-output
-cells as single nodes. Alternatively, also `emap_klut` can be used but multi-output cells
-would be reporesented by single-output nodes.
-
-The maximum number of cuts stored for each node is limited to 32.
-To increase this limit, change `max_cut_num` in `emap`.
-
-For further details and usage scenarios of `emap`, such as white boxes, please check the
-related tests.
-
-**Parameters and statistics**
-
-.. doxygenstruct:: mockturtle::emap_params
-   :members:
-
-.. doxygenstruct:: mockturtle::emap_stats
-   :members:
-
-**Algorithm**
-
-.. doxygenfunction:: mockturtle::emap(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_klut(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_node_map(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_load_mapping(Ntk&)
\ No newline at end of file
+.. doxygenfunction:: mockturtle::map(Ntk&, exact_library<NtkDest, NInputs> const&, map_params const&, map_stats*)
\ No newline at end of file
diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index 92fed626a..55ced8457 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -65,6 +65,7 @@ int main()
   }
 
   tech_library_params tps;
+  tps.ignore_symmetries = false; // set to true to drastically speed-up mapping with minor delay increase
   tps.verbose = true;
   tech_library<9> tech_lib( gates, tps );
 
@@ -91,6 +92,7 @@ int main()
     ps.matching_mode = emap_params::hybrid;
     ps.area_oriented_mapping = false;
     ps.map_multioutput = true;
+    ps.relax_required = 0;
     emap_stats st;
     cell_view<block_network> res = emap<9>( aig, tech_lib, ps, &st );
 
diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index aaf9b7285..2ca21c5b6 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -107,14 +107,20 @@ struct emap_params
     hybrid
   } matching_mode = hybrid;
 
-  /*! \brief Required time for delay optimization. */
+  /*! \brief Target required time (for each PO). */
   double required_time{ 0.0f };
 
-  /*! \brief Required time relaxation ratio. */
+  /*! \brief Required time relaxation in percentage (10 = 10%). */
   double relax_required{ 0.0f };
 
+  /*! \brief Custom input arrival times. */
+  std::vector<double> arrival_times{};
+
+  /*! \brief Custom output required times. */
+  std::vector<double> required_times{};
+
   /*! \brief Number of rounds for area flow optimization. */
-  uint32_t area_flow_rounds{ 2u };
+  uint32_t area_flow_rounds{ 3u };
 
   /*! \brief Number of rounds for exact area optimization. */
   uint32_t ela_rounds{ 2u };
@@ -125,8 +131,8 @@ struct emap_params
   /*! \brief Number of patterns for switching activity computation. */
   uint32_t switching_activity_patterns{ 2048u };
 
-  /*! \brief Fast area recovery */
-  bool use_fast_area_recovery{ true };
+  /*! \brief Compute area-oriented alternative matches */
+  bool use_match_alternatives{ true };
 
   /*! \brief Remove the cuts that are contained in others */
   bool remove_dominated_cuts{ false };
@@ -134,9 +140,6 @@ struct emap_params
   /*! \brief Remove overlapping multi-output cuts */
   bool remove_overlapping_multicuts{ false };
 
-  /*! \brief Doesn't allow node duplication */
-  bool allow_node_duplication{ true };
-
   /*! \brief Be verbose. */
   bool verbose{ false };
 };
@@ -671,15 +674,29 @@ struct emap_triple_hash
 };
 #pragma endregion
 
+template<unsigned NInputs>
+struct best_gate_emap
+{
+  supergate<NInputs> const* gate;
+  double arrival;
+  float area;
+  float flow;
+  unsigned phase : 16;
+  unsigned cut : 12;
+  unsigned size : 4;
+};
+
 template<unsigned NInputs>
 struct node_match_emap
 {
   /* best gate match for positive and negative output phases */
-  supergate<NInputs> const* best_supergate[2];
+  supergate<NInputs> const* best_gate[2];
+  /* alternative best gate for positive and negative output phases */
+  best_gate_emap<NInputs> best_alternative[2];
   /* fanin pin phases for both output phases */
   uint16_t phase[2];
   /* best cut index for both phases */
-  uint32_t best_cut[2];
+  uint16_t best_cut[2];
   /* node is mapped using only one phase */
   bool same_match;
   /* node is mapped to a multi-output gate */
@@ -700,20 +717,32 @@ struct node_match_emap
   float flows[2];
 };
 
-union multi_match_data
+template<class Ntk, unsigned CutSize, unsigned NInputs, classification_type Configuration>
+class emap_impl
 {
-  uint64_t data{ 0 };
-  struct
+private:
+  union multi_match_data
   {
-    uint64_t in_tfi : 1;
-    uint64_t cut_index : 31;
-    uint64_t node_index : 32;
+    uint64_t data{ 0 };
+    struct
+    {
+      uint64_t in_tfi : 1;
+      uint64_t cut_index : 31;
+      uint64_t node_index : 32;
+    };
+  };
+  union multioutput_info
+  {
+    uint32_t data;
+    struct
+    {
+      unsigned index : 29;
+      unsigned lowest_index : 1;
+      unsigned highest_index : 1;
+      unsigned has_info : 1;
+    };
   };
-};
 
-template<class Ntk, unsigned CutSize, unsigned NInputs, classification_type Configuration>
-class emap_impl
-{
 public:
   static constexpr float epsilon = 0.0005;
   static constexpr uint32_t max_cut_num = 20;
@@ -750,10 +779,11 @@ class emap_impl
         ps( ps ),
         st( st ),
         node_match( ntk.size() ),
-        node_tuple_match( ntk.size(), UINT32_MAX ),
+        node_tuple_match( ntk.size() ),
         switch_activity( ps.eswp_rounds ? switching_activity( ntk, ps.switching_activity_patterns ) : std::vector<float>( 0 ) ),
         cuts( ntk.size() )
   {
+    std::memset( node_tuple_match.data(), 0, sizeof( multioutput_info ) * ntk.size() );
     std::tie( lib_inv_area, lib_inv_delay, lib_inv_id ) = library.get_inverter_info();
     std::tie( lib_buf_area, lib_buf_delay, lib_buf_id ) = library.get_buffer_info();
     tmp_visited.reserve( 100 );
@@ -765,10 +795,11 @@ class emap_impl
         ps( ps ),
         st( st ),
         node_match( ntk.size() ),
-        node_tuple_match( ntk.size(), UINT32_MAX ),
+        node_tuple_match( ntk.size() ),
         switch_activity( switch_activity ),
         cuts( ntk.size() )
   {
+    std::memset( node_tuple_match.data(), 0, sizeof( multioutput_info ) * ntk.size() );
     std::tie( lib_inv_area, lib_inv_delay, lib_inv_id ) = library.get_inverter_info();
     std::tie( lib_buf_area, lib_buf_delay, lib_buf_id ) = library.get_buffer_info();
     tmp_visited.reserve( 100 );
@@ -789,6 +820,10 @@ class emap_impl
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* search for large matches */
     if ( ps.matching_mode == emap_params::structural || CutSize > 6 )
     {
@@ -843,6 +878,10 @@ class emap_impl
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* search for large matches */
     if ( ps.matching_mode == emap_params::structural || CutSize > 6 )
     {
@@ -888,11 +927,15 @@ class emap_impl
 
     auto [res, old2new] = initialize_map_network();
 
-    /* TODO: multi-output support is currently not implemented */
+    /* [i] multi-output support is currently not implemented */
 
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* compute cuts, matches, and initial mapping */
     if ( !ps.area_oriented_mapping )
     {
@@ -930,7 +973,6 @@ class emap_impl
     uint32_t i = 0;
     while ( i++ < ps.area_flow_rounds )
     {
-      compute_required_time();
       if ( !compute_mapping<true>() )
       {
         return false;
@@ -939,54 +981,22 @@ class emap_impl
 
     /* compute mapping using exact area */
     i = 0;
-    if ( ps.use_fast_area_recovery )
+    compute_required_time( true );
+    while ( i++ < ps.ela_rounds )
     {
-      compute_required_time( true );
-      reindex_multioutput_data();
-      while ( i++ < ps.ela_rounds )
+      if ( !compute_mapping_exact_reversed<false>() )
       {
-        if ( !compute_mapping_exact_reversed<false>( i == ps.ela_rounds ) )
-        {
-          return false;
-        }
-      }
-
-      /* compute mapping using exact switching activity estimation */
-      i = 0;
-      while ( i++ < ps.eswp_rounds )
-      {
-        if ( !compute_mapping_exact_reversed<true>( true ) )
-        {
-          return false;
-        }
+        return false;
       }
     }
-    else
-    {
-      while ( i++ < ps.ela_rounds )
-      {
-        compute_required_time();
-        if ( !compute_mapping_exact<false>( i == ps.ela_rounds ) )
-        {
-          return false;
-        }
-      }
 
-      /* compute mapping using exact switching activity estimation */
-      i = 0;
-      while ( i++ < ps.eswp_rounds )
-      {
-        compute_required_time();
-        if ( !compute_mapping_exact<true>( true ) )
-        {
-          return false;
-        }
-      }
-
-      /* cleaning not fully utilized multi-output gates */
-      if ( ps.map_multioutput )
+    /* compute mapping using exact switching activity estimation */
+    i = 0;
+    while ( i++ < ps.eswp_rounds )
+    {
+      if ( !compute_mapping_exact_reversed<true>() )
       {
-        remove_unused_multioutput();
+        return false;
       }
     }
 
@@ -1008,6 +1018,12 @@ class emap_impl
         continue;
       }
 
+      /* load multi-output cuts and data */
+      if ( ps.map_multioutput && node_tuple_match[index].has_info )
+      {
+        match_multi_add_cuts( n );
+      }
+
       /* match positive phase */
       match_phase<DO_AREA>( n, 0u );
 
@@ -1015,26 +1031,24 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
 
-      /* load and try a multi-output matches */
-      if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
-      {
-        /* continue if matches do not fit in the cut data structure due to bad settings */
-        if ( !match_multi_add_cuts<DO_AREA>( n ) )
-          continue;
+      /* select alternative matches to use */
+      select_alternatives<DO_AREA>( n );
 
-        if constexpr ( DO_AREA )
+      /* try multi-output matches */
+      if constexpr ( DO_AREA )
+      {
+        if ( ps.map_multioutput && node_tuple_match[index].highest_index )
         {
-          bool multi_success = match_multioutput<DO_AREA>( n );
-          if ( multi_success )
+          if ( match_multioutput<DO_AREA>( n ) )
             multi_node_update<DO_AREA>( n );
         }
       }
     }
 
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     if ( warning_box )
     {
@@ -1078,7 +1092,9 @@ class emap_impl
     {
       /* all terminals have flow 0.0 */
       node_data.flows[0] = node_data.flows[1] = 0.0f;
+      node_data.best_alternative[0].flow = node_data.best_alternative[1].flow = 0.0f;
       node_data.arrival[0] = node_data.arrival[1] = 0.0f;
+      node_data.best_alternative[0].arrival = node_data.best_alternative[1].arrival = 0.0f;
       /* skip if cuts have been computed before */
       if ( cuts[index].size() == 0 )
       {
@@ -1090,10 +1106,10 @@ class emap_impl
     else if ( ntk.is_pi( n ) )
     {
       node_data.flows[0] = 0.0f;
-      node_data.arrival[0] = 0.0f;
+      node_data.best_alternative[0].flow = 0.0f;
       /* PIs have the negative phase implemented with an inverter */
       node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
-      node_data.arrival[1] = lib_inv_delay;
+      node_data.best_alternative[1].flow = lib_inv_area / node_data.est_refs[1];
       /* skip if cuts have been computed before */
       if ( cuts[index].size() == 0 )
       {
@@ -1110,7 +1126,6 @@ class emap_impl
     {
       if ( ntk.is_dont_touch( n ) )
       {
-        
         warning_box |= initialize_box( n );
         return false;
       }
@@ -1197,7 +1212,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1284,7 +1299,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1307,7 +1322,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1481,7 +1496,7 @@ class emap_impl
       auto const index = ntk.node_to_index( n );
       auto& node_data = node_match[index];
 
-      node_data.best_supergates[0] = node_data.best_supergates[1] = nullptr;
+      node_data.best_gate[0] = node_data.best_gate[1] = nullptr;
       node_data.same_match = 0;
       node_data.multioutput_match[0] = node_data.multioutput_match[1] = false;
       node_data.required[0] = node_data.required[1] = std::numeric_limits<float>::max();
@@ -1501,10 +1516,8 @@ class emap_impl
       {
         /* all terminals have flow 0 */
         node_data.flows[0] = 0.0f;
-        node_data.arrival[0] = 0.0f;
         /* PIs have the negative phase implemented with an inverter */
         node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
-        node_data.arrival[1] = lib_inv_delay;
         add_unit_cut( index );
         continue;
       }
@@ -1519,10 +1532,13 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
+
+      /* select alternative matches to use */
+      select_alternatives<DO_AREA>( n );
     }
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     /* round stats */
     if ( ps.verbose )
@@ -1567,7 +1583,7 @@ class emap_impl
     new_cut->function = kitty::extend_to<6>( ntk.node_function( n ) );
 
     /* match cut and compute data */
-    compute_cut_data<DO_AREA>( new_cut, n );
+    compute_cut_data( new_cut, n );
 
     ++cuts_total;
   }
@@ -1587,6 +1603,7 @@ class emap_impl
       if ( ntk.is_pi( n ) )
       {
         node_match[index].flows[1] = lib_inv_area / node_match[index].est_refs[1];
+        node_match[index].best_alternative[1].flow = lib_inv_area / node_match[index].est_refs[1];
         continue;
       }
 
@@ -1610,12 +1627,12 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
 
       /* try a multi-output match */
       if constexpr ( DO_AREA )
       {
-        if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
+        if ( ps.map_multioutput && node_tuple_match[index].highest_index )
         {
           bool multi_success = match_multioutput<DO_AREA>( n );
           if ( multi_success )
@@ -1628,7 +1645,7 @@ class emap_impl
     }
 
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     /* round stats */
     if ( ps.verbose )
@@ -1654,85 +1671,8 @@ class emap_impl
   }
 
   template<bool SwitchActivity>
-  bool compute_mapping_exact( bool last_round )
-  {
-    for ( auto const& n : topo_order )
-    {
-      if ( ntk.is_constant( n ) || ntk.is_pi( n ) )
-        continue;
-
-      /* don't touch box */
-      if constexpr ( has_is_dont_touch_v<Ntk> )
-      {
-        if ( ntk.is_dont_touch( n ) )
-        {
-          if constexpr ( has_has_binding_v<Ntk> )
-          {
-            propagate_data_forward_white_box( n );
-          }
-          continue;
-        }
-      }
-
-      auto index = ntk.node_to_index( n );
-      auto& node_data = node_match[index];
-
-      /* recursively deselect the best cut shared between
-       * the two phases if in use in the cover */
-      if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
-      {
-        uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
-        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
-        cut_deref<SwitchActivity>( best_cut, n, use_phase );
-      }
-
-      /* match positive phase */
-      match_phase_exact<SwitchActivity>( n, 0u );
-
-      /* match negative phase */
-      match_phase_exact<SwitchActivity>( n, 1u );
-
-      /* try to drop one phase */
-      match_drop_phase<true, true>( n, 0 );
-
-      /* try a multi-output match */
-      if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
-      {
-        bool multi_success = match_multioutput_exact<SwitchActivity>( n, last_round );
-        if ( multi_success )
-          multi_node_update_exact<SwitchActivity>( n );
-      }
-
-      if ( node_match[index].map_refs[0] )
-        assert( node_match[index].arrival[0] < node_match[index].required[0] + epsilon );
-      if ( node_match[index].map_refs[1] )
-        assert( node_match[index].arrival[1] < node_match[index].required[1] + epsilon );
-    }
-
-    double area_old = area;
-    bool success = set_mapping_refs<true>();
-
-    /* round stats */
-    if ( ps.verbose )
-    {
-      float area_gain = float( ( area_old - area ) / area_old * 100 );
-      std::stringstream stats{};
-      if constexpr ( SwitchActivity )
-        stats << fmt::format( "[i] Switching: Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      else
-        stats << fmt::format( "[i] Area     : Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      st.round_stats.push_back( stats.str() );
-    }
-
-    return success;
-  }
-
-  template<bool SwitchActivity>
-  bool compute_mapping_exact_reversed( bool last_round )
+  bool compute_mapping_exact_reversed()
   {
-    /* this method works in reverse topological order: less nodes to update (faster) */
-    /* instead of propagating arrival times forward, it propagates required times backwards */
-
     for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
     {
       if ( ntk.is_constant( *it ) || ntk.is_pi( *it ) )
@@ -1761,7 +1701,7 @@ class emap_impl
 
       /* recursively deselect the best cut shared between
        * the two phases if in use in the cover */
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
       double old_required = -1;
       if ( node_data.same_match )
       {
@@ -1775,6 +1715,13 @@ class emap_impl
           node_data.required[use_phase] = std::min( node_data.required[use_phase], node_data.required[use_phase ^ 1] - lib_inv_delay );
         }
       }
+      else if ( !node_data.map_refs[0] || !node_data.map_refs[1] )
+      {
+        use_phase = node_data.map_refs[0] ? 0 : 1;
+        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+        cut_deref<SwitchActivity>( best_cut, *it, use_phase );
+        node_data.same_match = true;
+      }
 
       /* match positive phase */
       match_phase_exact<SwitchActivity>( *it, 0u );
@@ -1789,10 +1736,10 @@ class emap_impl
       }
 
       /* try to drop one phase */
-      match_drop_phase<true, true>( *it, 0 );
+      match_drop_phase<true, true, SwitchActivity>( *it );
 
-      /* try a multi-output match */
-      if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 )
+      /* try a multi-output match */ /* TODO: fix the required time*/
+      if ( ps.map_multioutput && node_tuple_match[index].lowest_index )
       {
         bool mapped = match_multioutput_exact<SwitchActivity>( *it, true );
 
@@ -1850,10 +1797,10 @@ class emap_impl
     auto& node_data = node_match[index];
 
     /* propagate required time through the leaves */
-    unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
+    unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
     unsigned other_phase = use_phase ^ 1;
 
-    assert( node_data.best_supergate[0] != nullptr || node_data.best_supergate[1] != nullptr );
+    assert( node_data.best_gate[0] != nullptr || node_data.best_gate[1] != nullptr );
     // assert( node_data.map_refs[0] || node_data.map_refs[1] );
 
     /* propagate required time over the output inverter if present */
@@ -1871,7 +1818,7 @@ class emap_impl
     {
       auto ctr = 0u;
       auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
-      auto const& supergate = node_data.best_supergate[use_phase];
+      auto const& supergate = node_data.best_gate[use_phase];
       for ( auto leaf : best_cut )
       {
         auto phase = ( node_data.phase[use_phase] >> ctr ) & 1;
@@ -1884,7 +1831,7 @@ class emap_impl
     {
       auto ctr = 0u;
       auto const& best_cut = cuts[index][node_data.best_cut[other_phase]];
-      auto const& supergate = node_data.best_supergate[other_phase];
+      auto const& supergate = node_data.best_gate[other_phase];
       for ( auto leaf : best_cut )
       {
         auto phase = ( node_data.phase[other_phase] >> ctr ) & 1;
@@ -1930,7 +1877,162 @@ class emap_impl
         if ( node_data.map_refs[0] || node_data.map_refs[1] )
         {
           /* if used and not available in the library launch a mapping error */
-          if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+          if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
+          {
+            std::cerr << "[e] MAP ERROR: technology library does not contain constant gates, impossible to perform mapping" << std::endl;
+            st.mapping_error = true;
+            return false;
+          }
+        }
+        continue;
+      }
+      else if ( ntk.is_pi( *it ) )
+      {
+        if ( node_match[index].map_refs[1] > 0u )
+        {
+          /* Add inverter area over the negated fanins */
+          area += lib_inv_area;
+          ++inv;
+        }
+        continue;
+      }
+
+      /* continue if not referenced in the cover */
+      if ( !node_match[index].map_refs[0] && !node_match[index].map_refs[1] )
+        continue;
+
+      /* don't touch box */
+      if constexpr ( has_is_dont_touch_v<Ntk> )
+      {
+        if ( ntk.is_dont_touch( *it ) )
+        {
+          set_mapping_refs_dont_touch<ELA>( *it );
+          continue;
+        }
+      }
+
+      unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
+
+      if ( node_data.best_gate[use_phase] == nullptr )
+      {
+        /* Library is not complete, mapping is not possible */
+        std::cerr << "[e] MAP ERROR: technology library is not complete, impossible to perform mapping" << std::endl;
+        st.mapping_error = true;
+        return false;
+      }
+
+      if ( node_data.same_match || node_data.map_refs[use_phase] > 0 )
+      {
+        if constexpr ( !ELA )
+        {
+          auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+          auto ctr = 0u;
+
+          for ( auto const leaf : best_cut )
+          {
+            if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
+              node_match[leaf].map_refs[1]++;
+            else
+              node_match[leaf].map_refs[0]++;
+          }
+        }
+        area += node_data.area[use_phase];
+        if ( node_data.same_match && node_data.map_refs[use_phase ^ 1] > 0 )
+        {
+          if ( iteration < ps.area_flow_rounds )
+          {
+            ++node_data.map_refs[use_phase];
+          }
+          area += lib_inv_area;
+          ++inv;
+        }
+      }
+
+      /* invert the phase */
+      use_phase = use_phase ^ 1;
+
+      /* if both phases are implemented and used */
+      if ( !node_data.same_match && node_data.map_refs[use_phase] > 0 )
+      {
+        if constexpr ( !ELA )
+        {
+          auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+
+          auto ctr = 0u;
+          for ( auto const leaf : best_cut )
+          {
+            if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
+              node_match[leaf].map_refs[1]++;
+            else
+              node_match[leaf].map_refs[0]++;
+          }
+        }
+        area += node_data.area[use_phase];
+      }
+    }
+
+    ++iteration;
+
+    if constexpr ( ELA )
+    {
+      return true;
+    }
+
+    /* blend estimated references */
+    float const coef = 1.0f / ( ( iteration + 1.0f ) * ( iteration + 1.0f ) );
+    for ( auto i = 0u; i < ntk.size(); ++i )
+    {
+      node_match[i].est_refs[0] = std::max( 1.0f, coef * node_match[i].est_refs[0] + ( 1 - coef ) * node_match[i].map_refs[0] );
+      node_match[i].est_refs[1] = std::max( 1.0f, coef * node_match[i].est_refs[1] + ( 1 - coef ) * node_match[i].map_refs[1] );
+    }
+
+    return true;
+  }
+
+  template<bool DO_AREA, bool ELA>
+  bool set_mapping_refs_and_req()
+  {
+    for ( auto i = 0u; i < node_match.size(); ++i )
+    {
+      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
+    }
+
+    /* compute the current worst delay and update the mapping refs */
+    delay = 0.0f;
+    ntk.foreach_po( [this]( auto s ) {
+      const auto index = ntk.node_to_index( ntk.get_node( s ) );
+
+      if ( ntk.is_complemented( s ) )
+        delay = std::max( delay, node_match[index].arrival[1] );
+      else
+        delay = std::max( delay, node_match[index].arrival[0] );
+
+      if constexpr ( !ELA )
+      {
+        if ( ntk.is_complemented( s ) )
+          node_match[index].map_refs[1]++;
+        else
+          node_match[index].map_refs[0]++;
+      }
+    } );
+
+    set_output_required_time( iteration == 0 );
+
+    /* compute current area and update mapping refs in top-down order */
+    area = 0.0f;
+    inv = 0;
+    for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
+    {
+      const auto index = ntk.node_to_index( *it );
+      auto& node_data = node_match[index];
+
+      /* skip constants and PIs */
+      if ( ntk.is_constant( *it ) )
+      {
+        if ( node_match[index].map_refs[0] || node_match[index].map_refs[1] )
+        {
+          /* if used and not available in the library launch a mapping error */
+          if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           {
             std::cerr << "[e] MAP ERROR: technology library does not contain constant gates, impossible to perform mapping" << std::endl;
             st.mapping_error = true;
@@ -1964,9 +2066,15 @@ class emap_impl
         }
       }
 
-      unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
+      /* refine best matches with alternatives */
+      if constexpr ( !DO_AREA )
+      {
+        if ( ps.use_match_alternatives )
+          refine_best_matches( *it );
+      }
 
-      if ( node_data.best_supergate[use_phase] == nullptr )
+      unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
+      if ( node_data.best_gate[use_phase] == nullptr )
       {
         /* Library is not complete, mapping is not possible */
         std::cerr << "[e] MAP ERROR: technology library is not complete, impossible to perform mapping" << std::endl;
@@ -1994,8 +2102,7 @@ class emap_impl
         {
           if ( iteration < ps.area_flow_rounds )
           {
-            // ++node_data.map_refs[use_phase];
-            node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
+            ++node_data.map_refs[use_phase];
           }
           area += lib_inv_area;
           ++inv;
@@ -2023,6 +2130,11 @@ class emap_impl
         }
         area += node_data.area[use_phase];
       }
+
+      if ( !ps.area_oriented_mapping )
+      {
+        match_propagate_required( index );
+      }
     }
 
     ++iteration;
@@ -2033,10 +2145,11 @@ class emap_impl
     }
 
     /* blend estimated references */
+    float const coef = 1.0f / ( ( iteration + 1.0f ) * ( iteration + 1.0f ) );
     for ( auto i = 0u; i < ntk.size(); ++i )
     {
-      node_match[i].est_refs[0] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[0] + 2.0f * node_match[i].map_refs[0] ) / 3.0 );
-      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
+      node_match[i].est_refs[0] = std::max( 1.0f, coef * node_match[i].est_refs[0] + ( 1 - coef ) * node_match[i].map_refs[0] );
+      node_match[i].est_refs[1] = std::max( 1.0f, coef * node_match[i].est_refs[1] + ( 1 - coef ) * node_match[i].map_refs[1] );
     }
 
     return true;
@@ -2065,8 +2178,7 @@ class emap_impl
       {
         if ( iteration < ps.area_flow_rounds )
         {
-          // ++node_match[index].map_refs[0];
-          node_match[index].map_refs[0] += node_match[index].map_refs[1];
+          ++node_match[index].map_refs[0];
         }
         area += lib_inv_area;
         ++inv;
@@ -2074,47 +2186,73 @@ class emap_impl
     }
   }
 
-  void compute_required_time( bool exit_early = false )
+  void set_output_required_time( bool warning ) /* sets required[] at every PO from the global or the per-output target; `warning` enables the "cannot meet" messages on std::cerr */
   {
-    for ( auto i = 0u; i < node_match.size(); ++i )
-    {
-      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
-    }
-
-    /* return if mapping is area oriented */
-    if ( ps.area_oriented_mapping )
-      return;
-
     double required = delay;
-
     /* relax delay constraints */
-    if ( iteration == 1 && ps.required_time == 0.0f && ps.relax_required > 0.0f )
+    if ( iteration == 0 && ps.required_time == 0.0f && ps.required_times.empty() && ps.relax_required > 0.0f ) /* relax only when no explicit target (global or per-output) is given */
     {
       required *= ( 100.0 + ps.relax_required ) / 100.0;
     }
 
     /* Global target time constraint */
-    if ( ps.required_time != 0.0f )
+    if ( ps.required_times.empty() ) /* no per-output targets: every PO gets the same required time */
     {
-      if ( ps.required_time < delay - epsilon )
+      if ( ps.required_time != 0.0f ) /* 0 is treated as "no global target" */
       {
-        if ( !ps.area_oriented_mapping && iteration == 1 )
-          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
-      }
-      else
-      {
-        required = ps.required_time;
+        if ( ps.required_time < delay - epsilon ) /* target is tighter than the current delay: keep `required` at the current delay */
+        {
+          if ( warning )
+            std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
+        }
+        else
+        {
+          required = ps.required_time;
+        }
       }
+
+      /* set the required time at POs, on the phase (complemented or not) driven by each output signal */
+      ntk.foreach_po( [&]( auto const& s ) {
+        const auto index = ntk.node_to_index( ntk.get_node( s ) );
+        if ( ntk.is_complemented( s ) )
+          node_match[index].required[1] = required;
+        else
+          node_match[index].required[0] = required;
+      } );
+
+      return; /* global case handled; the per-output path below is skipped */
     }
 
-    /* set the required time at POs */
-    ntk.foreach_po( [&]( auto const& s ) {
+    /* Output-specific target time constraint */
+    ntk.foreach_po( [&]( auto const& s, uint32_t i ) { /* assumes ps.required_times has one entry per PO, indexed by PO position -- TODO confirm */
       const auto index = ntk.node_to_index( ntk.get_node( s ) );
-      if ( ntk.is_complemented( s ) )
-        node_match[index].required[1] = required;
+      uint8_t phase = ntk.is_complemented( s ) ? 1 : 0;
+      if ( node_match[index].arrival[phase] > ps.required_times[i] + epsilon )
+      {
+        /* target cannot be met: use the current arrival time as the required time */
+        node_match[index].required[phase] = node_match[index].arrival[phase];
+        if ( warning )
+          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f} at output {}", ps.required_times[i], i ) << std::endl;
+      }
       else
-        node_match[index].required[0] = required;
+      {
+        node_match[index].required[phase] = ps.required_times[i];
+      }
     } );
+  }
+
+  void compute_required_time( bool exit_early = false )
+  {
+    for ( auto i = 0u; i < node_match.size(); ++i )
+    {
+      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
+    }
+
+    /* return if mapping is area oriented */
+    if ( ps.area_oriented_mapping )
+      return;
+
+    set_output_required_time( iteration == 1 );
 
     if ( exit_early )
       return;
@@ -2184,16 +2322,16 @@ class emap_impl
         }
       }
 
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
 
       /* compute arrival of use_phase */
-      supergate<NInputs> const* best_supergate = node_data.best_supergate[use_phase];
+      supergate<NInputs> const* best_gate = node_data.best_gate[use_phase];
       double worst_arrival = 0;
       uint16_t best_phase = node_data.phase[use_phase];
       auto ctr = 0u;
       for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
       {
-        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
         worst_arrival = std::max( worst_arrival, arrival_pin );
         ++ctr;
       }
@@ -2219,15 +2357,15 @@ class emap_impl
         continue;
       }
 
-      assert( node_data.best_supergate[use_phase] != nullptr );
+      assert( node_data.best_gate[use_phase] != nullptr );
 
-      best_supergate = node_data.best_supergate[use_phase];
+      best_gate = node_data.best_gate[use_phase];
       worst_arrival = 0;
       best_phase = node_data.phase[use_phase];
       ctr = 0u;
       for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
       {
-        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
         worst_arrival = std::max( worst_arrival, arrival_pin );
         ++ctr;
       }
@@ -2270,16 +2408,16 @@ class emap_impl
   {
     uint32_t index = ntk.node_to_index( n );
     auto& node_data = node_match[index];
-    uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+    uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
 
     /* compute arrival of use_phase */
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[use_phase];
+    supergate<NInputs> const* best_gate = node_data.best_gate[use_phase];
     double worst_arrival = 0;
     uint16_t best_phase = node_data.phase[use_phase];
     auto ctr = 0u;
     for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
     {
-      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
       worst_arrival = std::max( worst_arrival, arrival_pin );
       ++ctr;
     }
@@ -2293,15 +2431,15 @@ class emap_impl
       return;
     }
 
-    assert( node_data.best_supergate[0] != nullptr );
+    assert( node_data.best_gate[0] != nullptr );
 
-    best_supergate = node_data.best_supergate[use_phase];
+    best_gate = node_data.best_gate[use_phase];
     worst_arrival = 0;
     best_phase = node_data.phase[use_phase];
     ctr = 0u;
     for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
     {
-      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
       worst_arrival = std::max( worst_arrival, arrival_pin );
       ++ctr;
     }
@@ -2312,24 +2450,24 @@ class emap_impl
   template<bool DO_AREA>
   void match_phase( node<Ntk> const& n, uint8_t phase )
   {
-    double best_arrival = std::numeric_limits<float>::max();
-    double best_area_flow = std::numeric_limits<float>::max();
-    float best_area = std::numeric_limits<float>::max();
-    uint32_t best_size = UINT32_MAX;
-    uint8_t best_cut = 0u;
-    uint16_t best_phase = 0u;
-    uint8_t cut_index = 0u;
     auto index = ntk.node_to_index( n );
-
     auto& node_data = node_match[index];
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[phase];
+    uint32_t cut_index = 0u;
+
+    node_data.best_gate[phase] = nullptr;
+    node_data.arrival[phase] = std::numeric_limits<float>::max();
+    node_data.flows[phase] = std::numeric_limits<float>::max();
+    node_data.area[phase] = std::numeric_limits<float>::max();
+    uint32_t best_size = UINT32_MAX;
+
+    best_gate_emap<NInputs>& gA = node_data.best_alternative[phase];
+    gA.gate = nullptr;
+    gA.arrival = std::numeric_limits<float>::max();
+    gA.flow = std::numeric_limits<float>::max();
+    uint32_t best_sizeA = UINT32_MAX;
 
     /* unmap multioutput */
-    if ( node_data.multioutput_match[phase] )
-    {
-      best_supergate = nullptr;
-      node_data.multioutput_match[phase] = false;
-    }
+    node_data.multioutput_match[phase] = false;
 
     /* foreach cut */
     for ( auto& cut : cuts[index] )
@@ -2355,49 +2493,68 @@ class emap_impl
       {
         uint16_t gate_polarity = gate.polarity ^ negation;
         double worst_arrival = 0.0f;
-        double area_local = gate.area;
+        double worst_arrivalA = 0.0f;
+        float area_local = gate.area;
+        float area_localA = gate.area;
 
         auto ctr = 0u;
-        node_data.phase[phase] = gate_polarity;
         for ( auto l : *cut )
         {
-          double arrival_pin = node_match[l].arrival[( gate_polarity >> ctr ) & 1] + gate.tdelay[ctr];
+          uint8_t leaf_phase = ( gate_polarity >> ctr ) & 1;
+
+          double arrival_pinA = node_match[l].best_alternative[leaf_phase].arrival + gate.tdelay[ctr];
+          worst_arrivalA = std::max( worst_arrivalA, arrival_pinA );
+
+          // if constexpr ( DO_AREA )
+          // {
+          //   if ( worst_arrivalA > node_data.required[phase] + epsilon || worst_arrivalA >= std::numeric_limits<float>::max() )
+          //     break;
+          // }
+
+          double arrival_pin = node_match[l].arrival[leaf_phase] + gate.tdelay[ctr];
           worst_arrival = std::max( worst_arrival, arrival_pin );
 
-          uint8_t leaf_phase = ( node_data.phase[phase] >> ctr ) & 1;
           area_local += node_match[l].flows[leaf_phase];
+          area_localA += node_match[l].best_alternative[leaf_phase].flow;
           ++ctr;
         }
 
+        bool skip = false;
         if constexpr ( DO_AREA )
         {
-          if ( worst_arrival > node_data.required[phase] + epsilon || worst_arrival >= std::numeric_limits<float>::max() )
+          if ( ctr < cut->size() )
             continue;
+          if ( worst_arrival > node_data.required[phase] + epsilon || worst_arrival >= std::numeric_limits<float>::max() )
+            skip = true;
         }
 
-        node_data.phase[phase] = gate_polarity;
-
-        if ( compare_map<DO_AREA>( worst_arrival, best_arrival, area_local, best_area_flow, cut->size(), best_size ) )
+        if ( !skip && compare_map<DO_AREA>( worst_arrival, node_data.arrival[phase], area_local, node_data.flows[phase], cut->size(), best_size ) )
         {
-          best_arrival = worst_arrival;
-          best_area_flow = area_local;
+          node_data.best_gate[phase] = &gate;
+          node_data.arrival[phase] = worst_arrival;
+          node_data.flows[phase] = area_local;
+          node_data.best_cut[phase] = cut_index;
+          node_data.area[phase] = gate.area;
+          node_data.phase[phase] = gate_polarity;
           best_size = cut->size();
-          best_cut = cut_index;
-          best_area = gate.area;
-          best_phase = gate_polarity;
-          best_supergate = &gate;
+        }
+
+        /* compute the alternative */
+        if ( compare_map<!DO_AREA>( worst_arrivalA, gA.arrival, area_localA, gA.flow, cut->size(), best_sizeA ) )
+        {
+          gA.gate = &gate;
+          gA.arrival = worst_arrivalA;
+          gA.area = gate.area;
+          gA.flow = area_localA;
+          gA.phase = gate_polarity;
+          gA.cut = cut_index;
+          best_sizeA = cut->size();
+          gA.size = cut->size();
         }
       }
 
       ++cut_index;
     }
-
-    node_data.flows[phase] = best_area_flow;
-    node_data.arrival[phase] = best_arrival;
-    node_data.area[phase] = best_area;
-    node_data.best_cut[phase] = best_cut;
-    node_data.phase[phase] = best_phase;
-    node_data.best_supergate[phase] = best_supergate;
   }
 
   template<bool SwitchActivity>
@@ -2413,23 +2570,23 @@ class emap_impl
     auto index = ntk.node_to_index( n );
 
     auto& node_data = node_match[index];
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[phase];
+    supergate<NInputs> const* best_gate = node_data.best_gate[phase];
 
     /* unmap multioutput */
     if ( node_data.multioutput_match[phase] )
     {
       /* dereference multi-output */
-      if ( !node_data.same_match && best_supergate != nullptr && node_data.map_refs[phase] )
+      if ( !node_data.same_match && best_gate != nullptr && node_data.map_refs[phase] )
       {
         auto const& cut = multi_cut_set[node_data.best_cut[phase]][0];
         cut_deref<SwitchActivity>( cut, n, phase );
       }
-      best_supergate = nullptr;
+      best_gate = nullptr;
       node_data.multioutput_match[phase] = false;
     }
 
     /* recompute best match info */
-    if ( best_supergate != nullptr )
+    if ( best_gate != nullptr )
     {
       /* if cut is implemented, remove it from the cover */
       if ( !node_data.same_match && node_data.map_refs[phase] )
@@ -2487,7 +2644,7 @@ class emap_impl
           best_size = cut->size();
           best_cut = cut_index;
           best_phase = gate_polarity;
-          best_supergate = &gate;
+          best_gate = &gate;
         }
       }
 
@@ -2499,7 +2656,7 @@ class emap_impl
     node_data.area[phase] = best_area;
     node_data.best_cut[phase] = best_cut;
     node_data.phase[phase] = best_phase;
-    node_data.best_supergate[phase] = best_supergate;
+    node_data.best_gate[phase] = best_gate;
 
     if ( !node_data.same_match && node_data.map_refs[phase] )
     {
@@ -2507,8 +2664,8 @@ class emap_impl
     }
   }
 
-  template<bool DO_AREA, bool ELA>
-  void match_drop_phase( node<Ntk> const& n, float required_margin_factor )
+  template<bool DO_AREA, bool ELA, bool SwitchActivity = false>
+  void match_drop_phase( node<Ntk> const& n )
   {
     auto index = ntk.node_to_index( n );
     auto& node_data = node_match[index];
@@ -2520,7 +2677,7 @@ class emap_impl
     bool use_one = false;
 
     /* only one phase is matched */
-    if ( node_data.best_supergate[0] == nullptr )
+    if ( node_data.best_gate[0] == nullptr )
     {
       set_match_complemented_phase( index, 1, worst_arrival_npos );
       if constexpr ( ELA )
@@ -2530,7 +2687,7 @@ class emap_impl
       }
       return;
     }
-    else if ( node_data.best_supergate[1] == nullptr )
+    else if ( node_data.best_gate[1] == nullptr )
     {
       set_match_complemented_phase( index, 0, worst_arrival_nneg );
       if constexpr ( ELA )
@@ -2557,44 +2714,41 @@ class emap_impl
     else
     {
       /* check if both phases + inverter meet the required time */
-      use_zero = worst_arrival_nneg < ( node_data.required[1] + epsilon - required_margin_factor * lib_inv_delay );
-      use_one = worst_arrival_npos < ( node_data.required[0] + epsilon - required_margin_factor * lib_inv_delay );
+      use_zero = worst_arrival_nneg < ( node_data.required[1] + epsilon );
+      use_one = worst_arrival_npos < ( node_data.required[0] + epsilon );
     }
 
     /* condition on not used phases, evaluate a substitution during exact area recovery */
     if constexpr ( ELA )
     {
-      if ( iteration != 0 )
+      if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
       {
-        if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
+        /* select the used match */
+        auto phase = 0;
+        auto nphase = 0;
+        if ( node_data.map_refs[0] == 0 )
         {
-          /* select the used match */
-          auto phase = 0;
-          auto nphase = 0;
-          if ( node_data.map_refs[0] == 0 )
-          {
-            phase = 1;
-            use_one = true;
-            use_zero = false;
-          }
-          else
-          {
-            nphase = 1;
-            use_one = false;
-            use_zero = true;
-          }
-          /* select the not used match instead if it leads to area improvement and doesn't violate the required time */
-          if ( node_data.arrival[nphase] + lib_inv_delay < node_data.required[phase] + epsilon )
-          {
-            auto size_phase = cuts[index][node_data.best_cut[phase]].size();
-            auto size_nphase = cuts[index][node_data.best_cut[nphase]].size();
+          phase = 1;
+          use_one = true;
+          use_zero = false;
+        }
+        else
+        {
+          nphase = 1;
+          use_one = false;
+          use_zero = true;
+        }
+        /* select the not used match instead if it leads to area improvement and doesn't violate the required time */
+        if ( node_data.arrival[nphase] + lib_inv_delay < node_data.required[phase] + epsilon )
+        {
+          auto size_phase = cuts[index][node_data.best_cut[phase]].size();
+          auto size_nphase = cuts[index][node_data.best_cut[nphase]].size();
 
-            if ( compare_map<DO_AREA>( node_data.arrival[nphase] + lib_inv_delay, node_data.arrival[phase], node_data.flows[nphase] + lib_inv_area, node_data.flows[phase], size_nphase, size_phase ) )
-            {
-              /* invert the choice */
-              use_zero = !use_zero;
-              use_one = !use_one;
-            }
+          if ( compare_map<DO_AREA>( node_data.arrival[nphase] + lib_inv_delay, node_data.arrival[phase], node_data.flows[nphase] + lib_inv_area, node_data.flows[phase], size_nphase, size_phase ) )
+          {
+            /* invert the choice */
+            use_zero = !use_zero;
+            use_one = !use_one;
           }
         }
       }
@@ -2603,21 +2757,10 @@ class emap_impl
     if ( ( !use_zero && !use_one ) )
     {
       /* use both phases */
-      if ( ps.allow_node_duplication )
-      {
-        node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
-        node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
-        node_data.same_match = false;
-        return;
-      }
-
-      /* if node duplication is not allowed, pick one phase based on delay */
-      auto size_zero = cuts[index][node_data.best_cut[0]].size();
-      auto size_one = cuts[index][node_data.best_cut[1]].size();
-      if ( compare_map<false>( worst_arrival_npos, worst_arrival_nneg, node_data.flows[1], node_data.flows[0], size_one, size_zero ) )
-        use_zero = true;
-      else
-        use_one = true;
+      node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
+      node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
+      node_data.same_match = false;
+      return;
     }
 
     /* use area flow as a tiebreaker */
@@ -2625,10 +2768,59 @@ class emap_impl
     {
       auto size_zero = cuts[index][node_data.best_cut[0]].size();
       auto size_one = cuts[index][node_data.best_cut[1]].size();
-      if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
-        use_one = false;
+
+      if constexpr ( ELA )
+      {
+        if ( !node_data.same_match )
+        {
+          /* both phases were implemented --> evaluate substitution */
+          cut_deref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          node_data.flows[1] = cut_deref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+          node_data.flows[0] = cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+        }
+        /* evaluate based on inverter cost */
+        if constexpr ( !SwitchActivity )
+        {
+          use_zero = lib_inv_area < node_data.flows[1] + epsilon;
+          use_one = lib_inv_area < node_data.flows[0] + epsilon;
+        }
+
+        if ( use_one && use_zero )
+        {
+          if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
+            use_one = false;
+          else
+            use_zero = false;
+        }
+        else if ( !use_one && !use_zero && node_data.same_match )
+        {
+          node_data.same_match = false;
+          cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+          return;
+        }
+      }
       else
-        use_zero = false;
+      {
+        /* compare flows by looking at the most convenient and referenced */
+        if ( node_data.flows[0] / node_data.est_refs[0] + lib_inv_area < node_data.flows[1] / node_data.est_refs[1] + epsilon )
+        {
+          use_one = false;
+        }
+        else if ( node_data.flows[1] / node_data.est_refs[1] + lib_inv_area < node_data.flows[0] / node_data.est_refs[0] + epsilon )
+        {
+          use_zero = false;
+        }
+        else
+        {
+          /* delay the decision on what to keep --> wait for better estimations */
+          node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
+          node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
+          node_data.same_match = false;
+          return;
+        }
+      }
     }
 
     if ( use_zero )
@@ -2676,7 +2868,7 @@ class emap_impl
     auto& node_data = node_match[index];
     auto phase_n = phase ^ 1;
     node_data.same_match = true;
-    node_data.best_supergate[phase_n] = nullptr;
+    node_data.best_gate[phase_n] = nullptr;
     node_data.best_cut[phase_n] = node_data.best_cut[phase];
     node_data.phase[phase_n] = node_data.phase[phase];
     node_data.arrival[phase_n] = worst_arrival_n;
@@ -2685,24 +2877,151 @@ class emap_impl
     node_data.flows[phase] = node_data.flows[phase] / node_data.est_refs[phase];
   }
 
-  void reindex_multioutput_data()
+  template<bool DO_AREA>
+  inline void select_alternatives( node<Ntk> const& n ) /* normalizes the flows of the two per-phase alternatives of n and, when cheaper, replaces one phase by the other one plus lib_inv_area / lib_inv_delay */
   {
-    /* re-index the multioutput list using the lowest index output instead of the greatest one */
-    if ( !ps.map_multioutput )
+    if constexpr ( DO_AREA ) /* nothing to do when instantiated with DO_AREA */
+      return;
+
+    if ( !ps.use_match_alternatives )
+      return;
+
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+
+    best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
+    best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
+    float g0flow = g0.flow / node_data.est_refs[0]; /* flow divided by the estimated references of the phase */
+    float g1flow = g1.flow / node_data.est_refs[1];
+
+    /* keep the cheaper phase if it wins even after paying lib_inv_area; required times are not checked since this is executed only during a delay pass */
+    if ( g0.gate != nullptr && g0flow + lib_inv_area < g1flow + epsilon )
+    {
+      g1 = g0;
+      g1.gate = nullptr; /* presumably a null gate marks a phase derived from the opposite one -- TODO confirm */
+      g1.arrival += lib_inv_delay;
+      g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
+      g0.flow = g0flow;
       return;
+    }
+    else if ( g1.gate != nullptr && g1flow + lib_inv_area < g0flow + epsilon )
+    {
+      g0 = g1;
+      g0.gate = nullptr; /* symmetric case: phase 0 is derived from phase 1 */
+      g0.arrival += lib_inv_delay;
+      g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
+      g1.flow = g1flow;
+      return;
+    }
 
-    for ( auto i = ntk.num_pis(); i < topo_order.size(); ++i )
+    g0.flow = g0flow; /* neither phase wins: keep both alternatives, storing only the normalized flows */
+    g1.flow = g1flow;
+  }
+
+  inline void refine_best_matches( node<Ntk> const& n ) /* replaces the selected match(es) of n with the stored best alternatives when their arrival meets the required time */
+  {
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+
+    /* evaluate replacing the best matches with the best alternatives */
+    best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
+    best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
+
+    if ( node_data.map_refs[0] && node_data.map_refs[1] ) /* both phases are referenced */
     {
-      uint32_t tuple_index = node_tuple_match[i];
-      if ( tuple_index >= UINT32_MAX - 1 )
-        continue;
+      if ( node_data.same_match )
+      {
+        /* pick best implementation between the two alternatives */
+        unsigned best_match_phase = node_data.best_gate[0] == nullptr ? 1 : 0; /* NOTE(review): never read in this function */
+        unsigned use_phase = g0.gate == nullptr ? 1 : 0;
+        if ( g0.gate != nullptr && g1.gate != nullptr )
+        {
+          if ( g0.arrival > node_data.required[0] + epsilon || g1.arrival > node_data.required[1] + epsilon ) /* at least one alternative is too slow: keep the current match */
+            return;
 
-      multi_match_t const& tuple_data = multi_node_match[tuple_index][0];
-      node_tuple_match[i] = UINT32_MAX - 1; /* arbitrary value to skip the required time propagation */
-      node_tuple_match[tuple_data[0].node_index] = tuple_index;
+          refine_best_matches_copy_refinement( n, 0, false );
+          refine_best_matches_copy_refinement( n, 1, false );
+          node_data.same_match = false;
+          return;
+        }
+        else
+        {
+          best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
+          if ( gUse.arrival > node_data.required[use_phase] + epsilon || gUse.arrival + lib_inv_delay > node_data.required[use_phase ^ 1] + epsilon ) /* must meet timing on its own phase and, after lib_inv_delay, on the opposite one */
+          {
+            return;
+          }
+          refine_best_matches_copy_refinement( n, use_phase, true );
+          return;
+        }
+      }
+      else
+      {
+        /* not same match: evaluate both zero and one phase */
+        if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
+        {
+          node_data.same_match = false;
+          refine_best_matches_copy_refinement( n, 0, g1.gate == nullptr && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon );
+        }
+        if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
+        {
+          node_data.same_match = false;
+          refine_best_matches_copy_refinement( n, 1, g0.gate == nullptr && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon );
+        }
+      }
+    }
+    else if ( node_data.map_refs[0] ) /* phase 0 is referenced, phase 1 is not */
+    {
+      if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
+      {
+        node_data.same_match = false;
+        refine_best_matches_copy_refinement( n, 0, false );
+      }
+      else if ( g0.gate == nullptr && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon ) /* phase 0 has no gate of its own: take phase 1 and derive phase 0 from it */
+      {
+        refine_best_matches_copy_refinement( n, 1, true );
+      }
+    }
+    else /* phase 0 is not referenced */
+    {
+      if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
+      {
+        node_data.same_match = false;
+        refine_best_matches_copy_refinement( n, 1, false );
+      }
+      else if ( g1.gate == nullptr && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon ) /* symmetric case: take phase 0 and derive phase 1 from it */
+      {
+        refine_best_matches_copy_refinement( n, 0, true );
+      }
     }
   }
 
+  inline void refine_best_matches_copy_refinement( node<Ntk> const& n, unsigned phase, bool both_phases ) /* installs best_alternative[phase] as the selected match of n; with both_phases, also derives the opposite phase from it */
+  {
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+    best_gate_emap<NInputs>& bg = node_data.best_alternative[phase];
+
+    node_data.best_gate[phase] = bg.gate; /* copy every field of the alternative into the selected-match slots of `phase` */
+    node_data.phase[phase] = bg.phase;
+    node_data.best_cut[phase] = bg.cut;
+    node_data.arrival[phase] = bg.arrival;
+    node_data.area[phase] = bg.area;
+    node_data.flows[phase] = bg.flow;
+
+    if ( !both_phases )
+      return;
+
+    node_data.same_match = true; /* from here on `phase` denotes the opposite phase, which reuses the same cut without a gate of its own */
+    phase ^= 1;
+    node_data.best_gate[phase] = nullptr;
+    node_data.phase[phase] = bg.phase;
+    node_data.best_cut[phase] = bg.cut;
+    node_data.arrival[phase] = bg.arrival + lib_inv_delay;
+    node_data.area[phase] = bg.area;
+    node_data.flows[phase] = ( bg.flow * node_data.est_refs[phase ^ 1] + lib_inv_area ) / node_data.est_refs[phase]; /* phase ^ 1 is the source phase here; presumably bg.flow is already divided by its est_refs (as select_alternatives does) -- TODO confirm */
+  }
+
   bool initialize_box( node<Ntk> const& n )
   {
     uint32_t index = ntk.node_to_index( n );
@@ -2796,16 +3115,16 @@ class emap_impl
     /* if only one is available, the other is obtained using an inverter */
     if ( supergates_zero != nullptr )
     {
-      node_data.best_supergate[0] = &( ( *supergates_zero )[0] );
-      node_data.arrival[0] = node_data.best_supergate[0]->tdelay[0];
-      node_data.area[0] = node_data.best_supergate[0]->area;
+      node_data.best_gate[0] = &( ( *supergates_zero )[0] );
+      node_data.arrival[0] = node_data.best_gate[0]->tdelay[0];
+      node_data.area[0] = node_data.best_gate[0]->area;
       node_data.phase[0] = 0;
     }
     if ( supergates_one != nullptr )
     {
-      node_data.best_supergate[1] = &( ( *supergates_one )[0] );
-      node_data.arrival[1] = node_data.best_supergate[1]->tdelay[0];
-      node_data.area[1] = node_data.best_supergate[1]->area;
+      node_data.best_gate[1] = &( ( *supergates_one )[0] );
+      node_data.arrival[1] = node_data.best_gate[1]->tdelay[0];
+      node_data.area[1] = node_data.best_gate[1]->area;
       node_data.phase[1] = 0;
     }
     else
@@ -2829,7 +3148,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     /* get the cut */
     auto const& cut0 = cuts[tuple_data[0].node_index][tuple_data[0].cut_index];
@@ -2907,7 +3226,7 @@ class emap_impl
         /* compute area flow */
         if ( j == 0 || !node_data.multioutput_match[0] )
         {
-          uint8_t current_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+          uint8_t current_phase = node_data.best_gate[0] == nullptr ? 1 : 0;
           old_flow_sum += node_data.flows[current_phase];
         }
         uint8_t old_phase = node_data.phase[phase[j]];
@@ -2916,38 +3235,6 @@ class emap_impl
         area_flow[j] = gate.area + cut_leaves_flow( cut, n, phase[j] );
         node_data.phase[phase[j]] = old_phase;
 
-        /* local evaluation for delay (area flow improvement is approximated) */
-      //   if constexpr ( !DO_AREA )
-      //   {
-      //     /* recompute local area flow of previous matches */
-      //     double mapped_flow = node_data.flows[phase[j]];
-
-      //     if ( node_data.multioutput_match[phase[j]] )
-      //     {
-      //       /* recompute estimation for multi-output gate */
-      //       float k_est = 0;
-      //       for ( auto k = 0; k < max_multioutput_output_size; ++k )
-      //       {
-      //         uint32_t index_k = tuple_data[k].node_index;
-      //         auto used_phase = node_match[index_k].supergate[0] == nullptr ? 1 : 0;
-      //         k_est += node_match[index_k].est_refs[used_phase]; /* TODO: review */
-      //       }
-      //       mapped_flow *= k_est;
-      //     }
-      //     else
-      //     {
-      //       auto used_phase = node_data.supergate[0] == nullptr ? 1 : 0; /* TODO: review */
-      //       mapped_flow *= node_data.est_refs[used_phase];
-      //     }
-
-      //     auto const& mapped_cut = cuts[node_index][node_data.best_cut[phase[j]]];
-      //     if ( !compare_map<DO_AREA>( arrival[j], node_data.arrival[phase[j]], area_flow[j], mapped_flow, cut.size(), mapped_cut.size() ) )
-      //     {
-      //       is_best = false;
-      //       break;
-      //     }
-      //   }
-
         /* current version may lead to delay increase */
         est_refs[j] = node_data.est_refs[phase[j]];
       }
@@ -2993,7 +3280,7 @@ class emap_impl
         uint8_t mapped_phase = phase[j];
         node_data.multioutput_match[mapped_phase] = true;
 
-        node_data.best_supergate[mapped_phase] = &gate;
+        node_data.best_gate[mapped_phase] = &gate;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j];
@@ -3005,11 +3292,11 @@ class emap_impl
         /* select opposite phase */
         mapped_phase ^= 1;
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = nullptr;
+        node_data.best_gate[mapped_phase] = nullptr;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
-        node_data.area[mapped_phase] = area[j];                  /* partial area contribution */
+        node_data.area[mapped_phase] = area[j]; /* partial area contribution */
         node_data.flows[mapped_phase] = flow_sum_neg;
 
         assert( node_data.arrival[mapped_phase] < node_data.required[mapped_phase] + epsilon );
@@ -3024,7 +3311,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     /* local values storage */
     std::array<float, max_multioutput_output_size> best_exact_area;
@@ -3053,13 +3340,13 @@ class emap_impl
     for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
     {
       uint32_t node_index = tuple_data[j].node_index;
-      uint8_t selected_phase = node_match[node_index].best_supergate[0] == nullptr ? 1 : 0;
+      uint8_t selected_phase = node_match[node_index].best_gate[0] == nullptr ? 1 : 0;
 
       if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
         /* match is always single output here */
         auto const& cut = cuts[node_index][node_match[node_index].best_cut[0]];
-        uint8_t use_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t use_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         best_exact_area[j] = cut_deref<SwitchActivity>( cut, ntk.index_to_node( node_index ), use_phase );
 
         /* mapping a non referenced phase */
@@ -3079,7 +3366,7 @@ class emap_impl
 
       if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
-        uint8_t use_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t use_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         auto const& best_cut = cuts[node_index][node_match[node_index].best_cut[use_phase]];
         cut_ref<SwitchActivity>( best_cut, ntk.index_to_node( node_index ), use_phase );
       }
@@ -3218,7 +3505,7 @@ class emap_impl
 
         /* write data */
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = &gate;
+        node_data.best_gate[mapped_phase] = &gate;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j];
@@ -3228,7 +3515,7 @@ class emap_impl
         /* select opposite phase */
         mapped_phase ^= 1;
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = nullptr;
+        node_data.best_gate[mapped_phase] = nullptr;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
@@ -3246,7 +3533,7 @@ class emap_impl
   void multi_node_update( node<Ntk> const& n )
   {
     uint32_t check_index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )].index][0];
     uint64_t signature = 0;
 
     /* check if a node is in TFI: there is a path of length > 1 */
@@ -3316,7 +3603,7 @@ class emap_impl
     match_phase<DO_AREA>( n, 1u );
 
     /* try to drop one phase */
-    match_drop_phase<DO_AREA, false>( n, 0 );
+    match_drop_phase<DO_AREA, false>( n );
 
     assert( node_data.arrival[0] < node_data.required[0] + epsilon );
     assert( node_data.arrival[1] < node_data.required[1] + epsilon );
@@ -3326,7 +3613,7 @@ class emap_impl
   void multi_node_update_exact( node<Ntk> const& n )
   {
     uint32_t check_index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )].index][0];
     uint64_t signature = 0;
 
     /* check if a node is in TFI: there is a path of length > 1 */
@@ -3385,7 +3672,7 @@ class emap_impl
 
     if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
     {
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
       auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
       cut_deref<SwitchActivity>( best_cut, n, use_phase );
     }
@@ -3397,7 +3684,7 @@ class emap_impl
     match_phase_exact<SwitchActivity>( n, 1u );
 
     /* try to drop one phase */
-    match_drop_phase<true, true>( n, 0 );
+    match_drop_phase<true, true>( n );
 
     assert( node_data.arrival[0] < std::numeric_limits<float>::max() );
     assert( node_data.arrival[1] < std::numeric_limits<float>::max() );
@@ -3407,7 +3694,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
     {
@@ -3416,78 +3703,67 @@ class emap_impl
     }
   }
 
-  template<bool DO_AREA>
-  bool match_multi_add_cuts( node<Ntk> const& n )
+  void match_multi_add_cuts( node<Ntk> const& n )
   {
+    /* stores the multi-output cut of n in cuts[index] and records its position in the tuple data; assumes a single cut (current version) */
     uint32_t index = ntk.node_to_index( n );
-    auto& matches = multi_node_match[node_tuple_match[index]];
-
-    /* get the cuts */
-    auto tuple_data_it = matches.begin();
-    while ( tuple_data_it != matches.end() )
-    {
-      multi_match_t& tuple_data = *tuple_data_it;
-      uint32_t cut_index = tuple_data[0].cut_index;
-      auto& cut_pair = multi_cut_set[cut_index];
-      bool remove_entry = false;
+    multi_match_t& matches = multi_node_match[node_tuple_match[index].index][0];
 
-      /* insert multi-output cuts into the standard cut set */
-      for ( auto i = 0; i < max_multioutput_output_size; ++i )
-      {
-        uint64_t node_index = tuple_data[i].node_index;
-        auto& cut = cut_pair[i];
-        auto single_cut = cut_pair[i];
+    /* find the tuple entry of node n (bounded scan: a missing entry trips the assert instead of reading past the end) */
+    uint32_t cut_p = 0;
+    while ( cut_p < matches.size() && matches[cut_p].node_index != index )
+      ++cut_p;
 
-        auto& rcuts = cuts[node_index];
+    assert( cut_p < matches.size() );
+    uint32_t cut_index = matches[cut_p].cut_index;
+    auto& cut = multi_cut_set[cut_index][cut_p];
+    auto single_cut = multi_cut_set[cut_index][cut_p];
+    auto& rcuts = cuts[index];
 
-        /* not enough space in the data structure: abort */
-        if ( rcuts.size() == max_cut_num )
-        {
-          remove_entry = true;
-          break;
-        }
+    /* not enough space in the data structure: abort */
+    if ( rcuts.size() == max_cut_num )
+    {
+      match_multi_add_cuts_remove_entry( matches );
+      return;
+    }
 
-        /* insert single cut variation if unique (for delay preservation) */
-        if ( !rcuts.is_contained( single_cut ) )
-        {
-          compute_cut_data<DO_AREA>( single_cut, ntk.index_to_node( node_index ) );
-          rcuts.append_cut( single_cut );
+    /* insert single cut variation if unique (for delay preservation) */
+    if ( !rcuts.is_contained( single_cut ) )
+    {
+      single_cut->pattern_index = 0;
+      compute_cut_data( single_cut, ntk.index_to_node( index ) );
+      rcuts.append_cut( single_cut );
 
-          /* not enough space in the data structure: abort */
-          if ( rcuts.size() == max_cut_num )
-          {
-            rcuts.limit( rcuts.size() - 1 );
-            remove_entry = true;
-            break;
-          }
-        }
+      /* not enough space in the data structure: undo the insertion and abort */
+      if ( rcuts.size() == max_cut_num )
+      {
+        rcuts.limit( rcuts.size() - 1 );
+        match_multi_add_cuts_remove_entry( matches );
+        return;
+      }
+    }
 
-        /* add multi-output cut */
-        uint32_t num_cuts_pre = rcuts.size();
-        cut->ignore = true;
-        rcuts.append_cut( cut );
+    /* add multi-output cut */
+    uint32_t num_cuts_pre = rcuts.size();
+    cut->ignore = true;
+    rcuts.append_cut( cut );
 
-        uint32_t num_cuts_after = rcuts.size();
-        assert( num_cuts_after == num_cuts_pre + 1 );
+    uint32_t num_cuts_after = rcuts.size();
+    assert( num_cuts_after == num_cuts_pre + 1 );
 
-        rcuts.limit( num_cuts_pre );
+    rcuts.limit( num_cuts_pre );
 
-        /* update tuple data */
-        tuple_data[i].cut_index = num_cuts_pre;
-      }
+    /* update tuple data: the multi-output cut sits at position num_cuts_pre */
+    matches[cut_p].cut_index = num_cuts_pre;
+  }
 
-      if ( remove_entry )
-        matches.erase( tuple_data_it );
-      else
-        ++tuple_data_it;
+  inline void match_multi_add_cuts_remove_entry( multi_match_t const& matches )
+  {
+    /* reset matches: zero the node_tuple_match data of every node listed in the tuple (presumably this clears has_info and index too; confirm against multioutput_info) */
+    for ( multi_match_data const& entry : matches )
+    {
+      node_tuple_match[entry.node_index].data = 0;
     }
-
-    /* matches do not fit in the data structure, remove multi-output option */
-    if ( matches.empty() )
-      node_tuple_match[index] = UINT32_MAX;
-
-    /* return if the insertion is (partially) successful */
-    return !matches.empty();
   }
 
   inline bool multi_node_update_cut_check( uint32_t index, uint64_t signature, uint8_t phase )
@@ -3499,99 +3775,6 @@ class emap_impl
 
     return false;
   }
-
-  bool remove_unused_multioutput()
-  {
-    /* TODO: update required times */
-    for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
-    {
-      if ( ntk.is_constant( *it ) || ntk.is_pi( *it ) )
-        continue;
-
-      auto index = ntk.node_to_index( *it );
-
-      /* get used multi-output gates */
-      if ( node_tuple_match[index] == UINT32_MAX )
-        continue;
-
-      if ( node_match[index].same_match && !node_match[index].multioutput_match[0] )
-        continue;
-
-      if ( !node_match[index].same_match && !( node_match[index].multioutput_match[0] || node_match[index].multioutput_match[1] ) )
-        continue;
-
-      /* check if mapped to multi-output with unused outputs */
-      multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
-
-      bool used = false;
-      bool unused = false;
-      for ( auto j = 0; j < max_multioutput_output_size; ++j )
-      {
-        uint32_t node_index = tuple_data[j].node_index;
-        auto& node_data = node_match[node_index];
-
-        if ( node_data.best_supergate[0] != nullptr && node_data.multioutput_match[0] )
-        {
-          if ( node_data.map_refs[0] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
-            used = true;
-          else
-            unused = true;
-        }
-        else if ( node_data.best_supergate[1] != nullptr && node_data.multioutput_match[1] )
-        {
-          if ( node_data.map_refs[1] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
-            used = true;
-          else
-            unused = true;
-        }
-      }
-
-      if ( !used || !unused )
-        continue;
-
-      /* remap connected outputs (reverse topo order)*/
-      for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
-      {
-        uint32_t node_index = tuple_data[j].node_index;
-        auto& node_data = node_match[node_index];
-        auto const n = ntk.index_to_node( node_index );
-
-        if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
-          continue;
-
-        /* recursively deselect the best cut shared between
-         * the two phases if in use in the cover */
-        if ( node_data.same_match )
-        {
-          uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
-          auto const& best_cut = cuts[node_index][node_data.best_cut[use_phase]];
-          cut_deref<false>( best_cut, n, use_phase );
-        }
-
-        /* match positive phase */
-        match_phase_exact<false>( n, 0u );
-
-        /* match negative phase */
-        match_phase_exact<false>( n, 1u );
-
-        /* try to drop one phase */
-        match_drop_phase<true, true>( n, 0 );
-      }
-    }
-
-    double area_old = area;
-    bool success = set_mapping_refs<true>();
-
-    /* round stats */
-    if ( ps.verbose )
-    {
-      float area_gain = float( ( area_old - area ) / area_old * 100 );
-      std::string stats = fmt::format( "[i] Cleaning : Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      st.round_stats.push_back( stats );
-    }
-
-    return success;
-  }
 #pragma endregion
 
 #pragma region Mapping utils
@@ -3670,7 +3853,7 @@ class emap_impl
         }
 
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
-        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -3743,7 +3926,7 @@ class emap_impl
       if ( node_match[leaf].same_match )
       {
         /* Add inverter area if it is used only by the current gate and leaf node is implemented in the opposite phase */
-        if ( --node_match[leaf].map_refs[leaf_phase] == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( --node_match[leaf].map_refs[leaf_phase] == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -3850,7 +4033,7 @@ class emap_impl
         }
 
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
-        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -3946,6 +4129,41 @@ class emap_impl
     } );
   }
 
+  bool init_arrivals() /* sets the PI arrival times; returns false (with st.mapping_error set) on a vector size mismatch */
+  {
+    if ( ps.required_times.size() && ps.required_times.size() != ntk.num_pos() ) /* required times are optional, but must match the PO count when given */
+    {
+      std::cerr << "[e] MAP ERROR: required time vector does not match the output size of the network" << std::endl;
+      st.mapping_error = true;
+      return false;
+    }
+
+    if ( ps.arrival_times.empty() ) /* no user-provided arrival times: use the defaults below */
+    {
+      ntk.foreach_pi( [&]( auto const& n ) {
+        auto& node_data = node_match[ntk.node_to_index( n )];
+        node_data.arrival[0] = node_data.best_alternative[0].arrival = 0; /* phase 0 arrives at time 0 */
+        node_data.arrival[1] = node_data.best_alternative[1].arrival = lib_inv_delay; /* phase 1 arrives at lib_inv_delay */
+      } );
+      return true;
+    }
+
+    if ( ps.arrival_times.size() != ntk.num_pis() ) /* user-provided arrival times must match the PI count */
+    {
+      std::cerr << "[e] MAP ERROR: arrival time vector does not match the input size of the network" << std::endl;
+      st.mapping_error = true;
+      return false;
+    }
+
+    ntk.foreach_pi( [&]( auto const& n, uint32_t i ) {
+      auto& node_data = node_match[ntk.node_to_index( n )];
+      node_data.arrival[0] = node_data.best_alternative[0].arrival = ps.arrival_times[i]; /* phase 0: arrival time given for the i-th PI */
+      node_data.arrival[1] = node_data.best_alternative[1].arrival = ps.arrival_times[i] + lib_inv_delay; /* phase 1: same, plus lib_inv_delay */
+    } );
+
+    return true;
+  }
+
   void finalize_cover( binding_view<klut_network>& res, klut_map& old2new )
   {
     uint32_t multioutput_count = 0;
@@ -3958,7 +4176,7 @@ class emap_impl
       /* add inverter at PI if needed */
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -3985,7 +4203,7 @@ class emap_impl
         }
       }
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       /* add used cut */
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
@@ -4000,7 +4218,7 @@ class emap_impl
         }
 
         /* count multioutput gates */
-        if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 && node_data.multioutput_match[phase] )
+        if ( ps.map_multioutput && node_tuple_match[index].lowest_index && node_data.multioutput_match[phase] )
         {
           ++multioutput_count;
         }
@@ -4013,7 +4231,7 @@ class emap_impl
         create_lut_for_gate( res, old2new, index, phase );
 
         /* count multioutput gates */
-        if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 && node_data.multioutput_match[phase] )
+        if ( ps.map_multioutput && node_tuple_match[index].lowest_index && node_data.multioutput_match[phase] )
         {
           ++multioutput_count;
         }
@@ -4076,7 +4294,7 @@ class emap_impl
       /* add inverter at PI if needed */
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -4103,7 +4321,7 @@ class emap_impl
         }
       }
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       /* add used cut */
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
@@ -4113,11 +4331,10 @@ class emap_impl
         {
           assert( node_data.same_match == true );
 
-          if ( node_tuple_match[index] < UINT32_MAX - 1 )
+          if ( node_tuple_match[index].has_info && node_tuple_match[index].lowest_index )
           {
             ++multioutput_count;
             create_block_for_gate( res, old2new, index, phase, genlib_to_cell );
-            /* TODO: implement */
           }
           continue;
         }
@@ -4175,7 +4392,7 @@ class emap_impl
   {
     auto const& node_data = node_match[index];
     auto const& best_cut = cuts[index][node_data.best_cut[phase]];
-    auto const& gate = node_data.best_supergate[phase]->root;
+    auto const& gate = node_data.best_gate[phase]->root;
 
     /* permutate and negate to obtain the matched gate truth table */
     std::vector<signal<klut_network>> children( gate->num_vars );
@@ -4185,7 +4402,7 @@ class emap_impl
     {
       if ( ctr >= gate->num_vars )
         break;
-      children[node_data.best_supergate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
+      children[node_data.best_gate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
@@ -4236,7 +4453,7 @@ class emap_impl
   {
     auto const& node_data = node_match[index];
     auto const& best_cut = cuts[index][node_data.best_cut[phase]];
-    auto const& gate = node_data.best_supergate[phase]->root;
+    auto const& gate = node_data.best_gate[phase]->root;
 
     /* permutate and negate to obtain the matched gate truth table */
     std::vector<signal<block_network>> children( gate->num_vars );
@@ -4246,7 +4463,7 @@ class emap_impl
     {
       if ( ctr >= gate->num_vars )
         break;
-      children[node_data.best_supergate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
+      children[node_data.best_gate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
@@ -4296,7 +4513,7 @@ class emap_impl
   void create_block_for_gate( cell_view<block_network>& res, block_map& old2new, uint32_t index, unsigned phase, std::vector<uint32_t> const& genlib_to_cell )
   {
     std::vector<standard_cell> const& lib = res.get_library();
-    composed_gate<NInputs> const* local_gate = node_match[index].best_supergate[phase]->root;
+    composed_gate<NInputs> const* local_gate = node_match[index].best_gate[phase]->root;
     standard_cell const& cell = lib[genlib_to_cell.at( local_gate->root->id )];
 
     assert( !local_gate->is_super );
@@ -4311,11 +4528,11 @@ class emap_impl
     {
       if ( ctr >= local_gate->num_vars )
         break;
-      children[node_match[index].best_supergate[phase]->permutation[ctr]] = old2new[l][( node_match[index].phase[phase] >> ctr ) & 1];
+      children[node_match[index].best_gate[phase]->permutation[ctr]] = old2new[l][( node_match[index].phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
     std::vector<uint32_t> outputs;
     std::vector<kitty::dynamic_truth_table> functions;
 
@@ -4327,10 +4544,10 @@ class emap_impl
       {
         uint32_t node_index = tuple_data[j].node_index;
         assert( node_match[node_index].same_match );
-        uint8_t node_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t node_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         assert( node_match[node_index].multioutput_match[node_phase] );
 
-        gate const* node_gate = node_match[node_index].best_supergate[node_phase]->root->root;
+        gate const* node_gate = node_match[node_index].best_gate[node_phase]->root->root;
 
         /* wrong output */
         if ( node_gate->id != g.id )
@@ -4350,7 +4567,7 @@ class emap_impl
     for ( uint32_t s : outputs )
     {
       /* add inverted version if used */
-      uint8_t node_phase = node_match[s].best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t node_phase = node_match[s].best_gate[0] != nullptr ? 0 : 1;
       assert( node_match[s].same_match );
 
       /* add the node in the data structure */
@@ -4467,7 +4684,6 @@ class emap_impl
 #pragma endregion
 
 #pragma region Cuts and matching utils
-  template<bool DO_AREA>
   void compute_cut_data( cut_t& cut, node<Ntk> const& n )
   {
     cut->delay = std::numeric_limits<float>::max();
@@ -4707,7 +4923,7 @@ class emap_impl
 #pragma endregion
 
   template<bool DO_AREA>
-  inline bool compare_map( double arrival, double best_arrival, double area_flow, double best_area_flow, uint32_t size, uint32_t best_size )
+  inline bool compare_map( double arrival, double best_arrival, float area_flow, float best_area_flow, uint32_t size, uint32_t best_size )
   {
     if constexpr ( DO_AREA )
     {
@@ -4762,7 +4978,7 @@ class emap_impl
 
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -4776,7 +4992,7 @@ class emap_impl
       if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
       {
@@ -5007,7 +5223,12 @@ class emap_impl
       if constexpr ( OverlapFilter )
       {
         multi_gate_mark_visited( index1, index2, cut1 );
-        node_tuple_match[index2] = multi_node_match.size();
+        node_tuple_match[index1].has_info = 1;
+        node_tuple_match[index1].lowest_index = 1;
+        node_tuple_match[index1].index = multi_node_match.size();
+        node_tuple_match[index2].has_info = 1;
+        node_tuple_match[index2].highest_index = 1;
+        node_tuple_match[index2].index = multi_node_match.size();
       }
       else
       {
@@ -5036,16 +5257,6 @@ class emap_impl
         multi_node_match[insertion_index].push_back( p );
       }
     }
-
-    /* remove indexing for lower index for compatible overlapping cuts */
-    if constexpr ( !OverlapFilter )
-    {
-      for ( auto const& entry : multi_node_match )
-      {
-        multi_match_t const& p = entry[0];
-        node_tuple_match[p[0].node_index] = UINT32_MAX;
-      }
-    }
   }
 
   bool multi_compute_cut_data( std::array<cut_t, max_multioutput_output_size>& cut_tuple )
@@ -5152,24 +5363,30 @@ class emap_impl
   inline bool multi_gate_check_incompatible( uint32_t index1, uint32_t index2, bool& is_new, uint32_t& data_index )
   {
     /* check cut assigned cut outputs, specialized code for 2 outputs */
-    uint32_t current_assignment = node_tuple_match[index1];
-    if ( current_assignment != node_tuple_match[index2] )
-      return true;
+    if ( !node_tuple_match[index1].has_info && !node_tuple_match[index2].has_info )
+      return false; /* neither node has info: compatible, is_new and data_index are left untouched */
 
-    /* load data */
-    if ( current_assignment != UINT32_MAX )
+    if ( node_tuple_match[index1].has_info && node_tuple_match[index2].has_info )
     {
+      uint32_t current_assignment = node_tuple_match[index1].index;
+      if ( current_assignment != node_tuple_match[index2].index )
+        return true; /* the two nodes are assigned to different indices: incompatible */
       is_new = false;
       data_index = current_assignment;
+      return false; /* same assignment: compatible, data_index reports the shared index */
     }
 
-    return false;
+    return true; /* only one of the two nodes has info: incompatible */
   }
 
   inline void multi_gate_mark_compatibility( uint32_t index1, uint32_t index2, uint32_t mark_value )
   {
-    node_tuple_match[index1] = mark_value;
-    node_tuple_match[index2] = mark_value;
+    node_tuple_match[index1].has_info = 1; /* index1 now carries multi-output info */
+    node_tuple_match[index1].lowest_index = 1; /* index1 gets the lowest_index flag */
+    node_tuple_match[index1].index = mark_value; /* both nodes store the same mark_value */
+    node_tuple_match[index2].has_info = 1; /* index2 now carries multi-output info */
+    node_tuple_match[index2].highest_index = 1; /* index2 gets the highest_index flag */
+    node_tuple_match[index2].index = mark_value;
   }
 
   inline void multi_gate_mark_visited( uint32_t index1, uint32_t index2, multi_cut_t const& cut )
@@ -5346,8 +5563,11 @@ class emap_impl
         ntk.set_visited( g, ntk.trav_id() - 2 );
         if ( i > 0 && n == repr )
         {
-          /* fix cycle: remove multi-output match; TODO: extend for more than 2 outputs */
-          node_tuple_match[ntk.node_to_index( g )] = UINT32_MAX;
+          /* fix cycle: remove multi-output match */
+          choice_ntk.foreach_choice( repr, [&]( auto const& p ) {
+            node_tuple_match[ntk.node_to_index( p )].data = 0;
+            return true;
+          } );
           choice_ntk.remove_choice( g );
           check = true;
         }
@@ -5479,7 +5699,7 @@ class emap_impl
 
   std::vector<node<Ntk>> topo_order;
   node_match_t node_match;
-  std::vector<uint32_t> node_tuple_match;
+  std::vector<multioutput_info> node_tuple_match;
   std::vector<float> switch_activity;
   std::vector<uint64_t> tmp_visited;
 
@@ -5507,7 +5727,7 @@ class emap_impl
  * The function takes the size of the cuts in the template parameter `CutSize`.
  *
  * The function returns a block network that supports multi-output cells.
- * 
+ *
  * The novelties of this mapper are contained in 2 publications:
  * - A. Tempia Calvino and G. De Micheli, "Technology Mapping Using Multi-Output Library Cells," ICCAD, 2023.
  * - G. Radi, A. Tempia Calvino, and G. De Micheli, "In Medio Stat Virtus: Combining Boolean and Pattern Matching," ASP-DAC, 2024.
@@ -5566,7 +5786,7 @@ cell_view<block_network> emap( Ntk const& ntk, tech_library<NInputs, Configurati
  * The function takes the size of the cuts in the template parameter `CutSize`.
  *
  * The function returns a k-LUT network. Each LUT abstacts a gate of the technology library.
- * 
+ *
  * The novelties of this mapper are contained in 2 publications:
  * - A. Tempia Calvino and G. De Micheli, "Technology Mapping Using Multi-Output Library Cells," ICCAD, 2023.
  * - G. Radi, A. Tempia Calvino, and G. De Micheli, "In Medio Stat Virtus: Combining Boolean and Pattern Matching," ASP-DAC, 2024.
diff --git a/include/mockturtle/utils/struct_library.hpp b/include/mockturtle/utils/struct_library.hpp
index 7787b906b..c239ca946 100644
--- a/include/mockturtle/utils/struct_library.hpp
+++ b/include/mockturtle/utils/struct_library.hpp
@@ -371,6 +371,12 @@ class struct_library
                                   perm,
                                   gate_pol };
 
+        /* permute pin-to-pin delays */
+        for ( uint32_t i = 0; i < gate.num_vars; ++i )
+        {
+          sg.tdelay[i] = gate.tdelay[perm[i]];
+        }
+
         auto& v = _label_to_gate[index_rule.data];
 
         auto it = std::lower_bound( v.begin(), v.end(), sg, [&]( auto const& s1, auto const& s2 ) {
diff --git a/include/mockturtle/utils/super_utils.hpp b/include/mockturtle/utils/super_utils.hpp
index a2771f726..c7a74b1c1 100644
--- a/include/mockturtle/utils/super_utils.hpp
+++ b/include/mockturtle/utils/super_utils.hpp
@@ -212,8 +212,8 @@ class super_utils
 
     if ( _ps.verbose )
     {
-      std::cout << fmt::format( "[i] Loading {} simple cells in the library\n", simple_gates_size + large_gates );
-      std::cout << fmt::format( "[i] Loading {} multi-output cells in the library\n", _multioutput_gates.size() );
+      std::cout << fmt::format( "[i] Loading {} simple library cells\n", simple_gates_size + large_gates );
+      std::cout << fmt::format( "[i] Loading {} multi-output library cells\n", _multioutput_gates.size() );
     }
 
     if ( ignored > 0 )
diff --git a/include/mockturtle/utils/tech_library.hpp b/include/mockturtle/utils/tech_library.hpp
index a671e7a25..cb3c83e33 100644
--- a/include/mockturtle/utils/tech_library.hpp
+++ b/include/mockturtle/utils/tech_library.hpp
@@ -109,6 +109,9 @@ struct tech_library_params
   /*! \brief Loads multioutput gates in the library */
   bool load_multioutput_gates{ true };
 
+  /*! \brief Don't load symmetrical permutations of gate pins (drastically speeds up mapping) */
+  bool ignore_symmetries{ false };
+
   /*! \brief Load gates with minimum size only */
   bool load_minimum_size_only{ true };
 
@@ -473,7 +476,7 @@ class tech_library
             if ( sg.root->id == it->root->id )
             {
               /* if already in the library exit, else ignore permutations if with equal delay cost */
-              if ( sg.polarity == it->polarity && sg.tdelay == it->tdelay )
+              if ( sg.polarity == it->polarity && ( _ps.ignore_symmetries || sg.tdelay == it->tdelay ) )
               {
                 to_add = false;
                 break;
@@ -534,7 +537,7 @@ class tech_library
               if ( sg.root->id == it->root->id )
               {
                 /* if already in the library exit, else ignore permutations if with equal delay cost */
-                if ( sg.polarity == it->polarity && sg.tdelay == it->tdelay )
+                if ( sg.polarity == it->polarity && ( _ps.ignore_symmetries || sg.tdelay == it->tdelay ) )
                 {
                   to_add = false;
                   break;
diff --git a/test/algorithms/emap.cpp b/test/algorithms/emap.cpp
index 0e95a7a26..514c405e8 100644
--- a/test/algorithms/emap.cpp
+++ b/test/algorithms/emap.cpp
@@ -170,8 +170,7 @@ TEST_CASE( "Emap on full adder 2", "[emap]" )
 
   emap_params ps;
   ps.cut_enumeration_ps.minimize_truth_table = false;
-  ps.use_fast_area_recovery = false;
-  ps.ela_rounds = 0;
+  ps.ela_rounds = 1;
   ps.eswp_rounds = 2;
   emap_stats st;
   binding_view<klut_network> luts = emap_klut( aig, lib, ps, &st );
@@ -244,8 +243,7 @@ TEST_CASE( "Emap on full adder 2 with cells", "[emap]" )
 
   emap_params ps;
   ps.cut_enumeration_ps.minimize_truth_table = false;
-  ps.use_fast_area_recovery = false;
-  ps.ela_rounds = 0;
+  ps.ela_rounds = 1;
   ps.eswp_rounds = 2;
   emap_stats st;
   cell_view<block_network> luts = emap( aig, lib, ps, &st );
@@ -382,12 +380,12 @@ TEST_CASE( "Emap on multiplier with multi-output gates", "[emap]" )
 
   const float eps{ 0.005f };
 
-  CHECK( luts.size() == 233u );
+  CHECK( luts.size() == 235u );
   CHECK( luts.num_pis() == 16u );
   CHECK( luts.num_pos() == 16u );
-  CHECK( luts.num_gates() == 215u );
-  CHECK( st.area > 575.0f - eps );
-  CHECK( st.area < 575.0f + eps );
+  CHECK( luts.num_gates() == 217u );
+  CHECK( st.area > 612.0f - eps );
+  CHECK( st.area < 612.0f + eps );
   CHECK( st.delay > 33.60f - eps );
   CHECK( st.delay < 33.60f + eps );
   CHECK( st.multioutput_gates == 40 );
@@ -654,6 +652,185 @@ TEST_CASE( "Emap with hybrid matching", "[emap]" )
   CHECK( st.delay < 5.8f + eps );
 }
 
+TEST_CASE( "Emap with arrival times", "[emap]" )
+{
+  std::vector<gate> gates;
+
+  std::istringstream in( large_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  const auto a = aig.create_pi();
+  const auto b = aig.create_pi();
+  const auto c = aig.create_pi();
+  const auto d = aig.create_pi();
+  const auto e = aig.create_pi();
+  const auto f = aig.create_pi();
+  const auto g = aig.create_pi();
+  const auto h = aig.create_pi();
+
+  const auto f1 = aig.create_and( !a, b );
+  const auto f2 = aig.create_and( f1, !c );
+  const auto f3 = aig.create_and( d, e );
+  const auto f4 = aig.create_and( f, !g );
+  const auto f5 = aig.create_and( f4, h );
+  const auto f6 = aig.create_and( f2, f3 );
+  const auto f7 = aig.create_and( f5, f6 );
+
+  aig.create_po( f7 ); // single output: f7 = !a & b & !c & d & e & f & !g & h
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  emap_stats st;
+
+  ps.arrival_times = std::vector<double>( 8 ); // 8 entries for the 8 PIs; NOTE(review): presumably indexed in PI creation order - confirm
+  ps.arrival_times[0] = 0.0;
+  ps.arrival_times[1] = 1.0;
+  ps.arrival_times[2] = 2.0;
+  ps.arrival_times[3] = 3.0;
+  ps.arrival_times[4] = 4.0;
+  ps.arrival_times[5] = 5.0;
+  ps.arrival_times[6] = 6.0;
+  ps.arrival_times[7] = 7.0;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f }; // tolerance for the floating-point area/delay checks below
+
+  CHECK( ntk.size() == 27u );
+  CHECK( ntk.num_pis() == 8u );
+  CHECK( ntk.num_pos() == 1u );
+  CHECK( ntk.num_gates() == 17u );
+  CHECK( st.area > 24.0f - eps );
+  CHECK( st.area < 24.0f + eps );
+  CHECK( st.delay > 12.6f - eps );
+  CHECK( st.delay < 12.6f + eps );
+}
+
+TEST_CASE( "Emap with global required times", "[emap]" )
+{
+  std::vector<gate> gates;
+
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* 8-bit ripple-carry adder: the POs are the eight entries of a plus carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); } );
+  aig.create_po( carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  ps.required_time = 20.0; // single global bound, looser than the real delay of 15.7
+  emap_stats st;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f }; // tolerance for the floating-point area/delay checks below
+
+  CHECK( ntk.size() == 34u );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps ); // upper bound only: the delay just has to meet the required time
+}
+
+TEST_CASE( "Emap with required times", "[emap]" )
+{
+  std::vector<gate> gates;
+
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* 8-bit ripple-carry adder: the POs are the eight entries of a plus carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  // no global required_time here: every PO gets its own bound below (real delay 15.7)
+  emap_stats st;
+
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); ps.required_times.push_back( 19.0 ); } );
+  aig.create_po( carry );
+  ps.required_times.push_back( 20.0 ); // one entry per PO, pushed in PO creation order: 19.0 x 8, then 20.0 for carry
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f }; // tolerance for the floating-point area/delay checks below
+
+  CHECK( ntk.size() == 34u );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps ); // upper bound only: the loosest per-output requirement is 20.0
+}
+
+TEST_CASE( "Emap with required time relaxation", "[emap]" )
+{
+  std::vector<gate> gates;
+
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* 8-bit ripple-carry adder: the POs are the eight entries of a plus carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); } );
+  aig.create_po( carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  ps.relax_required = 27.5; // real delay 15.7; NOTE(review): presumably a percentage, i.e. 15.7 * 1.275 ~ 20.02 - confirm
+  emap_stats st;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f }; // tolerance for the floating-point area/delay checks below
+
+  CHECK( ntk.size() == 34u );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps ); // upper bound only
+}
+
 TEST_CASE( "Emap with supergates", "[emap]" )
 {
   std::vector<gate> gates;