From 4d29ec04bc4388d705f340acdec9b463064f18e0 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 24 Apr 2024 18:03:35 +0200
Subject: [PATCH 01/27] First incomplete tuned emap

---
 include/mockturtle/algorithms/emap.hpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index bf557860a..13b46c8f2 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -1089,10 +1089,10 @@ class emap_impl
     {
       if ( cuts[index].size() != 0 )
         return false;
-      /* all terminals have flow 0.0 */
-      node_data.flows[0] = node_data.flows[1] = 0.0f;
+      node_data.flows[0] = 0.0f;
       node_data.arrival[0] = 0.0f;
       /* PIs have the negative phase implemented with an inverter */
+      node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
       node_data.arrival[1] = lib_inv_delay;
       add_unit_cut( index );
       return false;
@@ -1594,8 +1594,13 @@ class emap_impl
       /* reset mapping */
       node_match[index].map_refs[0] = node_match[index].map_refs[1] = node_match[index].map_refs[2] = 0u;
 
-      if ( ntk.is_constant( n ) || ntk.is_pi( n ) )
+      if ( ntk.is_constant( n ) )
         continue;
+      if ( ntk.is_pi( n ) )
+      {
+        node_match[index].flows[1] = lib_inv_area / node_match[index].est_refs[1];
+        continue;
+      }
 
       /* don't touch box */
       if constexpr ( has_is_dont_touch_v<Ntk> )
@@ -2001,6 +2006,10 @@ class emap_impl
         area += node_data.area[use_phase];
         if ( node_data.same_match && node_data.map_refs[use_phase ^ 1] > 0 )
         {
+          if ( iteration < ps.area_flow_rounds )
+          {
+            ++node_data.map_refs[use_phase];
+          }
           area += lib_inv_area;
           ++inv;
         }
@@ -2683,8 +2692,8 @@ class emap_impl
     node_data.phase[phase_n] = node_data.phase[phase];
     node_data.arrival[phase_n] = worst_arrival_n;
     node_data.area[phase_n] = node_data.area[phase];
-    node_data.flows[phase] = node_data.flows[phase] / node_data.est_refs[2];
-    node_data.flows[phase_n] = node_data.flows[phase] + lib_inv_area;
+    node_data.flows[phase_n] = ( node_data.flows[phase] + lib_inv_area ) / node_data.est_refs[phase_n];
+    node_data.flows[phase] = node_data.flows[phase] / node_data.est_refs[phase];
   }
 
   void reindex_multioutput_data()

From ac376e83d0d501e532ca5608cebe8129e23e59c7 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 24 Apr 2024 19:35:12 +0200
Subject: [PATCH 02/27] Fixes for tuning estimated references and inverters

---
 include/mockturtle/algorithms/emap.hpp | 69 ++++++++++++--------------
 1 file changed, 32 insertions(+), 37 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 13b46c8f2..9541540a9 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -82,7 +82,7 @@ struct emap_params
   /*! \brief Parameters for cut enumeration
    *
    * The default cut limit is 16.
-   * The maximum cut limit is 15.
+   * The maximum cut limit is 31.
    * By default, truth table minimization
    * is performed.
    */
@@ -1076,25 +1076,29 @@ class emap_impl
 
     if ( ntk.is_constant( n ) )
     {
-      if ( cuts[index].size() != 0 )
-        return false;
       /* all terminals have flow 0.0 */
       node_data.flows[0] = node_data.flows[1] = 0.0f;
       node_data.arrival[0] = node_data.arrival[1] = 0.0f;
-      add_zero_cut( index );
-      match_constants( index );
+      /* skip if cuts have been computed before */
+      if ( cuts[index].size() == 0 )
+      {
+        add_zero_cut( index );
+        match_constants( index );
+      }
       return false;
     }
     else if ( ntk.is_pi( n ) )
     {
-      if ( cuts[index].size() != 0 )
-        return false;
       node_data.flows[0] = 0.0f;
       node_data.arrival[0] = 0.0f;
       /* PIs have the negative phase implemented with an inverter */
       node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
       node_data.arrival[1] = lib_inv_delay;
-      add_unit_cut( index );
+      /* skip if cuts have been computed before */
+      if ( cuts[index].size() == 0 )
+      {
+        add_unit_cut( index );
+      }
       return false;
     }
 
@@ -1106,14 +1110,8 @@ class emap_impl
     {
       if ( ntk.is_dont_touch( n ) )
       {
-        if ( cuts[index].size() != 0 )
-        {
-          propagate_data_forward_white_box( n );
-        }
-        else
-        {
-          warning_box |= initialize_box( n );
-        }
+        
+        warning_box |= initialize_box( n );
         return false;
       }
     }
@@ -1358,26 +1356,14 @@ class emap_impl
       auto const index = ntk.node_to_index( n );
       auto& node_data = node_match[index];
 
-      node_data.est_refs[0] = node_data.est_refs[1] = node_data.est_refs[2] = static_cast<float>( ntk.fanout_size( n ) );
-      node_data.map_refs[0] = node_data.map_refs[1] = node_data.map_refs[2] = 0;
-      node_data.required[0] = node_data.required[1] = std::numeric_limits<float>::max();
-
       if ( ntk.is_constant( n ) )
       {
-        /* all terminals have flow 0.0 */
-        node_data.flows[0] = node_data.flows[1] = 0.0f;
-        node_data.arrival[0] = node_data.arrival[1] = 0.0f;
         add_zero_cut( index );
         match_constants( index );
         continue;
       }
       else if ( ntk.is_pi( n ) )
       {
-        /* all terminals have flow 0.0 */
-        node_data.flows[0] = node_data.flows[1] = 0.0f;
-        node_data.arrival[0] = 0.0f;
-        /* PIs have the negative phase implemented with an inverter */
-        node_data.arrival[1] = lib_inv_delay;
         add_unit_cut( index );
         continue;
       }
@@ -1387,7 +1373,7 @@ class emap_impl
       {
         if ( ntk.is_dont_touch( n ) )
         {
-          warning_box |= initialize_box( n );
+          add_unit_cut( index );
           continue;
         }
       }
@@ -1513,9 +1499,10 @@ class emap_impl
       else if ( ntk.is_pi( n ) )
       {
         /* all terminals have flow 0 */
-        node_data.flows[0] = node_data.flows[1] = 0.0f;
+        node_data.flows[0] = 0.0f;
         node_data.arrival[0] = 0.0f;
         /* PIs have the negative phase implemented with an inverter */
+        node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
         node_data.arrival[1] = lib_inv_delay;
         add_unit_cut( index );
         continue;
@@ -2008,7 +1995,8 @@ class emap_impl
         {
           if ( iteration < ps.area_flow_rounds )
           {
-            ++node_data.map_refs[use_phase];
+            // ++node_data.map_refs[use_phase];
+            node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
           }
           area += lib_inv_area;
           ++inv;
@@ -2079,6 +2067,11 @@ class emap_impl
       area += node_match[index].area[0];
       if ( node_match[index].map_refs[1] )
       {
+        if ( iteration < ps.area_flow_rounds )
+        {
+          // ++node_match[index].map_refs[0];
+          node_match[index].map_refs[0] += node_match[index].map_refs[1];
+        }
         area += lib_inv_area;
         ++inv;
       }
@@ -2717,9 +2710,11 @@ class emap_impl
   bool initialize_box( node<Ntk> const& n )
   {
     uint32_t index = ntk.node_to_index( n );
-    auto& node_data = node_match[index];
-    add_unit_cut( index );
 
+    if ( cuts[index].size() == 0 )
+      add_unit_cut( index );
+
+    auto& node_data = node_match[index];
     node_data.same_match = true;
 
     /* if it has mapping data propagate the delays and measure the data */
@@ -2730,11 +2725,11 @@ class emap_impl
     }
 
     /* consider as a black box */
-    node_data.flows[0] = node_data.flows[1] = 0.0f;
+    node_data.flows[0] = 0.0f;
+    node_data.flows[1] = lib_inv_area / node_data.est_ref[1];
     node_data.arrival[0] = 0.0f;
     node_data.arrival[1] = lib_inv_delay;
     node_data.area[0] = node_data.area[1] = 0;
-    node_data.flows[0] = 0;
 
     return true;
   }
@@ -2758,8 +2753,8 @@ class emap_impl
     node_data.arrival[0] = arrival;
     node_data.arrival[1] = arrival + lib_inv_delay;
     node_data.area[0] = node_data.area[1] = gate.area;
-    node_data.flows[0] = node_data.area[0] / node_data.est_refs[2];
-    node_data.flows[1] = node_data.flows[0] + lib_inv_area;
+    node_data.flows[1] = ( node_data.flows[0] + lib_inv_area ) / node_data.est_refs[1];
+    node_data.flows[0] = node_data.area[0] / node_data.est_refs[0];
   }
 
   void propagate_data_backward_white_box( node<Ntk> const& n )

From 28091a09bedf5bdc9f7f1a475cc161b9502aedb7 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Thu, 25 Apr 2024 10:08:14 +0200
Subject: [PATCH 03/27] Removing collective references and collective
 estimations, removing default value initialization for cuts

---
 include/mockturtle/algorithms/emap.hpp | 148 ++++++++++++-------------
 1 file changed, 71 insertions(+), 77 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 9541540a9..2e61319a5 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -202,20 +202,20 @@ template<unsigned NInputs>
 struct cut_enumeration_emap_cut
 {
   /* stats */
-  double delay{ 0 };
-  double flow{ 0 };
-  bool ignore{ false };
+  uint32_t delay;
+  float flow;
+  bool ignore;
 
   /* pattern index for structural matching*/
-  uint32_t pattern_index{ 0 };
+  uint32_t pattern_index;
 
   /* function */
   kitty::static_truth_table<6> function;
 
   /* list of supergates matching the cut for positive and negative output phases */
-  std::array<std::vector<supergate<NInputs>> const*, 2> supergates = { nullptr, nullptr };
+  std::array<std::vector<supergate<NInputs>> const*, 2> supergates;
   /* input negations, 0: pos, 1: neg */
-  std::array<uint16_t, 2> negations{ 0, 0 };
+  std::array<uint16_t, 2> negations;
 };
 
 struct cut_enumeration_emap_multi_cut
@@ -692,10 +692,10 @@ struct node_match_emap
   /* area of the best matches */
   float area[2];
 
-  /* number of references in the cover 0: pos, 1: neg, 2: pos+neg */
-  uint32_t map_refs[3];
+  /* number of references in the cover 0: pos, 1: neg */
+  uint32_t map_refs[2];
   /* references estimation */
-  float est_refs[3];
+  float est_refs[2];
   /* area flow */
   float flows[2];
 };
@@ -1070,8 +1070,8 @@ class emap_impl
     auto const index = ntk.node_to_index( n );
     auto& node_data = node_match[index];
 
-    node_data.est_refs[0] = node_data.est_refs[1] = node_data.est_refs[2] = static_cast<double>( ntk.fanout_size( n ) );
-    node_data.map_refs[0] = node_data.map_refs[1] = node_data.map_refs[2] = 0;
+    node_data.est_refs[0] = node_data.est_refs[1] = static_cast<double>( ntk.fanout_size( n ) );
+    node_data.map_refs[0] = node_data.map_refs[1] = 0;
     node_data.required[0] = node_data.required[1] = std::numeric_limits<float>::max();
 
     if ( ntk.is_constant( n ) )
@@ -1390,7 +1390,7 @@ class emap_impl
     /* round stats */
     if ( ps.verbose )
     {
-      st.round_stats.push_back( fmt::format( "[i] SCuts    : Cuts  = {:>12d} Time = {:>5.2f}\n", cuts_total, to_seconds( clock::now() - time_begin ) ) );
+      st.round_stats.push_back( fmt::format( "[i] SCuts    : Cuts  = {:>12d}  Time = {:>5.2f}\n", cuts_total, to_seconds( clock::now() - time_begin ) ) );
     }
 
     return true;
@@ -1484,8 +1484,8 @@ class emap_impl
       node_data.same_match = 0;
       node_data.multioutput_match[0] = node_data.multioutput_match[1] = false;
       node_data.required[0] = node_data.required[1] = std::numeric_limits<float>::max();
-      node_data.map_refs[0] = node_data.map_refs[1] = node_data.map_refs[2] = 0;
-      node_data.est_refs[0] = node_data.est_refs[1] = node_data.est_refs[2] = static_cast<float>( ntk.fanout_size( n ) );
+      node_data.map_refs[0] = node_data.map_refs[1] = 0;
+      node_data.est_refs[0] = node_data.est_refs[1] = static_cast<float>( ntk.fanout_size( n ) );
 
       if ( ntk.is_constant( n ) )
       {
@@ -1579,7 +1579,7 @@ class emap_impl
       uint32_t index = ntk.node_to_index( n );
 
       /* reset mapping */
-      node_match[index].map_refs[0] = node_match[index].map_refs[1] = node_match[index].map_refs[2] = 0u;
+      node_match[index].map_refs[0] = node_match[index].map_refs[1] = 0u;
 
       if ( ntk.is_constant( n ) )
         continue;
@@ -1678,7 +1678,7 @@ class emap_impl
 
       /* recursively deselect the best cut shared between
        * the two phases if in use in the cover */
-      if ( node_data.same_match && node_data.map_refs[2] != 0 )
+      if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
       {
         uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
         auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
@@ -1741,7 +1741,7 @@ class emap_impl
       auto& node_data = node_match[index];
 
       /* skip not mapped nodes */
-      if ( node_match[index].map_refs[2] == 0 )
+      if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
       /* don't touch box */
@@ -1908,7 +1908,6 @@ class emap_impl
 
       if constexpr ( !ELA )
       {
-        node_match[index].map_refs[2]++;
         if ( ntk.is_complemented( s ) )
           node_match[index].map_refs[1]++;
         else
@@ -1927,7 +1926,7 @@ class emap_impl
       /* skip constants and PIs */
       if ( ntk.is_constant( *it ) )
       {
-        if ( node_match[index].map_refs[2] > 0u )
+        if ( node_data.map_refs[0] || node_data.map_refs[1] )
         {
           /* if used and not available in the library launch a mapping error */
           if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
@@ -1951,7 +1950,7 @@ class emap_impl
       }
 
       /* continue if not referenced in the cover */
-      if ( node_match[index].map_refs[2] == 0u )
+      if ( !node_match[index].map_refs[0] && !node_match[index].map_refs[1] )
         continue;
 
       /* don't touch box */
@@ -1983,7 +1982,6 @@ class emap_impl
 
           for ( auto const leaf : best_cut )
           {
-            node_match[leaf].map_refs[2]++;
             if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
               node_match[leaf].map_refs[1]++;
             else
@@ -2016,7 +2014,6 @@ class emap_impl
           auto ctr = 0u;
           for ( auto const leaf : best_cut )
           {
-            node_match[leaf].map_refs[2]++;
             if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
               node_match[leaf].map_refs[1]++;
             else
@@ -2037,9 +2034,8 @@ class emap_impl
     /* blend estimated references */
     for ( auto i = 0u; i < ntk.size(); ++i )
     {
-      node_match[i].est_refs[2] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[2] + 2.0f * node_match[i].map_refs[2] ) / 3.0 );
-      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
       node_match[i].est_refs[0] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[0] + 2.0f * node_match[i].map_refs[0] ) / 3.0 );
+      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
     }
 
     return true;
@@ -2054,7 +2050,6 @@ class emap_impl
       ntk.foreach_fanin( n, [&]( auto const& f ) {
         uint32_t leaf = ntk.node_to_index( ntk.get_node( f ) );
         uint8_t phase = ntk.is_complemented( f ) ? 1 : 0;
-        node_match[leaf].map_refs[2]++;
         node_match[leaf].map_refs[phase]++;
       } );
     }
@@ -2131,7 +2126,7 @@ class emap_impl
 
       const auto index = ntk.node_to_index( *it );
 
-      if ( node_match[index].map_refs[2] == 0 )
+      if ( !node_match[index].map_refs[0] && !node_match[index].map_refs[1] )
         continue;
 
       match_propagate_required( index );
@@ -2176,7 +2171,7 @@ class emap_impl
           if constexpr ( has_has_binding_v<Ntk> )
           {
             propagate_data_forward_white_box( n );
-            if ( node_data.map_refs[2] )
+            if ( node_match[index].map_refs[0] || node_match[index].map_refs[1] )
               area += node_data.area[0];
             if ( node_data.map_refs[1] )
             {
@@ -2205,7 +2200,7 @@ class emap_impl
       node_data.arrival[use_phase] = worst_arrival;
 
       /* compute area */
-      if ( ( node_data.map_refs[2] && node_data.same_match ) || node_data.map_refs[use_phase] > 0 )
+      if ( node_data.map_refs[use_phase] > 0 || ( node_data.same_match && ( node_match[index].map_refs[0] || node_match[index].map_refs[1] ) ) )
       {
         area += node_data.area[use_phase];
         if ( node_data.same_match && node_data.map_refs[use_phase ^ 1] > 0 )
@@ -2529,7 +2524,7 @@ class emap_impl
       set_match_complemented_phase( index, 1, worst_arrival_npos );
       if constexpr ( ELA )
       {
-        if ( node_data.map_refs[2] )
+        if ( node_data.map_refs[0] || node_data.map_refs[1] )
           cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
       }
       return;
@@ -2539,7 +2534,7 @@ class emap_impl
       set_match_complemented_phase( index, 0, worst_arrival_nneg );
       if constexpr ( ELA )
       {
-        if ( node_data.map_refs[2] )
+        if ( node_data.map_refs[0] || node_data.map_refs[1] )
           cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
       }
       return;
@@ -2646,10 +2641,10 @@ class emap_impl
           if ( node_data.map_refs[1] > 0 )
             cut_deref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
           /* reference the positive cut if not in use before */
-          if ( node_data.map_refs[0] == 0 && node_data.map_refs[2] )
+          if ( node_data.map_refs[0] == 0 && node_data.map_refs[1] > 0 )
             cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
         }
-        else if ( node_data.map_refs[2] )
+        else if ( node_data.map_refs[0] || node_data.map_refs[1] )
           cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
       }
       set_match_complemented_phase( index, 0, worst_arrival_nneg );
@@ -2665,10 +2660,10 @@ class emap_impl
           if ( node_data.map_refs[0] > 0 )
             cut_deref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
           /* reference the negative cut if not in use before */
-          if ( node_data.map_refs[1] == 0 && node_data.map_refs[2] )
+          if ( node_data.map_refs[1] == 0 && node_data.map_refs[0] > 0 )
             cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
         }
-        else if ( node_data.map_refs[2] )
+        else if ( node_data.map_refs[0] || node_data.map_refs[1] )
           cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
       }
       set_match_complemented_phase( index, 1, worst_arrival_npos );
@@ -2929,13 +2924,15 @@ class emap_impl
             for ( auto k = 0; k < max_multioutput_output_size; ++k )
             {
               uint32_t index_k = tuple_data[k].node_index;
-              k_est += node_match[index_k].est_refs[2];
+              auto used_phase = node_match[index_k].supergate[0] == nullptr ? 1 : 0;
+              k_est += node_match[index_k].est_refs[used_phase]; /* TODO: review */
             }
             mapped_flow *= k_est;
           }
           else
           {
-            mapped_flow *= node_data.est_refs[2];
+            auto used_phase = node_data.supergate[0] == nullptr ? 1 : 0; /* TODO: review */
+            mapped_flow *= node_data.est_refs[used_phase];
           }
 
           auto const& mapped_cut = cuts[node_index][node_data.best_cut[phase[j]]];
@@ -2948,7 +2945,7 @@ class emap_impl
 
         /* quit exit to not unmap phases, TODO: implement it well */
         /* current version may lead to delay increase */
-        est_refs[j] = node_data.est_refs[2];
+        est_refs[j] = node_data.est_refs[phase[j]];
       }
 
       /* not better than individual gates */
@@ -2969,7 +2966,7 @@ class emap_impl
         flow_sum += area_flow[j];
         combined_est_refs += est_refs[j];
       }
-      flow_sum = flow_sum / combined_est_refs;
+      flow_sum = flow_sum ;
 
       /* not better than individual gates */
       if ( respects_required && ( flow_sum > old_flow_sum + epsilon ) )
@@ -3038,7 +3035,7 @@ class emap_impl
       for ( uint32_t j = 0; j < max_multioutput_output_size; ++j )
       {
         uint32_t node_index = tuple_data[j].node_index;
-        if ( node_match[node_index].map_refs[2] == 0 )
+        if ( !node_match[node_index].map_refs[0] && !node_match[node_index].map_refs[1] )
         {
           return false;
         }
@@ -3049,10 +3046,9 @@ class emap_impl
     for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
     {
       uint32_t node_index = tuple_data[j].node_index;
-      best_exact_area[j] = node_match[node_index].flows[0] * node_match[node_index].est_refs[2];
       uint8_t selected_phase = node_match[node_index].best_supergate[0] == nullptr ? 1 : 0;
 
-      if ( node_match[node_index].map_refs[2] != 0 )
+      if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
         /* match is always single output here */
         auto const& cut = cuts[node_index][node_match[node_index].best_cut[0]];
@@ -3074,7 +3070,7 @@ class emap_impl
     {
       uint32_t node_index = tuple_data[j].node_index;
 
-      if ( node_match[node_index].map_refs[2] != 0 )
+      if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
         uint8_t use_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
         auto const& best_cut = cuts[node_index][node_match[node_index].best_cut[use_phase]];
@@ -3221,7 +3217,7 @@ class emap_impl
         node_data.arrival[mapped_phase] = arrival[j];
         node_data.area[mapped_phase] = area[j]; /* partial area contribution */
 
-        node_data.flows[mapped_phase] = area_exact[j] / node_data.est_refs[2]; /* partial exact area contribution */
+        node_data.flows[mapped_phase] = area_exact[j]; /* partial exact area contribution */
         /* select opposite phase */
         mapped_phase ^= 1;
         node_data.multioutput_match[mapped_phase] = true;
@@ -3230,7 +3226,7 @@ class emap_impl
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
         node_data.area[mapped_phase] = area[j]; /* partial area contribution */
-        node_data.flows[mapped_phase] = area_exact[j] / node_data.est_refs[2];
+        node_data.flows[mapped_phase] = area_exact[j];
 
         assert( node_data.arrival[mapped_phase] < node_data.required[mapped_phase] + epsilon );
       }
@@ -3380,7 +3376,7 @@ class emap_impl
     assert( !node_data.multioutput_match[0] );
     assert( !node_data.multioutput_match[1] );
 
-    if ( node_data.same_match && node_data.map_refs[2] != 0 )
+    if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
     {
       uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
       auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
@@ -3529,14 +3525,14 @@ class emap_impl
 
         if ( node_data.best_supergate[0] != nullptr && node_data.multioutput_match[0] )
         {
-          if ( node_data.map_refs[0] > 0 || ( node_data.same_match && node_data.map_refs[2] > 0 ) )
+          if ( node_data.map_refs[0] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
             used = true;
           else
             unused = true;
         }
         else if ( node_data.best_supergate[1] != nullptr && node_data.multioutput_match[1] )
         {
-          if ( node_data.map_refs[1] > 0 || ( node_data.same_match && node_data.map_refs[2] > 0 ) )
+          if ( node_data.map_refs[1] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
             used = true;
           else
             unused = true;
@@ -3553,12 +3549,12 @@ class emap_impl
         auto& node_data = node_match[node_index];
         auto const n = ntk.index_to_node( node_index );
 
-        if ( node_data.map_refs[2] == 0 )
+        if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
           continue;
 
         /* recursively deselect the best cut shared between
          * the two phases if in use in the cover */
-        if ( node_data.same_match && node_data.map_refs[2] != 0 )
+        if ( node_data.same_match )
         {
           uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
           auto const& best_cut = cuts[node_index][node_data.best_cut[use_phase]];
@@ -3649,18 +3645,23 @@ class emap_impl
             else
               count += lib_inv_area;
           }
-          ++node_match[leaf].map_refs[2];
         }
         else
         {
           ++node_match[leaf].map_refs[0];
-          ++node_match[leaf].map_refs[2];
         }
         continue;
       }
 
       if ( node_match[leaf].same_match )
       {
+        /* Recursive referencing if leaf was not referenced */
+        if ( !node_match[leaf].map_refs[0] && !node_match[leaf].map_refs[1] )
+        {
+          auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
+          count += cut_ref<SwitchActivity>( best_cut, ntk.index_to_node( leaf ), leaf_phase );
+        }
+
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
         if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
         {
@@ -3669,16 +3670,9 @@ class emap_impl
           else
             count += lib_inv_area;
         }
-        /* Recursive referencing if leaf was not referenced */
-        if ( node_match[leaf].map_refs[2]++ == 0u )
-        {
-          auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
-          count += cut_ref<SwitchActivity>( best_cut, ntk.index_to_node( leaf ), leaf_phase );
-        }
       }
       else
       {
-        ++node_match[leaf].map_refs[2];
         if ( node_match[leaf].map_refs[leaf_phase]++ == 0u )
         {
           auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
@@ -3731,12 +3725,10 @@ class emap_impl
             else
               count += lib_inv_area;
           }
-          --node_match[leaf].map_refs[2];
         }
         else
         {
           --node_match[leaf].map_refs[0];
-          --node_match[leaf].map_refs[2];
         }
         continue;
       }
@@ -3752,7 +3744,7 @@ class emap_impl
             count += lib_inv_area;
         }
         /* Recursive dereferencing */
-        if ( --node_match[leaf].map_refs[2] == 0u )
+        if ( !node_match[leaf].map_refs[0] && !node_match[leaf].map_refs[1] )
         {
           auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
           count += cut_deref<SwitchActivity>( best_cut, ntk.index_to_node( leaf ), leaf_phase );
@@ -3760,7 +3752,6 @@ class emap_impl
       }
       else
       {
-        --node_match[leaf].map_refs[2];
         if ( --node_match[leaf].map_refs[leaf_phase] == 0u )
         {
           auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
@@ -3782,7 +3773,6 @@ class emap_impl
     for ( auto s : tmp_visited )
     {
       uint32_t leaf = s >> 1;
-      --node_match[leaf].map_refs[2];
       --node_match[leaf].map_refs[s & 1];
     }
 
@@ -3835,18 +3825,23 @@ class emap_impl
             else
               count += lib_inv_area;
           }
-          ++node_match[leaf].map_refs[2];
         }
         else
         {
           ++node_match[leaf].map_refs[0];
-          ++node_match[leaf].map_refs[2];
         }
         continue;
       }
 
       if ( node_match[leaf].same_match )
       {
+        /* Recursive referencing if leaf was not referenced */
+        if ( !node_match[leaf].map_refs[0] && !node_match[leaf].map_refs[1] )
+        {
+          auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
+          count += cut_ref_visit<SwitchActivity>( best_cut, ntk.index_to_node( leaf ), leaf_phase );
+        }
+
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
         if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
         {
@@ -3855,16 +3850,9 @@ class emap_impl
           else
             count += lib_inv_area;
         }
-        /* Recursive referencing if leaf was not referenced */
-        if ( node_match[leaf].map_refs[2]++ == 0u )
-        {
-          auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
-          count += cut_ref_visit<SwitchActivity>( best_cut, ntk.index_to_node( leaf ), leaf_phase );
-        }
       }
       else
       {
-        ++node_match[leaf].map_refs[2];
         if ( node_match[leaf].map_refs[leaf_phase]++ == 0u )
         {
           auto const& best_cut = cuts[leaf][node_match[leaf].best_cut[leaf_phase]];
@@ -3977,7 +3965,7 @@ class emap_impl
       }
 
       /* continue if cut is not in the cover */
-      if ( node_data.map_refs[2] == 0u )
+      if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
       /* don't touch box */
@@ -4095,7 +4083,7 @@ class emap_impl
       }
 
       /* continue if cut is not in the cover */
-      if ( node_data.map_refs[2] == 0u )
+      if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
       /* don't touch box */
@@ -4562,8 +4550,8 @@ class emap_impl
   void recompute_cut_data( cut_t& cut, node<Ntk> const& n )
   {
     /* compute cut cost based on LUT area */
-    double best_arrival = 0;
-    double best_area_flow = cut.size() > 1 ? cut.size() : 0;
+    uint32_t best_arrival = 0;
+    float best_area_flow = cut.size() > 1 ? cut.size() : 0;
 
     for ( auto leaf : cut )
     {
@@ -4604,7 +4592,10 @@ class emap_impl
   {
     auto& cut = cuts[index].add_cut( &index, &index ); /* fake iterator for emptyness */
     cut->ignore = true;
+    cut->delay = 0;
+    cut->flow = 0;
     cut->pattern_index = 0;
+    cut->negations[0] = cut->negations[1] = 0;
   }
 
   void add_unit_cut( uint32_t index )
@@ -4613,7 +4604,10 @@ class emap_impl
 
     kitty::create_nth_var( cut->function, 0 );
     cut->ignore = true;
+    cut->delay = 0;
+    cut->flow = 0;
     cut->pattern_index = 1;
+    cut->negations[0] = cut->negations[1] = 0;
   }
 
   inline void create_structural_cut( cut_t& new_cut, std::vector<cut_t const*> const& vcuts, uint32_t new_pattern, uint32_t pattern_id1, uint32_t pattern_id2 )
@@ -4772,7 +4766,7 @@ class emap_impl
       }
 
       /* continue if cut is not in the cover */
-      if ( node_match[index].map_refs[2] == 0u )
+      if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
       unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;

From 0a8f4b8ddd1497c9523b4168532bcd87028ae493 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Thu, 25 Apr 2024 11:37:01 +0200
Subject: [PATCH 04/27] Experiment file

---
 experiments/emap.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index 5b1bc2f10..424d96d6a 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -66,9 +66,9 @@ int main()
 
   tech_library_params tps;
   tps.verbose = true;
-  tech_library<9> tech_lib( gates, tps );
+  tech_library<6> tech_lib( gates, tps );
 
-  for ( auto const& benchmark : epfl_benchmarks() )
+  for ( auto const& benchmark : iwls_benchmarks() )
   {
     fmt::print( "[i] processing {}\n", benchmark );
 
@@ -78,21 +78,25 @@ int main()
       continue;
     }
 
+    if ( aig.num_gates() > 100000 )
+      continue;
+
     /* remove structural redundancies */
     aig_balancing_params bps;
     bps.minimize_levels = false;
-    bps.fast_mode = false;
+    bps.fast_mode = true;
     aig_balance( aig, bps );
 
     const uint32_t size_before = aig.num_gates();
     const uint32_t depth_before = depth_view( aig ).depth();
 
     emap_params ps;
-    ps.matching_mode = emap_params::hybrid;
+    ps.matching_mode = emap_params::boolean;
     ps.area_oriented_mapping = false;
     ps.map_multioutput = true;
+    ps.verbose = true;
     emap_stats st;
-    cell_view<block_network> res = emap<9>( aig, tech_lib, ps, &st );
+    cell_view<block_network> res = emap<6>( aig, tech_lib, ps, &st );
 
     names_view res_names{ res };
     restore_network_name( aig, res_names );

From 25b0bf1cbbad26d333ef1d89ebf83249297182ec Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Thu, 25 Apr 2024 16:15:44 +0200
Subject: [PATCH 05/27] Add new multi-output cut initialization

---
 include/mockturtle/algorithms/emap.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 2e61319a5..ef1f68237 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -5082,6 +5082,9 @@ class emap_impl
     /* add cut matches */
     for ( auto i = 0; i < max_multioutput_output_size; ++i )
     {
+      cut_tuple[order[i]]->supergates[0] = nullptr;
+      cut_tuple[order[i]]->supergates[1] = nullptr;
+      cut_tuple[order[i]]->ignore = false;
       std::vector<supergate<NInputs>> const* multigate = &( ( *multigates_match )[i] );
       cut_tuple[order[i]]->supergates[phase_order[i]] = multigate;
     }

From 996c06854b8148b6e89238bdc4b544aae0b4b1a9 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Thu, 25 Apr 2024 19:23:58 +0200
Subject: [PATCH 06/27] Fixes to emap

---
 include/mockturtle/algorithms/emap.hpp | 87 ++++++++++++++------------
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index ef1f68237..88c64a722 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -1166,6 +1166,7 @@ class emap_impl
     rcuts.set_cut_limit( ps.cut_enumeration_ps.cut_limit );
 
     cut_t new_cut;
+    new_cut->pattern_index = 0;
     fanin_cut_t vcuts;
 
     for ( auto const& c1 : *lcuts[0] )
@@ -2871,7 +2872,6 @@ class emap_impl
         /* get the output phase */
         pin_phase[j] = gate.polarity;
         phase[j] = ( gate.polarity >> NInputs ) ^ phase_inverted;
-        uint8_t old_phase = node_data.phase[phase[j]];
 
         /* compute arrival */
         arrival[j] = 0.0;
@@ -2905,45 +2905,49 @@ class emap_impl
           respects_required = false;
 
         /* compute area flow */
-        old_flow_sum += node_data.flows[phase[j]];
+        if ( j == 0 || !node_data.multioutput_match[0] )
+        {
+          uint8_t current_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+          old_flow_sum += node_data.flows[current_phase];
+        }
+        uint8_t old_phase = node_data.phase[phase[j]];
         node_data.phase[phase[j]] = gate.polarity;
         area[j] = gate.area;
         area_flow[j] = gate.area + cut_leaves_flow( cut, n, phase[j] );
         node_data.phase[phase[j]] = old_phase;
 
         /* local evaluation for delay (area flow improvement is approximated) */
-        if constexpr ( !DO_AREA )
-        {
-          /* recompute local area flow of previous matches */
-          double mapped_flow = node_data.flows[phase[j]];
-
-          if ( node_data.multioutput_match[phase[j]] )
-          {
-            /* recompute estimation for multi-output gate */
-            float k_est = 0;
-            for ( auto k = 0; k < max_multioutput_output_size; ++k )
-            {
-              uint32_t index_k = tuple_data[k].node_index;
-              auto used_phase = node_match[index_k].supergate[0] == nullptr ? 1 : 0;
-              k_est += node_match[index_k].est_refs[used_phase]; /* TODO: review */
-            }
-            mapped_flow *= k_est;
-          }
-          else
-          {
-            auto used_phase = node_data.supergate[0] == nullptr ? 1 : 0; /* TODO: review */
-            mapped_flow *= node_data.est_refs[used_phase];
-          }
-
-          auto const& mapped_cut = cuts[node_index][node_data.best_cut[phase[j]]];
-          if ( !compare_map<DO_AREA>( arrival[j], node_data.arrival[phase[j]], area_flow[j], mapped_flow, cut.size(), mapped_cut.size() ) )
-          {
-            is_best = false;
-            break;
-          }
-        }
+      //   if constexpr ( !DO_AREA )
+      //   {
+      //     /* recompute local area flow of previous matches */
+      //     double mapped_flow = node_data.flows[phase[j]];
+
+      //     if ( node_data.multioutput_match[phase[j]] )
+      //     {
+      //       /* recompute estimation for multi-output gate */
+      //       float k_est = 0;
+      //       for ( auto k = 0; k < max_multioutput_output_size; ++k )
+      //       {
+      //         uint32_t index_k = tuple_data[k].node_index;
+      //         auto used_phase = node_match[index_k].supergate[0] == nullptr ? 1 : 0;
+      //         k_est += node_match[index_k].est_refs[used_phase]; /* TODO: review */
+      //       }
+      //       mapped_flow *= k_est;
+      //     }
+      //     else
+      //     {
+      //       auto used_phase = node_data.supergate[0] == nullptr ? 1 : 0; /* TODO: review */
+      //       mapped_flow *= node_data.est_refs[used_phase];
+      //     }
+
+      //     auto const& mapped_cut = cuts[node_index][node_data.best_cut[phase[j]]];
+      //     if ( !compare_map<DO_AREA>( arrival[j], node_data.arrival[phase[j]], area_flow[j], mapped_flow, cut.size(), mapped_cut.size() ) )
+      //     {
+      //       is_best = false;
+      //       break;
+      //     }
+      //   }
 
-        /* quit exit to not unmap phases, TODO: implement it well */
         /* current version may lead to delay increase */
         est_refs[j] = node_data.est_refs[phase[j]];
       }
@@ -2959,20 +2963,23 @@ class emap_impl
       }
 
       /* combine evaluation for precise area flow estimantion */
-      double flow_sum = 0;
-      double combined_est_refs = 0;
+      /* compute equation AF(n) = ( Area(G) + |roots| * SUM_{l in leaves} AF(l) ) / SUM_{p in roots} est_refs( p ) */
+      float flow_sum_pos = 0, flow_sum_neg;
+      float combined_est_refs = 0;
       for ( auto j = 0; j < max_multioutput_output_size; ++j )
       {
-        flow_sum += area_flow[j];
+        flow_sum_pos += area_flow[j];
         combined_est_refs += est_refs[j];
       }
-      flow_sum = flow_sum ;
+      flow_sum_neg = flow_sum_pos;
+      flow_sum_pos /= combined_est_refs;
 
       /* not better than individual gates */
-      if ( respects_required && ( flow_sum > old_flow_sum + epsilon ) )
+      if ( respects_required && ( flow_sum_pos > old_flow_sum + epsilon ) )
         continue;
 
       mapped_multioutput = true;
+      flow_sum_neg = ( flow_sum_neg + lib_inv_area ) / combined_est_refs;
 
       /* commit multi-output gate */
       for ( uint32_t j = 0; j < max_multioutput_output_size; ++j )
@@ -2991,7 +2998,7 @@ class emap_impl
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j];
         node_data.area[mapped_phase] = area[j]; /* partial area contribution */
-        node_data.flows[mapped_phase] = flow_sum;
+        node_data.flows[mapped_phase] = flow_sum_pos;
 
         assert( node_data.arrival[mapped_phase] < node_data.required[mapped_phase] + epsilon );
 
@@ -3003,7 +3010,7 @@ class emap_impl
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
         node_data.area[mapped_phase] = area[j];                  /* partial area contribution */
-        node_data.flows[mapped_phase] = flow_sum + lib_inv_area; /* TODO: check quality */
+        node_data.flows[mapped_phase] = flow_sum_neg;
 
         assert( node_data.arrival[mapped_phase] < node_data.required[mapped_phase] + epsilon );
       }

From 4d6ba961ce42656809a44ee57a4043f9e17ceb7a Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Fri, 26 Apr 2024 09:50:37 +0200
Subject: [PATCH 07/27] Lowering the default memory usage and formatting the
 code

---
 include/mockturtle/algorithms/emap.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 88c64a722..aaf9b7285 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -82,7 +82,7 @@ struct emap_params
   /*! \brief Parameters for cut enumeration
    *
    * The default cut limit is 16.
-   * The maximum cut limit is 31.
+   * The maximum cut limit is 19.
    * By default, truth table minimization
    * is performed.
    */
@@ -716,7 +716,7 @@ class emap_impl
 {
 public:
   static constexpr float epsilon = 0.0005;
-  static constexpr uint32_t max_cut_num = 32;
+  static constexpr uint32_t max_cut_num = 20;
   using cut_t = cut<CutSize, cut_enumeration_emap_cut<NInputs>>;
   using cut_set_t = emap_cut_set<cut_t, max_cut_num>;
   using cut_merge_t = typename std::array<cut_set_t*, Ntk::max_fanin_size + 1>;
@@ -1391,7 +1391,7 @@ class emap_impl
     /* round stats */
     if ( ps.verbose )
     {
-      st.round_stats.push_back( fmt::format( "[i] SCuts    : Cuts  = {:>12d}  Time = {:>5.2f}\n", cuts_total, to_seconds( clock::now() - time_begin ) ) );
+      st.round_stats.push_back( fmt::format( "[i] SCuts    : Cuts  = {:>12d}  Time = {:>12.2f}\n", cuts_total, to_seconds( clock::now() - time_begin ) ) );
     }
 
     return true;

From 72e7d7e1a6fe16002620a833bd8ffc3c3b0de15f Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Fri, 26 Apr 2024 09:54:29 +0200
Subject: [PATCH 08/27] Reverting emap experiment to default

---
 experiments/emap.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index 424d96d6a..92fed626a 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -66,9 +66,9 @@ int main()
 
   tech_library_params tps;
   tps.verbose = true;
-  tech_library<6> tech_lib( gates, tps );
+  tech_library<9> tech_lib( gates, tps );
 
-  for ( auto const& benchmark : iwls_benchmarks() )
+  for ( auto const& benchmark : epfl_benchmarks() )
   {
     fmt::print( "[i] processing {}\n", benchmark );
 
@@ -78,9 +78,6 @@ int main()
       continue;
     }
 
-    if ( aig.num_gates() > 100000 )
-      continue;
-
     /* remove structural redundancies */
     aig_balancing_params bps;
     bps.minimize_levels = false;
@@ -91,12 +88,11 @@ int main()
     const uint32_t depth_before = depth_view( aig ).depth();
 
     emap_params ps;
-    ps.matching_mode = emap_params::boolean;
+    ps.matching_mode = emap_params::hybrid;
     ps.area_oriented_mapping = false;
     ps.map_multioutput = true;
-    ps.verbose = true;
     emap_stats st;
-    cell_view<block_network> res = emap<6>( aig, tech_lib, ps, &st );
+    cell_view<block_network> res = emap<9>( aig, tech_lib, ps, &st );
 
     names_view res_names{ res };
     restore_network_name( aig, res_names );

From ecb1746aa9a03a98ab7d7dd97da4c917d271b16b Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Fri, 26 Apr 2024 10:12:14 +0200
Subject: [PATCH 09/27] Add updated tests

---
 test/algorithms/emap.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/algorithms/emap.cpp b/test/algorithms/emap.cpp
index b25dce881..0e95a7a26 100644
--- a/test/algorithms/emap.cpp
+++ b/test/algorithms/emap.cpp
@@ -382,15 +382,15 @@ TEST_CASE( "Emap on multiplier with multi-output gates", "[emap]" )
 
   const float eps{ 0.005f };
 
-  CHECK( luts.size() == 234u );
+  CHECK( luts.size() == 233u );
   CHECK( luts.num_pis() == 16u );
   CHECK( luts.num_pos() == 16u );
-  CHECK( luts.num_gates() == 216u );
-  CHECK( st.area > 577.0f - eps );
-  CHECK( st.area < 577.0f + eps );
+  CHECK( luts.num_gates() == 215u );
+  CHECK( st.area > 575.0f - eps );
+  CHECK( st.area < 575.0f + eps );
   CHECK( st.delay > 33.60f - eps );
   CHECK( st.delay < 33.60f + eps );
-  CHECK( st.multioutput_gates == 39 );
+  CHECK( st.multioutput_gates == 40 );
 }
 
 TEST_CASE( "Emap with inverters", "[emap]" )

From f75f399bcda3674e517b4419dfa56b8da42cc95c Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 29 Apr 2024 21:26:50 +0200
Subject: [PATCH 10/27] Improving phase dropping heuristics and first
 implementation of alternative matches

---
 experiments/emap.cpp                   |  28 +-
 include/mockturtle/algorithms/emap.hpp | 568 +++++++++++++++++++++++--
 2 files changed, 540 insertions(+), 56 deletions(-)

diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index 92fed626a..fa2bb17c1 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -55,7 +55,7 @@ int main()
 
   /* library to map to technology */
   fmt::print( "[i] processing technology library\n" );
-  std::string library = "multioutput";
+  std::string library = "asap7";
   std::vector<gate> gates;
   std::ifstream in( cell_libraries_path( library ) );
 
@@ -66,9 +66,9 @@ int main()
 
   tech_library_params tps;
   tps.verbose = true;
-  tech_library<9> tech_lib( gates, tps );
+  tech_library<6> tech_lib( gates, tps );
 
-  for ( auto const& benchmark : epfl_benchmarks() )
+  for ( auto const& benchmark : iwls_benchmarks() )
   {
     fmt::print( "[i] processing {}\n", benchmark );
 
@@ -78,26 +78,32 @@ int main()
       continue;
     }
 
+    // if ( aig.num_gates() > 100000 )
+    //   continue;
+
     /* remove structural redundancies */
-    aig_balancing_params bps;
-    bps.minimize_levels = false;
-    bps.fast_mode = true;
-    aig_balance( aig, bps );
+    // aig_balancing_params bps;
+    // bps.minimize_levels = false;
+    // bps.fast_mode = true;
+    // aig_balance( aig, bps );
 
     const uint32_t size_before = aig.num_gates();
     const uint32_t depth_before = depth_view( aig ).depth();
 
     emap_params ps;
-    ps.matching_mode = emap_params::hybrid;
+    ps.matching_mode = emap_params::boolean;
     ps.area_oriented_mapping = false;
-    ps.map_multioutput = true;
+    ps.map_multioutput = false;
+    ps.verbose = true;
     emap_stats st;
-    cell_view<block_network> res = emap<9>( aig, tech_lib, ps, &st );
+    cell_view<block_network> res = emap<6>( aig, tech_lib, ps, &st );
 
     names_view res_names{ res };
     restore_network_name( aig, res_names );
     restore_pio_names_by_order( aig, res_names );
-    const auto cec = benchmark == "hyp" ? true : abc_cec_mapped_cell( res_names, benchmark, library );
+    // const auto cec = benchmark == "hyp" ? true : abc_cec_mapped_cell( res_names, benchmark, library );
+    // std::cout << fmt::format( "[i] CEC = {}\n", cec );
+    const auto cec = false; /* don't run CEC */
 
     /* write verilog netlist */
     // write_verilog_with_cell( res_names, benchmark + "_mapped.v" );
diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index aaf9b7285..3badd8e7a 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -128,6 +128,9 @@ struct emap_params
   /*! \brief Fast area recovery */
   bool use_fast_area_recovery{ true };
 
+  /*! \brief Compute alternatives using a different cost function */
+  bool use_match_alternatives{ true };
+
   /*! \brief Remove the cuts that are contained in others */
   bool remove_dominated_cuts{ false };
 
@@ -671,17 +674,33 @@ struct emap_triple_hash
 };
 #pragma endregion
 
+template<unsigned NInputs>
+struct best_gate_emap
+{
+  supergate<NInputs> const* gate;
+  double arrival;
+  float area;
+  float flow;
+  unsigned phase : 16;
+  unsigned cut   : 12;
+  unsigned size  :  4;
+};
+
 template<unsigned NInputs>
 struct node_match_emap
 {
   /* best gate match for positive and negative output phases */
   supergate<NInputs> const* best_supergate[2];
+  /* alternative best gate for positive and negative output phase */
+  best_gate_emap<NInputs> best_alternative[2];
   /* fanin pin phases for both output phases */
   uint16_t phase[2];
   /* best cut index for both phases */
-  uint32_t best_cut[2];
+  uint16_t best_cut[2];
   /* node is mapped using only one phase */
   bool same_match;
+  /* node alternative uses only one phase */
+  bool same_match_alternative;
   /* node is mapped to a multi-output gate */
   bool multioutput_match[2];
 
@@ -1017,6 +1036,8 @@ class emap_impl
       /* try to drop one phase */
       match_drop_phase<DO_AREA, false>( n, 0 );
 
+      select_alternatives<DO_AREA>( n );
+
       /* load and try a multi-output matches */
       if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
       {
@@ -1034,7 +1055,7 @@ class emap_impl
     }
 
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs2<false>();
 
     if ( warning_box )
     {
@@ -1078,7 +1099,9 @@ class emap_impl
     {
       /* all terminals have flow 0.0 */
       node_data.flows[0] = node_data.flows[1] = 0.0f;
+      node_data.best_alternative[0].flow = node_data.best_alternative[1].flow = 0.0f;
       node_data.arrival[0] = node_data.arrival[1] = 0.0f;
+      node_data.best_alternative[0].arrival = node_data.best_alternative[1].arrival = 0.0f;
       /* skip if cuts have been computed before */
       if ( cuts[index].size() == 0 )
       {
@@ -1090,10 +1113,14 @@ class emap_impl
     else if ( ntk.is_pi( n ) )
     {
       node_data.flows[0] = 0.0f;
+      node_data.best_alternative[0].flow = 0.0f;
       node_data.arrival[0] = 0.0f;
+      node_data.best_alternative[0].arrival = 0.0f;
       /* PIs have the negative phase implemented with an inverter */
       node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
+      node_data.best_alternative[1].flow = lib_inv_area / node_data.est_refs[1];
       node_data.arrival[1] = lib_inv_delay;
+      node_data.best_alternative[1].arrival = lib_inv_delay;
       /* skip if cuts have been computed before */
       if ( cuts[index].size() == 0 )
       {
@@ -1587,6 +1614,7 @@ class emap_impl
       if ( ntk.is_pi( n ) )
       {
         node_match[index].flows[1] = lib_inv_area / node_match[index].est_refs[1];
+        node_match[index].best_alternative[1].flow = lib_inv_area / node_match[index].est_refs[1];
         continue;
       }
 
@@ -1994,8 +2022,195 @@ class emap_impl
         {
           if ( iteration < ps.area_flow_rounds )
           {
-            // ++node_data.map_refs[use_phase];
-            node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
+            ++node_data.map_refs[use_phase];
+            // node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
+          }
+          area += lib_inv_area;
+          ++inv;
+        }
+      }
+
+      /* invert the phase */
+      use_phase = use_phase ^ 1;
+
+      /* if both phases are implemented and used */
+      if ( !node_data.same_match && node_data.map_refs[use_phase] > 0 )
+      {
+        if constexpr ( !ELA )
+        {
+          auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+
+          auto ctr = 0u;
+          for ( auto const leaf : best_cut )
+          {
+            if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
+              node_match[leaf].map_refs[1]++;
+            else
+              node_match[leaf].map_refs[0]++;
+          }
+        }
+        area += node_data.area[use_phase];
+      }
+    }
+
+    ++iteration;
+
+    if constexpr ( ELA )
+    {
+      return true;
+    }
+
+    /* blend estimated references */
+    for ( auto i = 0u; i < ntk.size(); ++i )
+    {
+      node_match[i].est_refs[0] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[0] + 2.0f * node_match[i].map_refs[0] ) / 3.0 );
+      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
+    }
+
+    return true;
+  }
+
+  template<bool ELA>
+  bool set_mapping_refs2()
+  {
+    for ( auto i = 0u; i < node_match.size(); ++i )
+    {
+      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
+    }
+
+    /* compute the current worst delay and update the mapping refs */
+    delay = 0.0f;
+    ntk.foreach_po( [this]( auto s ) {
+      const auto index = ntk.node_to_index( ntk.get_node( s ) );
+
+      if ( ntk.is_complemented( s ) )
+        delay = std::max( delay, node_match[index].arrival[1] );
+      else
+        delay = std::max( delay, node_match[index].arrival[0] );
+
+      if constexpr ( !ELA )
+      {
+        if ( ntk.is_complemented( s ) )
+          node_match[index].map_refs[1]++;
+        else
+          node_match[index].map_refs[0]++;
+      }
+    } );
+
+    double required = delay;
+    /* relax delay constraints */
+    if ( iteration == 0 && ps.required_time == 0.0f && ps.relax_required > 0.0f )
+    {
+      required *= ( 100.0 + ps.relax_required ) / 100.0;
+    }
+
+    /* Global target time constraint */
+    if ( ps.required_time != 0.0f )
+    {
+      if ( ps.required_time < delay - epsilon )
+      {
+        if ( !ps.area_oriented_mapping && iteration == 1 )
+          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
+      }
+      else
+      {
+        required = ps.required_time;
+      }
+    }
+
+    /* set the required time at POs */
+    ntk.foreach_po( [&]( auto const& s ) {
+      const auto index = ntk.node_to_index( ntk.get_node( s ) );
+      if ( ntk.is_complemented( s ) )
+        node_match[index].required[1] = required;
+      else
+        node_match[index].required[0] = required;
+    } );
+
+    /* compute current area and update mapping refs in top-down order */
+    area = 0.0f;
+    inv = 0;
+    for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
+    {
+      const auto index = ntk.node_to_index( *it );
+      auto& node_data = node_match[index];
+
+      /* skip constants and PIs */
+      if ( ntk.is_constant( *it ) )
+      {
+        if ( node_match[index].map_refs[0] || node_match[index].map_refs[1] )
+        {
+          /* if used and not available in the library launch a mapping error */
+          if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+          {
+            std::cerr << "[e] MAP ERROR: technology library does not contain constant gates, impossible to perform mapping" << std::endl;
+            st.mapping_error = true;
+            return false;
+          }
+        }
+        continue;
+      }
+      else if ( ntk.is_pi( *it ) )
+      {
+        if ( node_match[index].map_refs[1] > 0u )
+        {
+          /* Add inverter area over the negated fanins */
+          area += lib_inv_area;
+          ++inv;
+        }
+        continue;
+      }
+
+      /* continue if not referenced in the cover */
+      if ( !node_match[index].map_refs[0] && !node_match[index].map_refs[1] )
+        continue;
+
+      /* don't touch box */
+      if constexpr ( has_is_dont_touch_v<Ntk> )
+      {
+        if ( ntk.is_dont_touch( *it ) )
+        {
+          set_mapping_refs_dont_touch<ELA>( *it );
+          continue;
+        }
+      }
+
+      unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
+
+      if ( node_data.best_supergate[use_phase] == nullptr )
+      {
+        /* Library is not complete, mapping is not possible */
+        std::cerr << "[e] MAP ERROR: technology library is not complete, impossible to perform mapping" << std::endl;
+        st.mapping_error = true;
+        return false;
+      }
+
+      /* refine best matches looking at alternatives */
+      if ( ps.use_match_alternatives)
+        refine_best_matches( *it );
+
+      if ( node_data.same_match || node_data.map_refs[use_phase] > 0 )
+      {
+        if constexpr ( !ELA )
+        {
+          auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+          auto ctr = 0u;
+
+          for ( auto const leaf : best_cut )
+          {
+            if ( ( node_data.phase[use_phase] >> ctr++ ) & 1 )
+              node_match[leaf].map_refs[1]++;
+            else
+              node_match[leaf].map_refs[0]++;
+          }
+        }
+        area += node_data.area[use_phase];
+        if ( node_data.same_match && node_data.map_refs[use_phase ^ 1] > 0 )
+        {
+          if ( iteration < ps.area_flow_rounds )
+          {
+            ++node_data.map_refs[use_phase];
+            // node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
           }
           area += lib_inv_area;
           ++inv;
@@ -2023,6 +2238,11 @@ class emap_impl
         }
         area += node_data.area[use_phase];
       }
+
+      if ( !ps.area_oriented_mapping )
+      {
+        match_propagate_required( index );
+      }
     }
 
     ++iteration;
@@ -2065,8 +2285,8 @@ class emap_impl
       {
         if ( iteration < ps.area_flow_rounds )
         {
-          // ++node_match[index].map_refs[0];
-          node_match[index].map_refs[0] += node_match[index].map_refs[1];
+          ++node_match[index].map_refs[0];
+          // node_match[index].map_refs[0] += node_match[index].map_refs[1];
         }
         area += lib_inv_area;
         ++inv;
@@ -2312,24 +2532,24 @@ class emap_impl
   template<bool DO_AREA>
   void match_phase( node<Ntk> const& n, uint8_t phase )
   {
-    double best_arrival = std::numeric_limits<float>::max();
-    double best_area_flow = std::numeric_limits<float>::max();
-    float best_area = std::numeric_limits<float>::max();
-    uint32_t best_size = UINT32_MAX;
-    uint8_t best_cut = 0u;
-    uint16_t best_phase = 0u;
-    uint8_t cut_index = 0u;
     auto index = ntk.node_to_index( n );
-
     auto& node_data = node_match[index];
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[phase];
+    uint32_t cut_index = 0u;
+
+    node_data.best_supergate[phase] = nullptr;
+    node_data.arrival[phase] = std::numeric_limits<float>::max();
+    node_data.flows[phase] = std::numeric_limits<float>::max();
+    node_data.area[phase] = std::numeric_limits<float>::max();
+    uint32_t best_size = UINT32_MAX;
+
+    best_gate_emap<NInputs>& gA = node_data.best_alternative[phase];
+    gA.gate = nullptr;
+    gA.arrival = std::numeric_limits<float>::max();
+    gA.flow = std::numeric_limits<float>::max();
+    uint32_t best_sizeA = UINT32_MAX;
 
     /* unmap multioutput */
-    if ( node_data.multioutput_match[phase] )
-    {
-      best_supergate = nullptr;
-      node_data.multioutput_match[phase] = false;
-    }
+    node_data.multioutput_match[phase] = false;
 
     /* foreach cut */
     for ( auto& cut : cuts[index] )
@@ -2355,49 +2575,68 @@ class emap_impl
       {
         uint16_t gate_polarity = gate.polarity ^ negation;
         double worst_arrival = 0.0f;
-        double area_local = gate.area;
+        double worst_arrivalA = 0.0f;
+        float area_local = gate.area;
+        float area_localA = gate.area;
 
         auto ctr = 0u;
-        node_data.phase[phase] = gate_polarity;
         for ( auto l : *cut )
         {
-          double arrival_pin = node_match[l].arrival[( gate_polarity >> ctr ) & 1] + gate.tdelay[ctr];
+          uint8_t leaf_phase = ( gate_polarity >> ctr ) & 1;
+
+          double arrival_pinA = node_match[l].best_alternative[leaf_phase].arrival + gate.tdelay[ctr];
+          worst_arrivalA = std::max( worst_arrivalA, arrival_pinA );
+
+          // if constexpr ( DO_AREA )
+          // {
+          //   if ( worst_arrivalA > node_data.required[phase] + epsilon || worst_arrivalA >= std::numeric_limits<float>::max() )
+          //     break;
+          // }
+
+          double arrival_pin = node_match[l].arrival[leaf_phase] + gate.tdelay[ctr];
           worst_arrival = std::max( worst_arrival, arrival_pin );
 
-          uint8_t leaf_phase = ( node_data.phase[phase] >> ctr ) & 1;
           area_local += node_match[l].flows[leaf_phase];
+          area_localA += node_match[l].best_alternative[leaf_phase].flow;
           ++ctr;
         }
 
+        bool skip = false;
         if constexpr ( DO_AREA )
         {
-          if ( worst_arrival > node_data.required[phase] + epsilon || worst_arrival >= std::numeric_limits<float>::max() )
+          if ( ctr < cut->size() )
             continue;
+          if ( worst_arrival > node_data.required[phase] + epsilon || worst_arrival >= std::numeric_limits<float>::max() )
+            skip = true;
         }
 
-        node_data.phase[phase] = gate_polarity;
-
-        if ( compare_map<DO_AREA>( worst_arrival, best_arrival, area_local, best_area_flow, cut->size(), best_size ) )
+        if ( !skip && compare_map<DO_AREA>( worst_arrival, node_data.arrival[phase], area_local, node_data.flows[phase], cut->size(), best_size ) )
         {
-          best_arrival = worst_arrival;
-          best_area_flow = area_local;
+          node_data.best_supergate[phase] = &gate;
+          node_data.arrival[phase] = worst_arrival;
+          node_data.flows[phase] = area_local;
+          node_data.best_cut[phase] = cut_index;
+          node_data.area[phase] = gate.area;
+          node_data.phase[phase] = gate_polarity;
           best_size = cut->size();
-          best_cut = cut_index;
-          best_area = gate.area;
-          best_phase = gate_polarity;
-          best_supergate = &gate;
+        }
+
+        /* compute the alternative */
+        if ( compare_map<!DO_AREA>( worst_arrivalA, gA.arrival, area_localA, gA.flow, cut->size(), best_sizeA ) )
+        {
+          gA.gate = &gate;
+          gA.arrival = worst_arrivalA;
+          gA.area = gate.area;
+          gA.flow = area_localA;
+          gA.phase = gate_polarity;
+          gA.cut = cut_index;
+          best_sizeA = cut->size();
+          gA.size = cut->size();
         }
       }
 
       ++cut_index;
     }
-
-    node_data.flows[phase] = best_area_flow;
-    node_data.arrival[phase] = best_arrival;
-    node_data.area[phase] = best_area;
-    node_data.best_cut[phase] = best_cut;
-    node_data.phase[phase] = best_phase;
-    node_data.best_supergate[phase] = best_supergate;
   }
 
   template<bool SwitchActivity>
@@ -2625,10 +2864,50 @@ class emap_impl
     {
       auto size_zero = cuts[index][node_data.best_cut[0]].size();
       auto size_one = cuts[index][node_data.best_cut[1]].size();
-      if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
-        use_one = false;
+
+      if constexpr ( ELA )
+      {
+        if ( !node_data.same_match )
+        {
+          /* both phases were implemented --> evaluate substitution */
+          cut_deref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          node_data.flows[1] = cut_deref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+          node_data.flows[0] = cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+        }
+        if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
+          use_one = false;
+        else
+          use_zero = false;
+      }
       else
-        use_zero = false;
+      {
+        /* compare flows by looking at the most convinient and referenced */
+        if ( node_data.flows[0] / node_data.est_refs[0] + lib_inv_area < node_data.flows[1] / node_data.est_refs[1] + epsilon )
+        {
+          use_one = false;
+        }
+        else if ( node_data.flows[1] / node_data.est_refs[1] + lib_inv_area < node_data.flows[0] / node_data.est_refs[0] + epsilon )
+        {
+          use_zero = false;
+        }
+        else
+        {
+          if ( iteration < ps.area_flow_rounds )
+          {
+            /* delay the decision on what to keep --> wait for better estimations */
+            node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
+            node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
+            node_data.same_match = false;
+            return;
+          }
+          /* commit to one of the two before going to exact area */
+          if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
+            use_one = false;
+          else
+            use_zero = false;
+        }
+      }
     }
 
     if ( use_zero )
@@ -2685,6 +2964,205 @@ class emap_impl
     node_data.flows[phase] = node_data.flows[phase] / node_data.est_refs[phase];
   }
 
+  template<bool DO_AREA>
+  inline void select_alternatives( node<Ntk> const& n )
+  {
+    if ( !ps.use_match_alternatives )
+      return;
+
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+
+    best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
+    best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
+
+    if constexpr ( DO_AREA )
+    {
+      /* process for best delay */
+      if ( g0.arrival + lib_inv_delay < g1.arrival + epsilon )
+      {
+        node_data.same_match_alternative = true;
+        g1 = g0;
+        g1.gate = nullptr;
+        g1.arrival += lib_inv_delay;
+        g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
+        g0.flow /= node_data.est_refs[0];
+        return;
+      }
+      else if ( g1.arrival + lib_inv_delay < g0.arrival + epsilon )
+      {
+        node_data.same_match_alternative = true;
+        g0 = g1;
+        g0.gate = nullptr;
+        g0.arrival += lib_inv_delay;
+        g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
+        g1.flow /= node_data.est_refs[1];
+        return;
+      }
+    }
+    else
+    {
+      /* process for best area */ /* removed check on required since this is executed only during a delay pass */
+      if ( g0.gate != nullptr && g0.flow + lib_inv_area < g1.flow + epsilon )
+      {
+        node_data.same_match_alternative = true;
+        g1 = g0;
+        g1.gate = nullptr;
+        g1.arrival += lib_inv_delay;
+        g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
+        g0.flow /= node_data.est_refs[0];
+        return;
+      }
+      else if ( g1.gate != nullptr && g1.flow + lib_inv_area < g0.flow + epsilon )
+      {
+        node_data.same_match_alternative = true;
+        g0 = g1;
+        g0.gate = nullptr;
+        g0.arrival += lib_inv_delay;
+        g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
+        g1.flow /= node_data.est_refs[1];
+        return;
+      }
+    }
+
+    node_data.same_match_alternative = false;
+    g0.flow /= node_data.est_refs[0];
+    g1.flow /= node_data.est_refs[1];
+  }
+
+  inline void refine_best_matches( node<Ntk> const& n )
+  {
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+
+    /* evaluate to change the best matches with the best alternative */
+    best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
+    best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
+
+    /* if same match, try to keep it that way */
+    if ( node_data.same_match )
+    {
+      /* pick best implementation between the two alternatives */
+      unsigned best_match_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+      unsigned use_phase = g0.gate == nullptr ? 1 : 0;
+      if ( !node_data.same_match_alternative )
+      {
+        int valid = 0;
+        float flow0 = g0.flow + ( node_data.map_refs[0] ? 0 : lib_inv_area );
+        float flow1 = g0.flow + ( node_data.map_refs[1] ? 0 : lib_inv_area );
+        if ( g0.arrival < node_data.required[0] + epsilon && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon )
+          valid = 1;
+        if ( g1.arrival < node_data.required[1] + epsilon && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon )
+          valid |= 2;
+
+        if ( valid == 0 )
+          return;
+        else if ( valid != 3 )
+          use_phase = valid >> 1;
+        else if ( compare_map<true>( g1.arrival, g0.arrival, flow1, flow0, g1.size, g0.size ) )
+          use_phase = 1;
+      }
+      else
+      {
+        best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
+        if ( gUse.arrival > node_data.required[use_phase] + epsilon || gUse.arrival + lib_inv_delay > node_data.required[use_phase ^ 1] + epsilon )
+        {
+          return;
+        }
+      }
+
+      best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
+      float flowUse = gUse.flow * node_data.est_refs[use_phase] + ( node_data.map_refs[use_phase] ? 0 : lib_inv_area );
+      float flowCurrent = node_data.flows[best_match_phase] * node_data.est_refs[best_match_phase] + ( node_data.map_refs[best_match_phase] ? 0 : lib_inv_area );
+      if ( flowUse < flowCurrent )
+      {
+        refine_best_matches_copy_refinement( n, use_phase, true );
+      }
+      return;
+    }
+
+    /* TODO: should I check the potential gain of merging the two current implementations before replacing them? */
+
+    /* not same match: evaluate both zero and one phase */
+    if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
+    {
+      if ( compare_map<true>( g0.arrival, node_data.arrival[0], g0.flow, node_data.flows[0], g0.size, cuts[index][node_data.best_cut[0]].size() ) )
+      {
+        refine_best_matches_copy_refinement( n, 0, false );
+      }
+    }
+    if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
+    {
+      if ( compare_map<true>( g1.arrival, node_data.arrival[1], g1.flow, node_data.flows[1], g1.size, cuts[index][node_data.best_cut[1]].size() ) )
+      {
+        refine_best_matches_copy_refinement( n, 1, false );
+      }
+    }
+
+    /* evaluate change of phase + inverter */
+    if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
+    {
+      unsigned phase = node_data.map_refs[0] == 0 ? 1 : 0;
+      bool valid = node_data.arrival[phase ^ 1] + lib_inv_delay < node_data.required[phase];
+      if ( valid && compare_map<true>( node_data.arrival[phase ^ 1], node_data.arrival[phase], node_data.flows[phase ^ 1], node_data.flows[phase], cuts[index][node_data.best_cut[phase ^ 1]].size(), cuts[index][node_data.best_cut[phase]].size() ) )
+      {
+        set_match_complemented_phase( index, phase ^ 1, node_data.arrival[phase ^ 1] + lib_inv_delay );
+      }
+    }
+
+    /* TODO: check if it is possible to merge the gates? */
+    // if ( node_data.map_refs[0] && node_data.map_refs[1] )
+    // {
+    //   bool use_zero = node_data.arrival[0] + lib_inv_delay < node_data.required[1];
+    //   bool use_one = node_data.arrival[1] + lib_inv_delay < node_data.required[0];
+    //   if ( use_zero && use_one )
+    //   {
+    //     if ( compare_map<true>( node_data.arrival[0], node_data.arrival[1], node_data.flows[0], node_data.flows[1], cuts[index][node_data.best_cut[0]].size(), cuts[index][node_data.best_cut[1]].size() ) )
+    //     {
+    //       use_one = false;
+    //     }
+    //     else
+    //     {
+    //       use_zero = false;
+    //     }
+    //   }
+
+    //   if ( use_zero )
+    //   {
+    //     set_match_complemented_phase( index, 0, node_data.arrival[0] + lib_inv_delay );
+    //   }
+    //   else if ( use_one )
+    //   {
+    //     set_match_complemented_phase( index, 1, node_data.arrival[1] + lib_inv_delay );
+    //   }
+    // }
+  }
+
+  inline void refine_best_matches_copy_refinement( node<Ntk> const& n, unsigned phase, bool both_phases )
+  {
+    auto index = ntk.node_to_index( n );
+    auto& node_data = node_match[index];
+    best_gate_emap<NInputs>& bg = node_data.best_alternative[phase];
+
+    node_data.best_supergate[phase] = bg.gate;
+    node_data.phase[phase] = bg.phase;
+    node_data.best_cut[phase] = bg.cut;
+    node_data.arrival[phase] = bg.arrival;
+    node_data.area[phase] = bg.area;
+    node_data.flows[phase] = bg.flow;
+
+    if ( !both_phases )
+      return;
+    
+    phase ^= 1;
+    node_data.best_supergate[phase] = nullptr;
+    node_data.phase[phase] = bg.phase;
+    node_data.best_cut[phase] = bg.cut;
+    node_data.arrival[phase] = bg.arrival + lib_inv_delay;
+    node_data.area[phase] = bg.area;
+    node_data.flows[phase] = ( bg.flow * node_data.est_refs[phase ^ 1] + lib_inv_area ) / node_data.est_refs[phase];
+  }
+
   void reindex_multioutput_data()
   {
     /* re-index the multioutput list using the lowest index output instead of the greatest one */
@@ -4707,7 +5185,7 @@ class emap_impl
 #pragma endregion
 
   template<bool DO_AREA>
-  inline bool compare_map( double arrival, double best_arrival, double area_flow, double best_area_flow, uint32_t size, uint32_t best_size )
+  inline bool compare_map( double arrival, double best_arrival, float area_flow, float best_area_flow, uint32_t size, uint32_t best_size )
   {
     if constexpr ( DO_AREA )
     {

From e82ffaf919f700b27c733886cab56dc4763bec3a Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Tue, 30 Apr 2024 19:00:41 +0200
Subject: [PATCH 11/27] Improving emap and integration of alternatives

---
 include/mockturtle/algorithms/emap.hpp | 249 +++++++++++--------------
 1 file changed, 113 insertions(+), 136 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 3badd8e7a..b24bac77d 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -114,7 +114,7 @@ struct emap_params
   double relax_required{ 0.0f };
 
   /*! \brief Number of rounds for area flow optimization. */
-  uint32_t area_flow_rounds{ 2u };
+  uint32_t area_flow_rounds{ 3u };
 
   /*! \brief Number of rounds for exact area optimization. */
   uint32_t ela_rounds{ 2u };
@@ -1713,6 +1713,13 @@ class emap_impl
         auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
         cut_deref<SwitchActivity>( best_cut, n, use_phase );
       }
+      else if ( !node_data.map_refs[0] || !node_data.map_refs[1] )
+      {
+        uint8_t use_phase = node_data.map_refs[0] ? 0 : 1;
+        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+        cut_deref<SwitchActivity>( best_cut, n, use_phase );
+        node_data.same_match = true;
+      }
 
       /* match positive phase */
       match_phase_exact<SwitchActivity>( n, 0u );
@@ -1803,6 +1810,13 @@ class emap_impl
           node_data.required[use_phase] = std::min( node_data.required[use_phase], node_data.required[use_phase ^ 1] - lib_inv_delay );
         }
       }
+      else if ( !node_data.map_refs[0] || !node_data.map_refs[1] )
+      {
+        use_phase = node_data.map_refs[0] ? 0 : 1;
+        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
+        cut_deref<SwitchActivity>( best_cut, *it, use_phase );
+        node_data.same_match = true;
+      }
 
       /* match positive phase */
       match_phase_exact<SwitchActivity>( *it, 0u );
@@ -2061,10 +2075,11 @@ class emap_impl
     }
 
     /* blend estimated references */
+    float const coef = 1.0f / ( ( iteration + 1.0f ) * ( iteration + 1.0f ) );
     for ( auto i = 0u; i < ntk.size(); ++i )
     {
-      node_match[i].est_refs[0] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[0] + 2.0f * node_match[i].map_refs[0] ) / 3.0 );
-      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
+      node_match[i].est_refs[0] = std::max( 1.0f, coef * node_match[i].est_refs[0] + ( 1 - coef ) * node_match[i].map_refs[0] );
+      node_match[i].est_refs[1] = std::max( 1.0f, coef * node_match[i].est_refs[1] + ( 1 - coef ) * node_match[i].map_refs[1] );
     }
 
     return true;
@@ -2175,8 +2190,11 @@ class emap_impl
         }
       }
 
+      /* refine best mathes looking at alternatives */
+      if ( ps.use_match_alternatives )
+        refine_best_matches( *it );
+      
       unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
-
       if ( node_data.best_supergate[use_phase] == nullptr )
       {
         /* Library is not complete, mapping is not possible */
@@ -2185,10 +2203,6 @@ class emap_impl
         return false;
       }
 
-      /* refine best mathes looking at alternatives */
-      if ( ps.use_match_alternatives)
-        refine_best_matches( *it );
-
       if ( node_data.same_match || node_data.map_refs[use_phase] > 0 )
       {
         if constexpr ( !ELA )
@@ -2253,10 +2267,11 @@ class emap_impl
     }
 
     /* blend estimated references */
+    float const coef = 1.0f / ( ( iteration + 1.0f ) * ( iteration + 1.0f ) );
     for ( auto i = 0u; i < ntk.size(); ++i )
     {
-      node_match[i].est_refs[0] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[0] + 2.0f * node_match[i].map_refs[0] ) / 3.0 );
-      node_match[i].est_refs[1] = std::max( 1.0, ( 1.0 * node_match[i].est_refs[1] + 2.0f * node_match[i].map_refs[1] ) / 3.0 );
+      node_match[i].est_refs[0] = std::max( 1.0f, coef * node_match[i].est_refs[0] + ( 1 - coef ) * node_match[i].map_refs[0] );
+      node_match[i].est_refs[1] = std::max( 1.0f, coef * node_match[i].est_refs[1] + ( 1 - coef ) * node_match[i].map_refs[1] );
     }
 
     return true;
@@ -2803,37 +2818,34 @@ class emap_impl
     /* condition on not used phases, evaluate a substitution during exact area recovery */
     if constexpr ( ELA )
     {
-      if ( iteration != 0 )
+      if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
       {
-        if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
+        /* select the used match */
+        auto phase = 0;
+        auto nphase = 0;
+        if ( node_data.map_refs[0] == 0 )
         {
-          /* select the used match */
-          auto phase = 0;
-          auto nphase = 0;
-          if ( node_data.map_refs[0] == 0 )
-          {
-            phase = 1;
-            use_one = true;
-            use_zero = false;
-          }
-          else
-          {
-            nphase = 1;
-            use_one = false;
-            use_zero = true;
-          }
-          /* select the not used match instead if it leads to area improvement and doesn't violate the required time */
-          if ( node_data.arrival[nphase] + lib_inv_delay < node_data.required[phase] + epsilon )
-          {
-            auto size_phase = cuts[index][node_data.best_cut[phase]].size();
-            auto size_nphase = cuts[index][node_data.best_cut[nphase]].size();
+          phase = 1;
+          use_one = true;
+          use_zero = false;
+        }
+        else
+        {
+          nphase = 1;
+          use_one = false;
+          use_zero = true;
+        }
+        /* select the not used match instead if it leads to area improvement and doesn't violate the required time */
+        if ( node_data.arrival[nphase] + lib_inv_delay < node_data.required[phase] + epsilon )
+        {
+          auto size_phase = cuts[index][node_data.best_cut[phase]].size();
+          auto size_nphase = cuts[index][node_data.best_cut[nphase]].size();
 
-            if ( compare_map<DO_AREA>( node_data.arrival[nphase] + lib_inv_delay, node_data.arrival[phase], node_data.flows[nphase] + lib_inv_area, node_data.flows[phase], size_nphase, size_phase ) )
-            {
-              /* invert the choice */
-              use_zero = !use_zero;
-              use_one = !use_one;
-            }
+          if ( compare_map<DO_AREA>( node_data.arrival[nphase] + lib_inv_delay, node_data.arrival[phase], node_data.flows[nphase] + lib_inv_area, node_data.flows[phase], size_nphase, size_phase ) )
+          {
+            /* invert the choice */
+            use_zero = !use_zero;
+            use_one = !use_one;
           }
         }
       }
@@ -2893,19 +2905,11 @@ class emap_impl
         }
         else
         {
-          if ( iteration < ps.area_flow_rounds )
-          {
-            /* delay the decision on what to keep --> wait for better estimations */
-            node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
-            node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
-            node_data.same_match = false;
-            return;
-          }
-          /* commit to one of the two before going to exact area */
-          if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
-            use_one = false;
-          else
-            use_zero = false;
+          /* delay the decision on what to keep --> wait for better estimations */
+          node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
+          node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
+          node_data.same_match = false;
+          return;
         }
       }
     }
@@ -2975,6 +2979,8 @@ class emap_impl
 
     best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
     best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
+    float g0flow = g0.flow / node_data.est_refs[0];
+    float g1flow = g1.flow / node_data.est_refs[1];
 
     if constexpr ( DO_AREA )
     {
@@ -2986,7 +2992,7 @@ class emap_impl
         g1.gate = nullptr;
         g1.arrival += lib_inv_delay;
         g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
-        g0.flow /= node_data.est_refs[0];
+        g0.flow = g0flow;
         return;
       }
       else if ( g1.arrival + lib_inv_delay < g0.arrival + epsilon )
@@ -2996,38 +3002,38 @@ class emap_impl
         g0.gate = nullptr;
         g0.arrival += lib_inv_delay;
         g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
-        g1.flow /= node_data.est_refs[1];
+        g1.flow = g1flow;
         return;
       }
     }
     else
     {
       /* process for best area */ /* removed check on required since this is executed only during a delay pass */
-      if ( g0.gate != nullptr && g0.flow + lib_inv_area < g1.flow + epsilon )
+      if ( g0.gate != nullptr && g0flow + lib_inv_area < g1flow + epsilon )
       {
         node_data.same_match_alternative = true;
         g1 = g0;
         g1.gate = nullptr;
         g1.arrival += lib_inv_delay;
         g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
-        g0.flow /= node_data.est_refs[0];
+        g0.flow = g0flow;
         return;
       }
-      else if ( g1.gate != nullptr && g1.flow + lib_inv_area < g0.flow + epsilon )
+      else if ( g1.gate != nullptr && g1flow + lib_inv_area < g0flow + epsilon )
       {
         node_data.same_match_alternative = true;
         g0 = g1;
         g0.gate = nullptr;
         g0.arrival += lib_inv_delay;
         g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
-        g1.flow /= node_data.est_refs[1];
+        g1.flow = g1flow;
         return;
       }
     }
 
     node_data.same_match_alternative = false;
-    g0.flow /= node_data.est_refs[0];
-    g1.flow /= node_data.est_refs[1];
+    g0.flow = g0flow;
+    g1.flow = g1flow;
   }
 
   inline void refine_best_matches( node<Ntk> const& n )
@@ -3039,103 +3045,73 @@ class emap_impl
     best_gate_emap<NInputs>& g0 = node_data.best_alternative[0];
     best_gate_emap<NInputs>& g1 = node_data.best_alternative[1];
 
-    /* if same match, try to keep it that way */
-    if ( node_data.same_match )
+    if ( node_data.map_refs[0] && node_data.map_refs[1] )
     {
-      /* pick best implementation between the two alternatives */
-      unsigned best_match_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
-      unsigned use_phase = g0.gate == nullptr ? 1 : 0;
-      if ( !node_data.same_match_alternative )
+      if ( node_data.same_match )
       {
-        int valid = 0;
-        float flow0 = g0.flow + ( node_data.map_refs[0] ? 0 : lib_inv_area );
-        float flow1 = g0.flow + ( node_data.map_refs[1] ? 0 : lib_inv_area );
-        if ( g0.arrival < node_data.required[0] + epsilon && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon )
-          valid = 1;
-        if ( g1.arrival < node_data.required[1] + epsilon && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon )
-          valid |= 2;
+        /* pick best implementation between the two alternatives */
+        unsigned best_match_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+        unsigned use_phase = g0.gate == nullptr ? 1 : 0;
+        if ( !node_data.same_match_alternative )
+        {
+          if ( g0.arrival > node_data.required[0] + epsilon || g1.arrival > node_data.required[1] + epsilon )
+            return;
 
-        if ( valid == 0 )
+          refine_best_matches_copy_refinement( n, 0, false );
+          refine_best_matches_copy_refinement( n, 1, false );
+          node_data.same_match = false;
           return;
-        else if ( valid != 3 )
-          use_phase = valid >> 1;
-        else if ( compare_map<true>( g1.arrival, g0.arrival, flow1, flow0, g1.size, g0.size ) )
-          use_phase = 1;
-      }
-      else
-      {
-        best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
-        if ( gUse.arrival > node_data.required[use_phase] + epsilon || gUse.arrival + lib_inv_delay > node_data.required[use_phase ^ 1] + epsilon )
+        }
+        else
         {
+          best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
+          if ( gUse.arrival > node_data.required[use_phase] + epsilon || gUse.arrival + lib_inv_delay > node_data.required[use_phase ^ 1] + epsilon )
+          {
+            return;
+          }
+          refine_best_matches_copy_refinement( n, use_phase, true );
           return;
         }
       }
-
-      best_gate_emap<NInputs>& gUse = node_data.best_alternative[use_phase];
-      float flowUse = gUse.flow * node_data.est_refs[use_phase] + ( node_data.map_refs[use_phase] ? 0 : lib_inv_area );
-      float flowCurrent = node_data.flows[best_match_phase] * node_data.est_refs[best_match_phase] + ( node_data.map_refs[best_match_phase] ? 0 : lib_inv_area );
-      if ( flowUse < flowCurrent )
+      else
       {
-        refine_best_matches_copy_refinement( n, use_phase, true );
+        /* not same match: evaluate both zero and one phase */
+        if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
+        {
+          node_data.same_match = false;
+          refine_best_matches_copy_refinement( n, 0, node_data.same_match_alternative && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon );
+        }
+        if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
+        {
+          node_data.same_match = false;
+          refine_best_matches_copy_refinement( n, 1, node_data.same_match_alternative && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon );
+        }
       }
-      return;
     }
-
-    /* TODO: should I check the potential gain of merging the two current implementations before replacing them? */
-
-    /* not same match: evaluate both zero and one phase */
-    if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
+    else if ( node_data.map_refs[0] )
     {
-      if ( compare_map<true>( g0.arrival, node_data.arrival[0], g0.flow, node_data.flows[0], g0.size, cuts[index][node_data.best_cut[0]].size() ) )
+      if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
       {
+        node_data.same_match = false;
         refine_best_matches_copy_refinement( n, 0, false );
       }
+      else if ( g0.gate == nullptr && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon )
+      {
+        refine_best_matches_copy_refinement( n, 1, true );
+      }
     }
-    if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
+    else
     {
-      if ( compare_map<true>( g1.arrival, node_data.arrival[1], g1.flow, node_data.flows[1], g1.size, cuts[index][node_data.best_cut[1]].size() ) )
+      if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
       {
+        node_data.same_match = false;
         refine_best_matches_copy_refinement( n, 1, false );
       }
+      else if ( g1.gate == nullptr && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon )
+      {
+        refine_best_matches_copy_refinement( n, 0, true );
+      }
     }
-
-    /* evaluate change of phase + inverter */
-    if ( node_data.map_refs[0] == 0 || node_data.map_refs[1] == 0 )
-    {
-      unsigned phase = node_data.map_refs[0] == 0 ? 1 : 0;
-      bool valid = node_data.arrival[phase ^ 1] + lib_inv_delay < node_data.required[phase];
-      if ( valid && compare_map<true>( node_data.arrival[phase ^ 1], node_data.arrival[phase], node_data.flows[phase ^ 1], node_data.flows[phase], cuts[index][node_data.best_cut[phase ^ 1]].size(), cuts[index][node_data.best_cut[phase]].size() ) )
-      {
-        set_match_complemented_phase( index, phase ^ 1, node_data.arrival[phase ^ 1] + lib_inv_delay );
-      }
-    }
-
-    /* TODO: check if it is possible to merge the gates? */
-    // if ( node_data.map_refs[0] && node_data.map_refs[1] )
-    // {
-    //   bool use_zero = node_data.arrival[0] + lib_inv_delay < node_data.required[1];
-    //   bool use_one = node_data.arrival[1] + lib_inv_delay < node_data.required[0];
-    //   if ( use_zero && use_one )
-    //   {
-    //     if ( compare_map<true>( node_data.arrival[0], node_data.arrival[1], node_data.flows[0], node_data.flows[1], cuts[index][node_data.best_cut[0]].size(), cuts[index][node_data.best_cut[1]].size() ) )
-    //     {
-    //       use_one = false;
-    //     }
-    //     else
-    //     {
-    //       use_zero = false;
-    //     }
-    //   }
-
-    //   if ( use_zero )
-    //   {
-    //     set_match_complemented_phase( index, 0, node_data.arrival[0] + lib_inv_delay );
-    //   }
-    //   else if ( use_one )
-    //   {
-    //     set_match_complemented_phase( index, 1, node_data.arrival[1] + lib_inv_delay );
-    //   }
-    // }
   }
 
   inline void refine_best_matches_copy_refinement( node<Ntk> const& n, unsigned phase, bool both_phases )
@@ -3153,7 +3129,8 @@ class emap_impl
 
     if ( !both_phases )
       return;
-    
+
+    node_data.same_match = true;
     phase ^= 1;
     node_data.best_supergate[phase] = nullptr;
     node_data.phase[phase] = bg.phase;

From ec071e45b43203bb3490d0595c4eed8febcdaee9 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 11:21:57 +0200
Subject: [PATCH 12/27] Performance improvements

---
 include/mockturtle/algorithms/emap.hpp | 160 +++++++++++--------------
 1 file changed, 68 insertions(+), 92 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index b24bac77d..5221703cd 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -110,7 +110,7 @@ struct emap_params
   /*! \brief Required time for delay optimization. */
   double required_time{ 0.0f };
 
-  /*! \brief Required time relaxation ratio. */
+  /*! \brief Required time relaxation in percentage (10 = 10%). */
   double relax_required{ 0.0f };
 
   /*! \brief Number of rounds for area flow optimization. */
@@ -137,9 +137,6 @@ struct emap_params
   /*! \brief Remove overlapping multi-output cuts */
   bool remove_overlapping_multicuts{ false };
 
-  /*! \brief Doesn't allow node duplication */
-  bool allow_node_duplication{ true };
-
   /*! \brief Be verbose. */
   bool verbose{ false };
 };
@@ -949,7 +946,6 @@ class emap_impl
     uint32_t i = 0;
     while ( i++ < ps.area_flow_rounds )
     {
-      compute_required_time();
       if ( !compute_mapping<true>() )
       {
         return false;
@@ -964,7 +960,7 @@ class emap_impl
       reindex_multioutput_data();
       while ( i++ < ps.ela_rounds )
       {
-        if ( !compute_mapping_exact_reversed<false>( i == ps.ela_rounds ) )
+        if ( !compute_mapping_exact_reversed<false>() )
         {
           return false;
         }
@@ -974,7 +970,7 @@ class emap_impl
       i = 0;
       while ( i++ < ps.eswp_rounds )
       {
-        if ( !compute_mapping_exact_reversed<true>( true ) )
+        if ( !compute_mapping_exact_reversed<true>() )
         {
           return false;
         }
@@ -1034,8 +1030,9 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
 
+      /* select alternative matches to use */
       select_alternatives<DO_AREA>( n );
 
       /* load and try a multi-output matches */
@@ -1055,7 +1052,7 @@ class emap_impl
     }
 
     double area_old = area;
-    bool success = set_mapping_refs2<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     if ( warning_box )
     {
@@ -1546,7 +1543,7 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
     }
     double area_old = area;
     bool success = set_mapping_refs<false>();
@@ -1638,7 +1635,7 @@ class emap_impl
       match_phase<DO_AREA>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<DO_AREA, false>( n, 0 );
+      match_drop_phase<DO_AREA, false>( n );
 
       /* try a multi-output match */
       if constexpr ( DO_AREA )
@@ -1656,7 +1653,7 @@ class emap_impl
     }
 
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     /* round stats */
     if ( ps.verbose )
@@ -1728,7 +1725,7 @@ class emap_impl
       match_phase_exact<SwitchActivity>( n, 1u );
 
       /* try to drop one phase */
-      match_drop_phase<true, true>( n, 0 );
+      match_drop_phase<true, true>( n );
 
       /* try a multi-output match */
       if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
@@ -1763,7 +1760,7 @@ class emap_impl
   }
 
   template<bool SwitchActivity>
-  bool compute_mapping_exact_reversed( bool last_round )
+  bool compute_mapping_exact_reversed()
   {
     /* this method works in reverse topological order: less nodes to update (faster) */
     /* instead of propagating arrival times forward, it propagates required times backwards */
@@ -1831,7 +1828,7 @@ class emap_impl
       }
 
       /* try to drop one phase */
-      match_drop_phase<true, true>( *it, 0 );
+      match_drop_phase<true, true>( *it );
 
       /* try a multi-output match */
       if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 )
@@ -2037,7 +2034,6 @@ class emap_impl
           if ( iteration < ps.area_flow_rounds )
           {
             ++node_data.map_refs[use_phase];
-            // node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
           }
           area += lib_inv_area;
           ++inv;
@@ -2085,8 +2081,8 @@ class emap_impl
     return true;
   }
 
-  template<bool ELA>
-  bool set_mapping_refs2()
+  template<bool DO_AREA, bool ELA>
+  bool set_mapping_refs_and_req()
   {
     for ( auto i = 0u; i < node_match.size(); ++i )
     {
@@ -2190,9 +2186,12 @@ class emap_impl
         }
       }
 
-      /* refine best mathes looking at alternatives */
-      if ( ps.use_match_alternatives )
-        refine_best_matches( *it );
+      /* refine best matches with alternatives */
+      if constexpr ( !DO_AREA )
+      {
+        if ( ps.use_match_alternatives )
+          refine_best_matches( *it );
+      }
       
       unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
       if ( node_data.best_supergate[use_phase] == nullptr )
@@ -2224,7 +2223,6 @@ class emap_impl
           if ( iteration < ps.area_flow_rounds )
           {
             ++node_data.map_refs[use_phase];
-            // node_data.map_refs[use_phase] += node_data.map_refs[use_phase ^ 1];
           }
           area += lib_inv_area;
           ++inv;
@@ -2301,7 +2299,6 @@ class emap_impl
         if ( iteration < ps.area_flow_rounds )
         {
           ++node_match[index].map_refs[0];
-          // node_match[index].map_refs[0] += node_match[index].map_refs[1];
         }
         area += lib_inv_area;
         ++inv;
@@ -2762,7 +2759,7 @@ class emap_impl
   }
 
   template<bool DO_AREA, bool ELA>
-  void match_drop_phase( node<Ntk> const& n, float required_margin_factor )
+  void match_drop_phase( node<Ntk> const& n )
   {
     auto index = ntk.node_to_index( n );
     auto& node_data = node_match[index];
@@ -2811,8 +2808,8 @@ class emap_impl
     else
     {
       /* check if both phases + inverter meet the required time */
-      use_zero = worst_arrival_nneg < ( node_data.required[1] + epsilon - required_margin_factor * lib_inv_delay );
-      use_one = worst_arrival_npos < ( node_data.required[0] + epsilon - required_margin_factor * lib_inv_delay );
+      use_zero = worst_arrival_nneg < ( node_data.required[1] + epsilon );
+      use_one = worst_arrival_npos < ( node_data.required[0] + epsilon );
     }
 
     /* condition on not used phases, evaluate a substitution during exact area recovery */
@@ -2854,21 +2851,10 @@ class emap_impl
     if ( ( !use_zero && !use_one ) )
     {
       /* use both phases */
-      if ( ps.allow_node_duplication )
-      {
-        node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
-        node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
-        node_data.same_match = false;
-        return;
-      }
-
-      /* if node duplication is not allowed, pick one phase based on delay */
-      auto size_zero = cuts[index][node_data.best_cut[0]].size();
-      auto size_one = cuts[index][node_data.best_cut[1]].size();
-      if ( compare_map<false>( worst_arrival_npos, worst_arrival_nneg, node_data.flows[1], node_data.flows[0], size_one, size_zero ) )
-        use_zero = true;
-      else
-        use_one = true;
+      node_data.flows[0] = node_data.flows[0] / node_data.est_refs[0];
+      node_data.flows[1] = node_data.flows[1] / node_data.est_refs[1];
+      node_data.same_match = false;
+      return;
     }
 
     /* use area flow as a tiebreaker */
@@ -2887,10 +2873,24 @@ class emap_impl
           node_data.flows[0] = cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
           cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
         }
-        if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
-          use_one = false;
-        else
-          use_zero = false;
+        /* evaluate based on inverter cost */
+        // use_zero = lib_inv_area < node_data.flows[1] + epsilon;
+        // use_one = lib_inv_area < node_data.flows[0] + epsilon;
+
+        if ( use_one && use_zero )
+        {
+          if ( compare_map<DO_AREA>( worst_arrival_nneg, worst_arrival_npos, node_data.flows[0], node_data.flows[1], size_zero, size_one ) )
+            use_one = false;
+          else
+            use_zero = false;
+        }
+        else if ( !use_one && !use_zero && node_data.same_match )
+        {
+          node_data.same_match = false;
+          cut_ref<false>( cuts[index][node_data.best_cut[0]], n, 0 );
+          cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
+          return;
+        }
       }
       else
       {
@@ -2971,6 +2971,9 @@ class emap_impl
   template<bool DO_AREA>
   inline void select_alternatives( node<Ntk> const& n )
   {
+    if constexpr ( DO_AREA )
+      return;
+
     if ( !ps.use_match_alternatives )
       return;
 
@@ -2982,53 +2985,26 @@ class emap_impl
     float g0flow = g0.flow / node_data.est_refs[0];
     float g1flow = g1.flow / node_data.est_refs[1];
 
-    if constexpr ( DO_AREA )
+    /* process for best area */ /* removed check on required since this is executed only during a delay pass */
+    if ( g0.gate != nullptr && g0flow + lib_inv_area < g1flow + epsilon )
     {
-      /* process for best delay */
-      if ( g0.arrival + lib_inv_delay < g1.arrival + epsilon )
-      {
-        node_data.same_match_alternative = true;
-        g1 = g0;
-        g1.gate = nullptr;
-        g1.arrival += lib_inv_delay;
-        g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
-        g0.flow = g0flow;
-        return;
-      }
-      else if ( g1.arrival + lib_inv_delay < g0.arrival + epsilon )
-      {
-        node_data.same_match_alternative = true;
-        g0 = g1;
-        g0.gate = nullptr;
-        g0.arrival += lib_inv_delay;
-        g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
-        g1.flow = g1flow;
-        return;
-      }
+      node_data.same_match_alternative = true;
+      g1 = g0;
+      g1.gate = nullptr;
+      g1.arrival += lib_inv_delay;
+      g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
+      g0.flow = g0flow;
+      return;
     }
-    else
+    else if ( g1.gate != nullptr && g1flow + lib_inv_area < g0flow + epsilon )
     {
-      /* process for best area */ /* removed check on required since this is executed only during a delay pass */
-      if ( g0.gate != nullptr && g0flow + lib_inv_area < g1flow + epsilon )
-      {
-        node_data.same_match_alternative = true;
-        g1 = g0;
-        g1.gate = nullptr;
-        g1.arrival += lib_inv_delay;
-        g1.flow = ( g1.flow + lib_inv_area ) / node_data.est_refs[1];
-        g0.flow = g0flow;
-        return;
-      }
-      else if ( g1.gate != nullptr && g1flow + lib_inv_area < g0flow + epsilon )
-      {
-        node_data.same_match_alternative = true;
-        g0 = g1;
-        g0.gate = nullptr;
-        g0.arrival += lib_inv_delay;
-        g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
-        g1.flow = g1flow;
-        return;
-      }
+      node_data.same_match_alternative = true;
+      g0 = g1;
+      g0.gate = nullptr;
+      g0.arrival += lib_inv_delay;
+      g0.flow = ( g0.flow + lib_inv_area ) / node_data.est_refs[0];
+      g1.flow = g1flow;
+      return;
     }
 
     node_data.same_match_alternative = false;
@@ -3771,7 +3747,7 @@ class emap_impl
     match_phase<DO_AREA>( n, 1u );
 
     /* try to drop one phase */
-    match_drop_phase<DO_AREA, false>( n, 0 );
+    match_drop_phase<DO_AREA, false>( n );
 
     assert( node_data.arrival[0] < node_data.required[0] + epsilon );
     assert( node_data.arrival[1] < node_data.required[1] + epsilon );
@@ -3852,7 +3828,7 @@ class emap_impl
     match_phase_exact<SwitchActivity>( n, 1u );
 
     /* try to drop one phase */
-    match_drop_phase<true, true>( n, 0 );
+    match_drop_phase<true, true>( n );
 
     assert( node_data.arrival[0] < std::numeric_limits<float>::max() );
     assert( node_data.arrival[1] < std::numeric_limits<float>::max() );
@@ -4030,7 +4006,7 @@ class emap_impl
         match_phase_exact<false>( n, 1u );
 
         /* try to drop one phase */
-        match_drop_phase<true, true>( n, 0 );
+        match_drop_phase<true, true>( n );
       }
     }
 

From c2ca9a871a1017082c24741936df788edf626d8a Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 11:22:43 +0200
Subject: [PATCH 13/27] Add inverter cost evaluation in exact area (high
 inverter cost optimization)

---
 include/mockturtle/algorithms/emap.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 5221703cd..5efc6d563 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -2874,8 +2874,8 @@ class emap_impl
           cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
         }
         /* evaluate based on inverter cost */
-        // use_zero = lib_inv_area < node_data.flows[1] + epsilon;
-        // use_one = lib_inv_area < node_data.flows[0] + epsilon;
+        use_zero = lib_inv_area < node_data.flows[1] + epsilon;
+        use_one = lib_inv_area < node_data.flows[0] + epsilon;
 
         if ( use_one && use_zero )
         {

From b9fb641e8a79539bd30e0c24d163da9bf013820d Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 14:26:11 +0200
Subject: [PATCH 14/27] Add pin-specific input arrival times and required time
 constraints to emap

---
 include/mockturtle/algorithms/emap.hpp | 206 +++++++++++++------------
 1 file changed, 111 insertions(+), 95 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 5efc6d563..c7bd54191 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -107,12 +107,18 @@ struct emap_params
     hybrid
   } matching_mode = hybrid;
 
-  /*! \brief Required time for delay optimization. */
+  /*! \brief Target required time (for each PO). */
   double required_time{ 0.0f };
 
   /*! \brief Required time relaxation in percentage (10 = 10%). */
   double relax_required{ 0.0f };
 
+  /*! \brief Custom input arrival times. */
+  std::vector<double> arrival_times{};
+
+  /*! \brief Custom output required times. */
+  std::vector<double> required_times{};
+
   /*! \brief Number of rounds for area flow optimization. */
   uint32_t area_flow_rounds{ 3u };
 
@@ -805,6 +811,10 @@ class emap_impl
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* search for large matches */
     if ( ps.matching_mode == emap_params::structural || CutSize > 6 )
     {
@@ -859,6 +869,10 @@ class emap_impl
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* search for large matches */
     if ( ps.matching_mode == emap_params::structural || CutSize > 6 )
     {
@@ -904,11 +918,15 @@ class emap_impl
 
     auto [res, old2new] = initialize_map_network();
 
-    /* TODO: multi-output support is currently not implemented */
+    /* multi-output support is currently not implemented */
 
     /* compute and save topological order */
     init_topo_order();
 
+    /* init arrival time */
+    if ( !init_arrivals() )
+      return res;
+
     /* compute cuts, matches, and initial mapping */
     if ( !ps.area_oriented_mapping )
     {
@@ -1111,13 +1129,9 @@ class emap_impl
     {
       node_data.flows[0] = 0.0f;
       node_data.best_alternative[0].flow = 0.0f;
-      node_data.arrival[0] = 0.0f;
-      node_data.best_alternative[0].arrival = 0.0f;
       /* PIs have the negative phase implemented with an inverter */
       node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
       node_data.best_alternative[1].flow = lib_inv_area / node_data.est_refs[1];
-      node_data.arrival[1] = lib_inv_delay;
-      node_data.best_alternative[1].arrival = lib_inv_delay;
       /* skip if cuts have been computed before */
       if ( cuts[index].size() == 0 )
       {
@@ -1525,10 +1539,8 @@ class emap_impl
       {
         /* all terminals have flow 0 */
         node_data.flows[0] = 0.0f;
-        node_data.arrival[0] = 0.0f;
         /* PIs have the negative phase implemented with an inverter */
         node_data.flows[1] = lib_inv_area / node_data.est_refs[1];
-        node_data.arrival[1] = lib_inv_delay;
         add_unit_cut( index );
         continue;
       }
@@ -1544,9 +1556,12 @@ class emap_impl
 
       /* try to drop one phase */
       match_drop_phase<DO_AREA, false>( n );
+
+      /* select alternative matches to use */
+      select_alternatives<DO_AREA>( n );
     }
     double area_old = area;
-    bool success = set_mapping_refs<false>();
+    bool success = set_mapping_refs_and_req<DO_AREA, false>();
 
     /* round stats */
     if ( ps.verbose )
@@ -2108,35 +2123,7 @@ class emap_impl
       }
     } );
 
-    double required = delay;
-    /* relax delay constraints */
-    if ( iteration == 0 && ps.required_time == 0.0f && ps.relax_required > 0.0f )
-    {
-      required *= ( 100.0 + ps.relax_required ) / 100.0;
-    }
-
-    /* Global target time constraint */
-    if ( ps.required_time != 0.0f )
-    {
-      if ( ps.required_time < delay - epsilon )
-      {
-        if ( !ps.area_oriented_mapping && iteration == 1 )
-          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
-      }
-      else
-      {
-        required = ps.required_time;
-      }
-    }
-
-    /* set the required time at POs */
-    ntk.foreach_po( [&]( auto const& s ) {
-      const auto index = ntk.node_to_index( ntk.get_node( s ) );
-      if ( ntk.is_complemented( s ) )
-        node_match[index].required[1] = required;
-      else
-        node_match[index].required[0] = required;
-    } );
+    set_output_required_time( iteration == 0 );
 
     /* compute current area and update mapping refs in top-down order */
     area = 0.0f;
@@ -2306,47 +2293,73 @@ class emap_impl
     }
   }
 
-  void compute_required_time( bool exit_early = false )
+  void set_output_required_time( bool warning )
   {
-    for ( auto i = 0u; i < node_match.size(); ++i )
-    {
-      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
-    }
-
-    /* return if mapping is area oriented */
-    if ( ps.area_oriented_mapping )
-      return;
-
     double required = delay;
-
     /* relax delay constraints */
-    if ( iteration == 1 && ps.required_time == 0.0f && ps.relax_required > 0.0f )
+    if ( iteration == 0 && ps.required_time == 0.0f && ps.required_times.empty() && ps.relax_required > 0.0f )
     {
       required *= ( 100.0 + ps.relax_required ) / 100.0;
     }
 
     /* Global target time constraint */
-    if ( ps.required_time != 0.0f )
+    if ( ps.required_times.empty() )
     {
-      if ( ps.required_time < delay - epsilon )
-      {
-        if ( !ps.area_oriented_mapping && iteration == 1 )
-          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
-      }
-      else
+      if ( ps.required_time != 0.0f )
       {
-        required = ps.required_time;
+        if ( ps.required_time < delay - epsilon )
+        {
+          if ( warning )
+            std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f}", ps.required_time ) << std::endl;
+        }
+        else
+        {
+          required = ps.required_time;
+        }
       }
+
+      /* set the required time at POs */
+      ntk.foreach_po( [&]( auto const& s ) {
+        const auto index = ntk.node_to_index( ntk.get_node( s ) );
+        if ( ntk.is_complemented( s ) )
+          node_match[index].required[1] = required;
+        else
+          node_match[index].required[0] = required;
+      } );
+
+      return;
     }
 
-    /* set the required time at POs */
-    ntk.foreach_po( [&]( auto const& s ) {
+    /* Output-specific target time constraint */
+    ntk.foreach_po( [&]( auto const& s, uint32_t i ) {
       const auto index = ntk.node_to_index( ntk.get_node( s ) );
-      if ( ntk.is_complemented( s ) )
-        node_match[index].required[1] = required;
+      uint8_t phase = ntk.is_complemented( s ) ? 1 : 0;
+      if ( node_match[index].arrival[phase] > ps.required_times[i] + epsilon )
+      {
+        /* maintain the same delay */
+        node_match[index].required[phase] = node_match[index].arrival[phase];
+        if ( warning )
+          std::cerr << fmt::format( "[i] MAP WARNING: cannot meet the target required time of {:.2f} at output {}", ps.required_times[i], i ) << std::endl;
+      }
       else
-        node_match[index].required[0] = required;
+      {
+        node_match[index].required[phase] = ps.required_times[i];
+      }
     } );
+  }
+
+  void compute_required_time( bool exit_early = false )
+  {
+    for ( auto i = 0u; i < node_match.size(); ++i )
+    {
+      node_match[i].required[0] = node_match[i].required[1] = std::numeric_limits<float>::max();
+    }
+
+    /* return if mapping is area oriented */
+    if ( ps.area_oriented_mapping )
+      return;
+    
+    set_output_required_time( iteration == 1 );
 
     if ( exit_early )
       return;
@@ -3347,38 +3360,6 @@ class emap_impl
         area_flow[j] = gate.area + cut_leaves_flow( cut, n, phase[j] );
         node_data.phase[phase[j]] = old_phase;
 
-        /* local evaluation for delay (area flow improvement is approximated) */
-      //   if constexpr ( !DO_AREA )
-      //   {
-      //     /* recompute local area flow of previous matches */
-      //     double mapped_flow = node_data.flows[phase[j]];
-
-      //     if ( node_data.multioutput_match[phase[j]] )
-      //     {
-      //       /* recompute estimation for multi-output gate */
-      //       float k_est = 0;
-      //       for ( auto k = 0; k < max_multioutput_output_size; ++k )
-      //       {
-      //         uint32_t index_k = tuple_data[k].node_index;
-      //         auto used_phase = node_match[index_k].supergate[0] == nullptr ? 1 : 0;
-      //         k_est += node_match[index_k].est_refs[used_phase]; /* TODO: review */
-      //       }
-      //       mapped_flow *= k_est;
-      //     }
-      //     else
-      //     {
-      //       auto used_phase = node_data.supergate[0] == nullptr ? 1 : 0; /* TODO: review */
-      //       mapped_flow *= node_data.est_refs[used_phase];
-      //     }
-
-      //     auto const& mapped_cut = cuts[node_index][node_data.best_cut[phase[j]]];
-      //     if ( !compare_map<DO_AREA>( arrival[j], node_data.arrival[phase[j]], area_flow[j], mapped_flow, cut.size(), mapped_cut.size() ) )
-      //     {
-      //       is_best = false;
-      //       break;
-      //     }
-      //   }
-
         /* current version may lead to delay increase */
         est_refs[j] = node_data.est_refs[phase[j]];
       }
@@ -4377,6 +4358,41 @@ class emap_impl
     } );
   }
 
+  bool init_arrivals()
+  {
+    if ( ps.required_times.size() && ps.required_times.size() != ntk.num_pos() )
+    {
+      std::cerr << "[e] MAP ERROR: required time vector does not match the output size of the network" << std::endl;
+      st.mapping_error = true;
+      return false;
+    }
+
+    if ( ps.arrival_times.empty() )
+    {
+      ntk.foreach_pi( [&]( auto const& n ) {
+        auto& node_data = node_match[ntk.node_to_index( n )];
+        node_data.arrival[0] = node_data.best_alternative[0].arrival = 0;
+        node_data.arrival[1] = node_data.best_alternative[1].arrival = lib_inv_delay;
+      } );
+      return true;
+    }
+
+    if ( ps.arrival_times.size() != ntk.num_pis() )
+    {
+      std::cerr << "[e] MAP ERROR: arrival time vector does not match the input size of the network" << std::endl;
+      st.mapping_error = true;
+      return false;
+    }
+
+    ntk.foreach_pi( [&]( auto const& n, uint32_t i ) {
+      auto& node_data = node_match[ntk.node_to_index( n )];
+      node_data.arrival[0] = node_data.best_alternative[0].arrival = ps.arrival_times[i];
+      node_data.arrival[1] = node_data.best_alternative[1].arrival = ps.arrival_times[i] + lib_inv_delay;
+    } );
+
+    return true;
+  }
+
   void finalize_cover( binding_view<klut_network>& res, klut_map& old2new )
   {
     uint32_t multioutput_count = 0;

From f720bead12ca1d515e7c75aed0a2931803a9694b Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 14:37:37 +0200
Subject: [PATCH 15/27] removing standard forward exact area method (completely
 replaced by exact reversed)

---
 include/mockturtle/algorithms/emap.hpp | 244 ++-----------------------
 1 file changed, 18 insertions(+), 226 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index c7bd54191..9b1a3be72 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -131,9 +131,6 @@ struct emap_params
   /*! \brief Number of patterns for switching activity computation. */
   uint32_t switching_activity_patterns{ 2048u };
 
-  /*! \brief Fast area recovery */
-  bool use_fast_area_recovery{ true };
-
   /*! \brief Compute alternatives using a different cost functions */
   bool use_match_alternatives{ true };
 
@@ -972,54 +969,23 @@ class emap_impl
 
     /* compute mapping using exact area */
     i = 0;
-    if ( ps.use_fast_area_recovery )
+    compute_required_time( true );
+    reindex_multioutput_data();
+    while ( i++ < ps.ela_rounds )
     {
-      compute_required_time( true );
-      reindex_multioutput_data();
-      while ( i++ < ps.ela_rounds )
-      {
-        if ( !compute_mapping_exact_reversed<false>() )
-        {
-          return false;
-        }
-      }
-
-      /* compute mapping using exact switching activity estimation */
-      i = 0;
-      while ( i++ < ps.eswp_rounds )
+      if ( !compute_mapping_exact_reversed<false>() )
       {
-        if ( !compute_mapping_exact_reversed<true>() )
-        {
-          return false;
-        }
+        return false;
       }
     }
-    else
-    {
-      while ( i++ < ps.ela_rounds )
-      {
-        compute_required_time();
-        if ( !compute_mapping_exact<false>( i == ps.ela_rounds ) )
-        {
-          return false;
-        }
-      }
-
-      /* compute mapping using exact switching activity estimation */
-      i = 0;
-      while ( i++ < ps.eswp_rounds )
-      {
-        compute_required_time();
-        if ( !compute_mapping_exact<true>( true ) )
-        {
-          return false;
-        }
-      }
 
-      /* cleaning not fully utilized multi-output gates */
-      if ( ps.map_multioutput )
+    /* compute mapping using exact switching activity estimation */
+    i = 0;
+    while ( i++ < ps.eswp_rounds )
+    {
+      if ( !compute_mapping_exact_reversed<true>() )
       {
-        remove_unused_multioutput();
+        return false;
       }
     }
 
@@ -1693,93 +1659,9 @@ class emap_impl
     return success;
   }
 
-  template<bool SwitchActivity>
-  bool compute_mapping_exact( bool last_round )
-  {
-    for ( auto const& n : topo_order )
-    {
-      if ( ntk.is_constant( n ) || ntk.is_pi( n ) )
-        continue;
-
-      /* don't touch box */
-      if constexpr ( has_is_dont_touch_v<Ntk> )
-      {
-        if ( ntk.is_dont_touch( n ) )
-        {
-          if constexpr ( has_has_binding_v<Ntk> )
-          {
-            propagate_data_forward_white_box( n );
-          }
-          continue;
-        }
-      }
-
-      auto index = ntk.node_to_index( n );
-      auto& node_data = node_match[index];
-
-      /* recursively deselect the best cut shared between
-       * the two phases if in use in the cover */
-      if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
-      {
-        uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
-        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
-        cut_deref<SwitchActivity>( best_cut, n, use_phase );
-      }
-      else if ( !node_data.map_refs[0] || !node_data.map_refs[1] )
-      {
-        uint8_t use_phase = node_data.map_refs[0] ? 0 : 1;
-        auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
-        cut_deref<SwitchActivity>( best_cut, n, use_phase );
-        node_data.same_match = true;
-      }
-
-      /* match positive phase */
-      match_phase_exact<SwitchActivity>( n, 0u );
-
-      /* match negative phase */
-      match_phase_exact<SwitchActivity>( n, 1u );
-
-      /* try to drop one phase */
-      match_drop_phase<true, true>( n );
-
-      /* try a multi-output match */
-      if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
-      {
-        bool multi_success = match_multioutput_exact<SwitchActivity>( n, last_round );
-        if ( multi_success )
-          multi_node_update_exact<SwitchActivity>( n );
-      }
-
-      if ( node_match[index].map_refs[0] )
-        assert( node_match[index].arrival[0] < node_match[index].required[0] + epsilon );
-      if ( node_match[index].map_refs[1] )
-        assert( node_match[index].arrival[1] < node_match[index].required[1] + epsilon );
-    }
-
-    double area_old = area;
-    bool success = set_mapping_refs<true>();
-
-    /* round stats */
-    if ( ps.verbose )
-    {
-      float area_gain = float( ( area_old - area ) / area_old * 100 );
-      std::stringstream stats{};
-      if constexpr ( SwitchActivity )
-        stats << fmt::format( "[i] Switching: Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      else
-        stats << fmt::format( "[i] Area     : Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      st.round_stats.push_back( stats.str() );
-    }
-
-    return success;
-  }
-
   template<bool SwitchActivity>
   bool compute_mapping_exact_reversed()
   {
-    /* this method works in reverse topological order: less nodes to update (faster) */
-    /* instead of propagating arrival times forward, it propagates required times backwards */
-
     for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
     {
       if ( ntk.is_constant( *it ) || ntk.is_pi( *it ) )
@@ -1843,7 +1725,7 @@ class emap_impl
       }
 
       /* try to drop one phase */
-      match_drop_phase<true, true>( *it );
+      match_drop_phase<true, true, SwitchActivity>( *it );
 
       /* try a multi-output match */
       if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 )
@@ -2771,7 +2653,7 @@ class emap_impl
     }
   }
 
-  template<bool DO_AREA, bool ELA>
+  template<bool DO_AREA, bool ELA, bool SwitchActivity = false>
   void match_drop_phase( node<Ntk> const& n )
   {
     auto index = ntk.node_to_index( n );
@@ -2887,8 +2769,11 @@ class emap_impl
           cut_ref<false>( cuts[index][node_data.best_cut[1]], n, 1 );
         }
         /* evaluate based on inverter cost */
-        use_zero = lib_inv_area < node_data.flows[1] + epsilon;
-        use_one = lib_inv_area < node_data.flows[0] + epsilon;
+        if constexpr ( !SwitchActivity )
+        {
+          use_zero = lib_inv_area < node_data.flows[1] + epsilon;
+          use_one = lib_inv_area < node_data.flows[0] + epsilon;
+        }
 
         if ( use_one && use_zero )
         {
@@ -3911,99 +3796,6 @@ class emap_impl
 
     return false;
   }
-
-  bool remove_unused_multioutput()
-  {
-    /* TODO: update required times */
-    for ( auto it = topo_order.rbegin(); it != topo_order.rend(); ++it )
-    {
-      if ( ntk.is_constant( *it ) || ntk.is_pi( *it ) )
-        continue;
-
-      auto index = ntk.node_to_index( *it );
-
-      /* get used multi-output gates */
-      if ( node_tuple_match[index] == UINT32_MAX )
-        continue;
-
-      if ( node_match[index].same_match && !node_match[index].multioutput_match[0] )
-        continue;
-
-      if ( !node_match[index].same_match && !( node_match[index].multioutput_match[0] || node_match[index].multioutput_match[1] ) )
-        continue;
-
-      /* check if mapped to multi-output with unused outputs */
-      multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
-
-      bool used = false;
-      bool unused = false;
-      for ( auto j = 0; j < max_multioutput_output_size; ++j )
-      {
-        uint32_t node_index = tuple_data[j].node_index;
-        auto& node_data = node_match[node_index];
-
-        if ( node_data.best_supergate[0] != nullptr && node_data.multioutput_match[0] )
-        {
-          if ( node_data.map_refs[0] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
-            used = true;
-          else
-            unused = true;
-        }
-        else if ( node_data.best_supergate[1] != nullptr && node_data.multioutput_match[1] )
-        {
-          if ( node_data.map_refs[1] > 0 || ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) ) )
-            used = true;
-          else
-            unused = true;
-        }
-      }
-
-      if ( !used || !unused )
-        continue;
-
-      /* remap connected outputs (reverse topo order)*/
-      for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
-      {
-        uint32_t node_index = tuple_data[j].node_index;
-        auto& node_data = node_match[node_index];
-        auto const n = ntk.index_to_node( node_index );
-
-        if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
-          continue;
-
-        /* recursively deselect the best cut shared between
-         * the two phases if in use in the cover */
-        if ( node_data.same_match )
-        {
-          uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
-          auto const& best_cut = cuts[node_index][node_data.best_cut[use_phase]];
-          cut_deref<false>( best_cut, n, use_phase );
-        }
-
-        /* match positive phase */
-        match_phase_exact<false>( n, 0u );
-
-        /* match negative phase */
-        match_phase_exact<false>( n, 1u );
-
-        /* try to drop one phase */
-        match_drop_phase<true, true>( n );
-      }
-    }
-
-    double area_old = area;
-    bool success = set_mapping_refs<true>();
-
-    /* round stats */
-    if ( ps.verbose )
-    {
-      float area_gain = float( ( area_old - area ) / area_old * 100 );
-      std::string stats = fmt::format( "[i] Cleaning : Delay = {:>12.2f}  Area = {:>12.2f}  Gain = {:>5.2f} %  Inverters = {:>5}  Time = {:>5.2f}\n", delay, area, area_gain, inv, to_seconds( clock::now() - time_begin ) );
-      st.round_stats.push_back( stats );
-    }
-
-    return success;
-  }
 #pragma endregion
 
 #pragma region Mapping utils

From 1c5284c2bae687613b21bbc8fabddabdd368bb8f Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 14:47:03 +0200
Subject: [PATCH 16/27] Updating tests

---
 test/algorithms/emap.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/test/algorithms/emap.cpp b/test/algorithms/emap.cpp
index 0e95a7a26..a52717971 100644
--- a/test/algorithms/emap.cpp
+++ b/test/algorithms/emap.cpp
@@ -170,8 +170,7 @@ TEST_CASE( "Emap on full adder 2", "[emap]" )
 
   emap_params ps;
   ps.cut_enumeration_ps.minimize_truth_table = false;
-  ps.use_fast_area_recovery = false;
-  ps.ela_rounds = 0;
+  ps.ela_rounds = 1;
   ps.eswp_rounds = 2;
   emap_stats st;
   binding_view<klut_network> luts = emap_klut( aig, lib, ps, &st );
@@ -244,8 +243,7 @@ TEST_CASE( "Emap on full adder 2 with cells", "[emap]" )
 
   emap_params ps;
   ps.cut_enumeration_ps.minimize_truth_table = false;
-  ps.use_fast_area_recovery = false;
-  ps.ela_rounds = 0;
+  ps.ela_rounds = 1;
   ps.eswp_rounds = 2;
   emap_stats st;
   cell_view<block_network> luts = emap( aig, lib, ps, &st );
@@ -382,12 +380,12 @@ TEST_CASE( "Emap on multiplier with multi-output gates", "[emap]" )
 
   const float eps{ 0.005f };
 
-  CHECK( luts.size() == 233u );
+  CHECK( luts.size() == 235u );
   CHECK( luts.num_pis() == 16u );
   CHECK( luts.num_pos() == 16u );
-  CHECK( luts.num_gates() == 215u );
-  CHECK( st.area > 575.0f - eps );
-  CHECK( st.area < 575.0f + eps );
+  CHECK( luts.num_gates() == 217u );
+  CHECK( st.area > 612.0f - eps );
+  CHECK( st.area < 612.0f + eps );
   CHECK( st.delay > 33.60f - eps );
   CHECK( st.delay < 33.60f + eps );
   CHECK( st.multioutput_gates == 40 );

From e202d8d19d7c4a10a82644b57dd947ff99e22982 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 15:19:41 +0200
Subject: [PATCH 17/27] Adding tests on custom required times, required time
 relaxation, and arrival times

---
 test/algorithms/emap.cpp | 179 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/test/algorithms/emap.cpp b/test/algorithms/emap.cpp
index a52717971..514c405e8 100644
--- a/test/algorithms/emap.cpp
+++ b/test/algorithms/emap.cpp
@@ -652,6 +652,185 @@ TEST_CASE( "Emap with hybrid matching", "[emap]" )
   CHECK( st.delay < 5.8f + eps );
 }
 
+TEST_CASE( "Emap with arrival times", "[emap]" )
+{
+  std::vector<gate> gates;
+  /* Purpose: map a small AIG with per-input arrival times and pin the resulting size, area, and delay. */
+  std::istringstream in( large_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+  /* eight primary inputs feeding seven AND nodes (some fanins complemented); f7 is the only output */
+  aig_network aig;
+  const auto a = aig.create_pi();
+  const auto b = aig.create_pi();
+  const auto c = aig.create_pi();
+  const auto d = aig.create_pi();
+  const auto e = aig.create_pi();
+  const auto f = aig.create_pi();
+  const auto g = aig.create_pi();
+  const auto h = aig.create_pi();
+
+  const auto f1 = aig.create_and( !a, b );
+  const auto f2 = aig.create_and( f1, !c );
+  const auto f3 = aig.create_and( d, e );
+  const auto f4 = aig.create_and( f, !g );
+  const auto f5 = aig.create_and( f4, h );
+  const auto f6 = aig.create_and( f2, f3 );
+  const auto f7 = aig.create_and( f5, f6 );
+
+  aig.create_po( f7 );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  emap_stats st;
+  /* eight arrival times growing from 0.0 to 7.0 (presumably indexed in PI creation order, a..h; confirm) */
+  ps.arrival_times = std::vector<double>( 8 );
+  ps.arrival_times[0] = 0.0;
+  ps.arrival_times[1] = 1.0;
+  ps.arrival_times[2] = 2.0;
+  ps.arrival_times[3] = 3.0;
+  ps.arrival_times[4] = 4.0;
+  ps.arrival_times[5] = 5.0;
+  ps.arrival_times[6] = 6.0;
+  ps.arrival_times[7] = 7.0;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f };
+  /* structural counts are compared exactly; area and delay are compared within +/- eps */
+  CHECK( ntk.size() == 27u );
+  CHECK( ntk.num_pis() == 8u );
+  CHECK( ntk.num_pos() == 1u );
+  CHECK( ntk.num_gates() == 17u );
+  CHECK( st.area > 24.0f - eps );
+  CHECK( st.area < 24.0f + eps );
+  CHECK( st.delay > 12.6f - eps );
+  CHECK( st.delay < 12.6f + eps );
+}
+
+TEST_CASE( "Emap with global required times", "[emap]" )
+{
+  std::vector<gate> gates;
+  /* Purpose: map an adder over two 8-signal operands under one global required time and check size, area, and the delay bound. */
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* two operand vectors of eight primary inputs each, plus a constant-false initial carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+  /* presumably leaves the sum in `a` and the carry-out in `carry` (both become outputs below); confirm */
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); } );
+  aig.create_po( carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  ps.required_time = 20.0; // single bound set above the real delay of 15.7
+  emap_stats st;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f };
+  /* area is compared within +/- eps; delay only has an upper bound at the required time */
+  CHECK( ntk.size() == 34 );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps );
+}
+
+TEST_CASE( "Emap with required times", "[emap]" )
+{
+  std::vector<gate> gates;
+  /* Purpose: map an adder over two 8-signal operands with one required time per output and check size, area, and the delay bound. */
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* two operand vectors of eight primary inputs each, plus a constant-false initial carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+  /* presumably leaves the sum in `a` and the carry-out in `carry` (both become outputs below); confirm */
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  // no global required_time is set here; each output gets its own bound below (real delay 15.7)
+  emap_stats st;
+  /* one required time is pushed per created output: 19.0 for each signal of `a`, 20.0 for the carry */
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); ps.required_times.push_back( 19.0 ); } );
+  aig.create_po( carry );
+  ps.required_times.push_back( 20.0 );
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f };
+  /* area is compared within +/- eps; delay only has an upper bound at the largest required time */
+  CHECK( ntk.size() == 34 );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps );
+}
+
+TEST_CASE( "Emap with required time relaxation", "[emap]" )
+{
+  std::vector<gate> gates;
+  /* Purpose: map an adder over two 8-signal operands with `relax_required` set and check size, area, and the delay bound. */
+  std::istringstream in( test_library );
+  auto result = lorina::read_genlib( in, genlib_reader( gates ) );
+  CHECK( result == lorina::return_code::success );
+
+  tech_library<6> lib( gates );
+
+  aig_network aig;
+  /* two operand vectors of eight primary inputs each, plus a constant-false initial carry */
+  std::vector<aig_network::signal> a( 8 ), b( 8 );
+  std::generate( a.begin(), a.end(), [&aig]() { return aig.create_pi(); } );
+  std::generate( b.begin(), b.end(), [&aig]() { return aig.create_pi(); } );
+  auto carry = aig.get_constant( false );
+  /* presumably leaves the sum in `a` and the carry-out in `carry` (both become outputs below); confirm */
+  carry_ripple_adder_inplace( aig, a, b, carry );
+
+  std::for_each( a.begin(), a.end(), [&]( auto f ) { aig.create_po( f ); } );
+  aig.create_po( carry );
+
+  emap_params ps;
+  ps.matching_mode = emap_params::boolean;
+  ps.relax_required = 27.5; // presumably a percentage over the minimal delay (confirm); real delay 15.7
+  emap_stats st;
+
+  cell_view<block_network> ntk = emap<6>( aig, lib, ps, &st );
+
+  const float eps{ 0.005f };
+  /* NOTE(review): if the value is a percentage, 15.7 * 1.275 is about 20.02, slightly above the 20.0 + eps bound checked below; confirm intended */
+  CHECK( ntk.size() == 34 );
+  CHECK( ntk.num_pis() == 16u );
+  CHECK( ntk.num_pos() == 9u );
+  CHECK( ntk.num_gates() == 16u );
+  CHECK( st.area > 63.0f - eps );
+  CHECK( st.area < 63.0f + eps );
+  CHECK( st.delay < 20.0f + eps );
+}
+
 TEST_CASE( "Emap with supergates", "[emap]" )
 {
   std::vector<gate> gates;

From 42f2e1399e27ff6da0b8335f66835c977d420053 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 15:28:41 +0200
Subject: [PATCH 18/27] Reverting experiment file

---
 experiments/emap.cpp | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index fa2bb17c1..11d465f8b 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -55,7 +55,7 @@ int main()
 
   /* library to map to technology */
   fmt::print( "[i] processing technology library\n" );
-  std::string library = "asap7";
+  std::string library = "multioutput";
   std::vector<gate> gates;
   std::ifstream in( cell_libraries_path( library ) );
 
@@ -66,9 +66,9 @@ int main()
 
   tech_library_params tps;
   tps.verbose = true;
-  tech_library<6> tech_lib( gates, tps );
+  tech_library<9> tech_lib( gates, tps );
 
-  for ( auto const& benchmark : iwls_benchmarks() )
+  for ( auto const& benchmark : epfl_benchmarks() )
   {
     fmt::print( "[i] processing {}\n", benchmark );
 
@@ -78,32 +78,27 @@ int main()
       continue;
     }
 
-    // if ( aig.num_gates() > 100000 )
-    //   continue;
-
     /* remove structural redundancies */
-    // aig_balancing_params bps;
-    // bps.minimize_levels = false;
-    // bps.fast_mode = true;
-    // aig_balance( aig, bps );
+    aig_balancing_params bps;
+    bps.minimize_levels = false;
+    bps.fast_mode = true;
+    aig_balance( aig, bps );
 
     const uint32_t size_before = aig.num_gates();
     const uint32_t depth_before = depth_view( aig ).depth();
 
     emap_params ps;
-    ps.matching_mode = emap_params::boolean;
+    ps.matching_mode = emap_params::hybrid;
     ps.area_oriented_mapping = false;
-    ps.map_multioutput = false;
-    ps.verbose = true;
+    ps.map_multioutput = true;
+    ps.relax_required = 0;
     emap_stats st;
-    cell_view<block_network> res = emap<6>( aig, tech_lib, ps, &st );
+    cell_view<block_network> res = emap<9>( aig, tech_lib, ps, &st );
 
     names_view res_names{ res };
     restore_network_name( aig, res_names );
     restore_pio_names_by_order( aig, res_names );
-    // const auto cec = benchmark == "hyp" ? true : abc_cec_mapped_cell( res_names, benchmark, library );
-    // std::cout << fmt::format( "[i] CEC = {}\n", cec );
-    const auto cec = false; /* don't run CEC */
+    const auto cec = benchmark == "hyp" ? true : abc_cec_mapped_cell( res_names, benchmark, library );
 
     /* write verilog netlist */
     // write_verilog_with_cell( res_names, benchmark + "_mapped.v" );

From cc3babc2fbf805ee79a65e68ac1a543bffeaa763 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 15:31:46 +0200
Subject: [PATCH 19/27] Formatting and changing output text when loading a
 library

---
 include/mockturtle/algorithms/emap.hpp   | 15 +++++++--------
 include/mockturtle/utils/super_utils.hpp |  4 ++--
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 9b1a3be72..10448cba0 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -682,8 +682,8 @@ struct best_gate_emap
   float area;
   float flow;
   unsigned phase : 16;
-  unsigned cut   : 12;
-  unsigned size  :  4;
+  unsigned cut : 12;
+  unsigned size : 4;
 };
 
 template<unsigned NInputs>
@@ -1114,7 +1114,6 @@ class emap_impl
     {
       if ( ntk.is_dont_touch( n ) )
       {
-        
         warning_box |= initialize_box( n );
         return false;
       }
@@ -2061,7 +2060,7 @@ class emap_impl
         if ( ps.use_match_alternatives )
           refine_best_matches( *it );
       }
-      
+
       unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
       if ( node_data.best_supergate[use_phase] == nullptr )
       {
@@ -2240,7 +2239,7 @@ class emap_impl
     /* return if mapping is area oriented */
     if ( ps.area_oriented_mapping )
       return;
-    
+
     set_output_required_time( iteration == 1 );
 
     if ( exit_early )
@@ -3306,7 +3305,7 @@ class emap_impl
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
-        node_data.area[mapped_phase] = area[j];                  /* partial area contribution */
+        node_data.area[mapped_phase] = area[j]; /* partial area contribution */
         node_data.flows[mapped_phase] = flow_sum_neg;
 
         assert( node_data.arrival[mapped_phase] < node_data.required[mapped_phase] + epsilon );
@@ -5746,7 +5745,7 @@ class emap_impl
  * The function takes the size of the cuts in the template parameter `CutSize`.
  *
  * The function returns a block network that supports multi-output cells.
- * 
+ *
  * The novelties of this mapper are contained in 2 publications:
  * - A. Tempia Calvino and G. De Micheli, "Technology Mapping Using Multi-Output Library Cells," ICCAD, 2023.
  * - G. Radi, A. Tempia Calvino, and G. De Micheli, "In Medio Stat Virtus: Combining Boolean and Pattern Matching," ASP-DAC, 2024.
@@ -5805,7 +5804,7 @@ cell_view<block_network> emap( Ntk const& ntk, tech_library<NInputs, Configurati
  * The function takes the size of the cuts in the template parameter `CutSize`.
  *
  * The function returns a k-LUT network. Each LUT abstacts a gate of the technology library.
- * 
+ *
  * The novelties of this mapper are contained in 2 publications:
  * - A. Tempia Calvino and G. De Micheli, "Technology Mapping Using Multi-Output Library Cells," ICCAD, 2023.
  * - G. Radi, A. Tempia Calvino, and G. De Micheli, "In Medio Stat Virtus: Combining Boolean and Pattern Matching," ASP-DAC, 2024.
diff --git a/include/mockturtle/utils/super_utils.hpp b/include/mockturtle/utils/super_utils.hpp
index a2771f726..c7a74b1c1 100644
--- a/include/mockturtle/utils/super_utils.hpp
+++ b/include/mockturtle/utils/super_utils.hpp
@@ -212,8 +212,8 @@ class super_utils
 
     if ( _ps.verbose )
     {
-      std::cout << fmt::format( "[i] Loading {} simple cells in the library\n", simple_gates_size + large_gates );
-      std::cout << fmt::format( "[i] Loading {} multi-output cells in the library\n", _multioutput_gates.size() );
+      std::cout << fmt::format( "[i] Loading {} simple library cells\n", simple_gates_size + large_gates );
+      std::cout << fmt::format( "[i] Loading {} multi-output library cells\n", _multioutput_gates.size() );
     }
 
     if ( ignored > 0 )

From 2e8855fe18d8d352aa4668543ce4cafdfffc5602 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 15:46:19 +0200
Subject: [PATCH 20/27] Updating documentation

---
 docs/algorithms/mapper.rst             | 166 +++++++++++++------------
 include/mockturtle/algorithms/emap.hpp |   2 +-
 2 files changed, 86 insertions(+), 82 deletions(-)

diff --git a/docs/algorithms/mapper.rst b/docs/algorithms/mapper.rst
index 816d4f6b9..14360efad 100644
--- a/docs/algorithms/mapper.rst
+++ b/docs/algorithms/mapper.rst
@@ -1,3 +1,87 @@
+Extended technology mapping
+---------------------------
+
+**Header:** ``mockturtle/algorithms/emap.hpp``
+
+The command `emap` stands for extended mapper. It supports large
+library cells, of more than 6 inputs, and can perform matching using 3
+different methods: Boolean, pattern, or hybrid. The current version
+can map to 2-output gates, such as full adders and half adders,
+and provides a 2x speedup in mapping time compared to command `map`
+for similar or better quality. Similarly to `map`, the implementation
+is independent of the underlying graph representation.
+Additionally, `emap` supports "don't touch" white boxes (gates).
+
+Command `emap` can return the mapped network in two formats.
+Command `emap` returns a `cell_view<block_network>` that supports
+multi-output cells. Command `emap_klut` returns a `binding_view<klut_network>`
+similarly to command `map`.
+
+The following example shows how to perform delay-oriented technology mapping
+from an and-inverter graph using large cells up to 9 inputs:
+
+.. code-block:: c++
+
+   aig_network aig = ...;
+
+   /* read cell library in genlib format */
+   std::vector<gate> gates;
+   std::ifstream in( ... );
+   lorina::read_genlib( in, genlib_reader( gates ) );
+   tech_library<9> tech_lib( gates );
+
+   /* perform technology mapping */
+   cell_view<block_network> res = emap<9>( aig, tech_lib );
+
+The next example performs area-oriented graph mapping using multi-output cells:
+
+.. code-block:: c++
+
+   aig_network aig = ...;
+
+   /* read cell library in genlib format */
+   std::vector<gate> gates;
+   std::ifstream in( ... );
+   lorina::read_genlib( in, genlib_reader( gates ) );
+   tech_library tech_lib( gates );
+
+   /* perform technology mapping */
+   emap_params ps;
+   ps.area_oriented_mapping = true;
+   ps.map_multioutput = true;
+   cell_view<block_network> res = emap( aig, tech_lib, ps );
+
+In this case, `emap` is used to return a `block_network`, which can represent multi-output
+cells as single nodes. Alternatively, `emap_klut` can also be used, but multi-output cells
+would be represented by single-output nodes.
+
+The maximum number of cuts stored for each node is limited to 20.
+To increase this limit, change `max_cut_num` in `emap`.
+
+You can set the input arrival times and output required times using the parameters `arrival_times`
+and `required_times`. Moreover, it is possible to ask for a required time relaxation. For instance,
+if we want to map a network with an increase of 10% over its minimal delay, we can set
+`relax_required` to 10.
+
+For further details and usage scenarios of `emap`, such as white boxes, please check the
+related tests.
+
+**Parameters and statistics**
+
+.. doxygenstruct:: mockturtle::emap_params
+   :members:
+
+.. doxygenstruct:: mockturtle::emap_stats
+   :members:
+
+**Algorithm**
+
+.. doxygenfunction:: mockturtle::emap(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_klut(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_node_map(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
+.. doxygenfunction:: mockturtle::emap_load_mapping(Ntk&)
+
+
 Technology mapping and network conversion
 -----------------------------------------
 
@@ -136,84 +220,4 @@ To increase this limit, change `max_cut_num` in `fast_network_cuts`.
 **Algorithm**
 
 .. doxygenfunction:: mockturtle::map(Ntk const&, tech_library<NInputs, Configuration> const&, map_params const&, map_stats*)
-.. doxygenfunction:: mockturtle::map(Ntk&, exact_library<NtkDest, RewritingFn, NInputs> const&, map_params const&, map_stats*)
-
-
-
-Extended technology mapping
----------------------------
-
-**Header:** ``mockturtle/algorithms/emap.hpp``
-
-The command `emap` stands for extended mapper. It supports large
-library cells, of more than 6 inputs, and can perform matching using 3
-different methods: Boolean, pattern, or hybrid. The current version
-can map to 2-output gates, such as full adders and half adders,
-and provides a 2x speedup in mapping time compared to command `map`
-for similar or better quality. Similarly, to `map`, the implementation
-is independent of the underlying graph representation.
-Additionally, `emap` supports "don't touch" white boxes (gates).
-
-Command `emap` can return the mapped network in two formats.
-Command `emap` returns a `cell_view<block_network>` that supports
-multi-output cells. Command `emap_klut` returns a `binding_view<klut_network>`
-similarly as command `map`.
-
-The following example shows how to perform delay-oriented technology mapping
-from an and-inverter graph using large cells up to 9 inputs:
-
-.. code-block:: c++
-
-   aig_network aig = ...;
-
-   /* read cell library in genlib format */
-   std::vector<gate> gates;
-   std::ifstream in( ... );
-   lorina::read_genlib( in, genlib_reader( gates ) )
-   tech_library<9> tech_lib( gates );
-
-   /* perform technology mapping */
-   cell_view<block_network> res = emap<9>( aig, tech_lib );
-
-The next example performs area-oriented graph mapping using multi-output cells:
-
-.. code-block:: c++
-
-   aig_network aig = ...;
-
-   /* read cell library in genlib format */
-   std::vector<gate> gates;
-   std::ifstream in( ... );
-   lorina::read_genlib( in, genlib_reader( gates ) )
-   tech_library tech_lib( gates );
-
-   /* perform technology mapping */
-   emap_params ps;
-   ps.area_oriented_mapping = true;
-   ps.map_multioutput = true;
-   cell_view<block_network> res = emap( aig, tech_lib, ps );
-
-In this case, `emap` is used to return a `block_network`, which can respresent multi-output
-cells as single nodes. Alternatively, also `emap_klut` can be used but multi-output cells
-would be reporesented by single-output nodes.
-
-The maximum number of cuts stored for each node is limited to 32.
-To increase this limit, change `max_cut_num` in `emap`.
-
-For further details and usage scenarios of `emap`, such as white boxes, please check the
-related tests.
-
-**Parameters and statistics**
-
-.. doxygenstruct:: mockturtle::emap_params
-   :members:
-
-.. doxygenstruct:: mockturtle::emap_stats
-   :members:
-
-**Algorithm**
-
-.. doxygenfunction:: mockturtle::emap(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_klut(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_node_map(Ntk const&, tech_library<NInputs, Configuration> const&, emap_params const&, emap_stats*)
-.. doxygenfunction:: mockturtle::emap_load_mapping(Ntk&)
\ No newline at end of file
+.. doxygenfunction:: mockturtle::map(Ntk&, exact_library<NtkDest, NInputs> const&, map_params const&, map_stats*)
\ No newline at end of file
diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 10448cba0..bc29a2240 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -131,7 +131,7 @@ struct emap_params
   /*! \brief Number of patterns for switching activity computation. */
   uint32_t switching_activity_patterns{ 2048u };
 
-  /*! \brief Compute alternatives using a different cost functions */
+  /*! \brief Compute area-oriented alternative matches */
   bool use_match_alternatives{ true };
 
   /*! \brief Remove the cuts that are contained in others */

From 653427c0ff496c40660b012bb54f5193feaaa024 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Wed, 1 May 2024 16:01:06 +0200
Subject: [PATCH 21/27] Changing names and formatting

---
 include/mockturtle/algorithms/emap.hpp | 155 ++++++++++++-------------
 1 file changed, 75 insertions(+), 80 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index bc29a2240..b9bf5875c 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -690,7 +690,7 @@ template<unsigned NInputs>
 struct node_match_emap
 {
   /* best gate match for positive and negative output phases */
-  supergate<NInputs> const* best_supergate[2];
+  supergate<NInputs> const* best_gate[2];
   /* alternative best gate for positibe and negative output phase */
   best_gate_emap<NInputs> best_alternative[2];
   /* fanin pin phases for both output phases */
@@ -699,8 +699,6 @@ struct node_match_emap
   uint16_t best_cut[2];
   /* node is mapped using only one phase */
   bool same_match;
-  /* node alternative uses only one phase */
-  bool same_match_alternative;
   /* node is mapped to a multi-output gate */
   bool multioutput_match[2];
 
@@ -1484,7 +1482,7 @@ class emap_impl
       auto const index = ntk.node_to_index( n );
       auto& node_data = node_match[index];
 
-      node_data.best_supergates[0] = node_data.best_supergates[1] = nullptr;
+      node_data.best_gate[0] = node_data.best_gate[1] = nullptr;
       node_data.same_match = 0;
       node_data.multioutput_match[0] = node_data.multioutput_match[1] = false;
       node_data.required[0] = node_data.required[1] = std::numeric_limits<float>::max();
@@ -1689,7 +1687,7 @@ class emap_impl
 
       /* recursively deselect the best cut shared between
        * the two phases if in use in the cover */
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
       double old_required = -1;
       if ( node_data.same_match )
       {
@@ -1785,10 +1783,10 @@ class emap_impl
     auto& node_data = node_match[index];
 
     /* propagate required time through the leaves */
-    unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
+    unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
     unsigned other_phase = use_phase ^ 1;
 
-    assert( node_data.best_supergate[0] != nullptr || node_data.best_supergate[1] != nullptr );
+    assert( node_data.best_gate[0] != nullptr || node_data.best_gate[1] != nullptr );
     // assert( node_data.map_refs[0] || node_data.map_refs[1] );
 
     /* propagate required time over the output inverter if present */
@@ -1806,7 +1804,7 @@ class emap_impl
     {
       auto ctr = 0u;
       auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
-      auto const& supergate = node_data.best_supergate[use_phase];
+      auto const& supergate = node_data.best_gate[use_phase];
       for ( auto leaf : best_cut )
       {
         auto phase = ( node_data.phase[use_phase] >> ctr ) & 1;
@@ -1819,7 +1817,7 @@ class emap_impl
     {
       auto ctr = 0u;
       auto const& best_cut = cuts[index][node_data.best_cut[other_phase]];
-      auto const& supergate = node_data.best_supergate[other_phase];
+      auto const& supergate = node_data.best_gate[other_phase];
       for ( auto leaf : best_cut )
       {
         auto phase = ( node_data.phase[other_phase] >> ctr ) & 1;
@@ -1865,7 +1863,7 @@ class emap_impl
         if ( node_data.map_refs[0] || node_data.map_refs[1] )
         {
           /* if used and not available in the library launch a mapping error */
-          if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+          if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           {
             std::cerr << "[e] MAP ERROR: technology library does not contain constant gates, impossible to perform mapping" << std::endl;
             st.mapping_error = true;
@@ -1899,9 +1897,9 @@ class emap_impl
         }
       }
 
-      unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
+      unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
 
-      if ( node_data.best_supergate[use_phase] == nullptr )
+      if ( node_data.best_gate[use_phase] == nullptr )
       {
         /* Library is not complete, mapping is not possible */
         std::cerr << "[e] MAP ERROR: technology library is not complete, impossible to perform mapping" << std::endl;
@@ -2020,7 +2018,7 @@ class emap_impl
         if ( node_match[index].map_refs[0] || node_match[index].map_refs[1] )
         {
           /* if used and not available in the library launch a mapping error */
-          if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+          if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           {
             std::cerr << "[e] MAP ERROR: technology library does not contain constant gates, impossible to perform mapping" << std::endl;
             st.mapping_error = true;
@@ -2061,8 +2059,8 @@ class emap_impl
           refine_best_matches( *it );
       }
 
-      unsigned use_phase = node_data.best_supergate[0] == nullptr ? 1u : 0u;
-      if ( node_data.best_supergate[use_phase] == nullptr )
+      unsigned use_phase = node_data.best_gate[0] == nullptr ? 1u : 0u;
+      if ( node_data.best_gate[use_phase] == nullptr )
       {
         /* Library is not complete, mapping is not possible */
         std::cerr << "[e] MAP ERROR: technology library is not complete, impossible to perform mapping" << std::endl;
@@ -2310,16 +2308,16 @@ class emap_impl
         }
       }
 
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
 
       /* compute arrival of use_phase */
-      supergate<NInputs> const* best_supergate = node_data.best_supergate[use_phase];
+      supergate<NInputs> const* best_gate = node_data.best_gate[use_phase];
       double worst_arrival = 0;
       uint16_t best_phase = node_data.phase[use_phase];
       auto ctr = 0u;
       for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
       {
-        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
         worst_arrival = std::max( worst_arrival, arrival_pin );
         ++ctr;
       }
@@ -2345,15 +2343,15 @@ class emap_impl
         continue;
       }
 
-      assert( node_data.best_supergate[use_phase] != nullptr );
+      assert( node_data.best_gate[use_phase] != nullptr );
 
-      best_supergate = node_data.best_supergate[use_phase];
+      best_gate = node_data.best_gate[use_phase];
       worst_arrival = 0;
       best_phase = node_data.phase[use_phase];
       ctr = 0u;
       for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
       {
-        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+        double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
         worst_arrival = std::max( worst_arrival, arrival_pin );
         ++ctr;
       }
@@ -2396,16 +2394,16 @@ class emap_impl
   {
     uint32_t index = ntk.node_to_index( n );
     auto& node_data = node_match[index];
-    uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+    uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
 
     /* compute arrival of use_phase */
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[use_phase];
+    supergate<NInputs> const* best_gate = node_data.best_gate[use_phase];
     double worst_arrival = 0;
     uint16_t best_phase = node_data.phase[use_phase];
     auto ctr = 0u;
     for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
     {
-      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
       worst_arrival = std::max( worst_arrival, arrival_pin );
       ++ctr;
     }
@@ -2419,15 +2417,15 @@ class emap_impl
       return;
     }
 
-    assert( node_data.best_supergate[0] != nullptr );
+    assert( node_data.best_gate[0] != nullptr );
 
-    best_supergate = node_data.best_supergate[use_phase];
+    best_gate = node_data.best_gate[use_phase];
     worst_arrival = 0;
     best_phase = node_data.phase[use_phase];
     ctr = 0u;
     for ( auto l : cuts[index][node_data.best_cut[use_phase]] )
     {
-      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_supergate->tdelay[ctr];
+      double arrival_pin = node_match[l].arrival[( best_phase >> ctr ) & 1] + best_gate->tdelay[ctr];
       worst_arrival = std::max( worst_arrival, arrival_pin );
       ++ctr;
     }
@@ -2442,7 +2440,7 @@ class emap_impl
     auto& node_data = node_match[index];
     uint32_t cut_index = 0u;
 
-    node_data.best_supergate[phase] = nullptr;
+    node_data.best_gate[phase] = nullptr;
     node_data.arrival[phase] = std::numeric_limits<float>::max();
     node_data.flows[phase] = std::numeric_limits<float>::max();
     node_data.area[phase] = std::numeric_limits<float>::max();
@@ -2518,7 +2516,7 @@ class emap_impl
 
         if ( !skip && compare_map<DO_AREA>( worst_arrival, node_data.arrival[phase], area_local, node_data.flows[phase], cut->size(), best_size ) )
         {
-          node_data.best_supergate[phase] = &gate;
+          node_data.best_gate[phase] = &gate;
           node_data.arrival[phase] = worst_arrival;
           node_data.flows[phase] = area_local;
           node_data.best_cut[phase] = cut_index;
@@ -2558,23 +2556,23 @@ class emap_impl
     auto index = ntk.node_to_index( n );
 
     auto& node_data = node_match[index];
-    supergate<NInputs> const* best_supergate = node_data.best_supergate[phase];
+    supergate<NInputs> const* best_gate = node_data.best_gate[phase];
 
     /* unmap multioutput */
     if ( node_data.multioutput_match[phase] )
     {
       /* dereference multi-output */
-      if ( !node_data.same_match && best_supergate != nullptr && node_data.map_refs[phase] )
+      if ( !node_data.same_match && best_gate != nullptr && node_data.map_refs[phase] )
       {
         auto const& cut = multi_cut_set[node_data.best_cut[phase]][0];
         cut_deref<SwitchActivity>( cut, n, phase );
       }
-      best_supergate = nullptr;
+      best_gate = nullptr;
       node_data.multioutput_match[phase] = false;
     }
 
     /* recompute best match info */
-    if ( best_supergate != nullptr )
+    if ( best_gate != nullptr )
     {
       /* if cut is implemented, remove it from the cover */
       if ( !node_data.same_match && node_data.map_refs[phase] )
@@ -2632,7 +2630,7 @@ class emap_impl
           best_size = cut->size();
           best_cut = cut_index;
           best_phase = gate_polarity;
-          best_supergate = &gate;
+          best_gate = &gate;
         }
       }
 
@@ -2644,7 +2642,7 @@ class emap_impl
     node_data.area[phase] = best_area;
     node_data.best_cut[phase] = best_cut;
     node_data.phase[phase] = best_phase;
-    node_data.best_supergate[phase] = best_supergate;
+    node_data.best_gate[phase] = best_gate;
 
     if ( !node_data.same_match && node_data.map_refs[phase] )
     {
@@ -2665,7 +2663,7 @@ class emap_impl
     bool use_one = false;
 
     /* only one phase is matched */
-    if ( node_data.best_supergate[0] == nullptr )
+    if ( node_data.best_gate[0] == nullptr )
     {
       set_match_complemented_phase( index, 1, worst_arrival_npos );
       if constexpr ( ELA )
@@ -2675,7 +2673,7 @@ class emap_impl
       }
       return;
     }
-    else if ( node_data.best_supergate[1] == nullptr )
+    else if ( node_data.best_gate[1] == nullptr )
     {
       set_match_complemented_phase( index, 0, worst_arrival_nneg );
       if constexpr ( ELA )
@@ -2856,7 +2854,7 @@ class emap_impl
     auto& node_data = node_match[index];
     auto phase_n = phase ^ 1;
     node_data.same_match = true;
-    node_data.best_supergate[phase_n] = nullptr;
+    node_data.best_gate[phase_n] = nullptr;
     node_data.best_cut[phase_n] = node_data.best_cut[phase];
     node_data.phase[phase_n] = node_data.phase[phase];
     node_data.arrival[phase_n] = worst_arrival_n;
@@ -2885,7 +2883,6 @@ class emap_impl
     /* process for best area */ /* removed check on required since this is executed only during a delay pass */
     if ( g0.gate != nullptr && g0flow + lib_inv_area < g1flow + epsilon )
     {
-      node_data.same_match_alternative = true;
       g1 = g0;
       g1.gate = nullptr;
       g1.arrival += lib_inv_delay;
@@ -2895,7 +2892,6 @@ class emap_impl
     }
     else if ( g1.gate != nullptr && g1flow + lib_inv_area < g0flow + epsilon )
     {
-      node_data.same_match_alternative = true;
       g0 = g1;
       g0.gate = nullptr;
       g0.arrival += lib_inv_delay;
@@ -2904,7 +2900,6 @@ class emap_impl
       return;
     }
 
-    node_data.same_match_alternative = false;
     g0.flow = g0flow;
     g1.flow = g1flow;
   }
@@ -2923,9 +2918,9 @@ class emap_impl
       if ( node_data.same_match )
       {
         /* pick best implementation between the two alternatives */
-        unsigned best_match_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+        unsigned best_match_phase = node_data.best_gate[0] == nullptr ? 1 : 0;
         unsigned use_phase = g0.gate == nullptr ? 1 : 0;
-        if ( !node_data.same_match_alternative )
+        if ( g0.gate != nullptr && g1.gate != nullptr )
         {
           if ( g0.arrival > node_data.required[0] + epsilon || g1.arrival > node_data.required[1] + epsilon )
             return;
@@ -2952,12 +2947,12 @@ class emap_impl
         if ( g0.gate != nullptr && g0.arrival < node_data.required[0] + epsilon )
         {
           node_data.same_match = false;
-          refine_best_matches_copy_refinement( n, 0, node_data.same_match_alternative && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon );
+          refine_best_matches_copy_refinement( n, 0, g1.gate == nullptr && g0.arrival + lib_inv_delay < node_data.required[1] + epsilon );
         }
         if ( g1.gate != nullptr && g1.arrival < node_data.required[1] + epsilon )
         {
           node_data.same_match = false;
-          refine_best_matches_copy_refinement( n, 1, node_data.same_match_alternative && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon );
+          refine_best_matches_copy_refinement( n, 1, g0.gate == nullptr && g1.arrival + lib_inv_delay < node_data.required[0] + epsilon );
         }
       }
     }
@@ -2993,7 +2988,7 @@ class emap_impl
     auto& node_data = node_match[index];
     best_gate_emap<NInputs>& bg = node_data.best_alternative[phase];
 
-    node_data.best_supergate[phase] = bg.gate;
+    node_data.best_gate[phase] = bg.gate;
     node_data.phase[phase] = bg.phase;
     node_data.best_cut[phase] = bg.cut;
     node_data.arrival[phase] = bg.arrival;
@@ -3005,7 +3000,7 @@ class emap_impl
 
     node_data.same_match = true;
     phase ^= 1;
-    node_data.best_supergate[phase] = nullptr;
+    node_data.best_gate[phase] = nullptr;
     node_data.phase[phase] = bg.phase;
     node_data.best_cut[phase] = bg.cut;
     node_data.arrival[phase] = bg.arrival + lib_inv_delay;
@@ -3124,16 +3119,16 @@ class emap_impl
     /* if only one is available, the other is obtained using an inverter */
     if ( supergates_zero != nullptr )
     {
-      node_data.best_supergate[0] = &( ( *supergates_zero )[0] );
-      node_data.arrival[0] = node_data.best_supergate[0]->tdelay[0];
-      node_data.area[0] = node_data.best_supergate[0]->area;
+      node_data.best_gate[0] = &( ( *supergates_zero )[0] );
+      node_data.arrival[0] = node_data.best_gate[0]->tdelay[0];
+      node_data.area[0] = node_data.best_gate[0]->area;
       node_data.phase[0] = 0;
     }
     if ( supergates_one != nullptr )
     {
-      node_data.best_supergate[1] = &( ( *supergates_one )[0] );
-      node_data.arrival[1] = node_data.best_supergate[1]->tdelay[0];
-      node_data.area[1] = node_data.best_supergate[1]->area;
+      node_data.best_gate[1] = &( ( *supergates_one )[0] );
+      node_data.arrival[1] = node_data.best_gate[1]->tdelay[0];
+      node_data.area[1] = node_data.best_gate[1]->area;
       node_data.phase[1] = 0;
     }
     else
@@ -3235,7 +3230,7 @@ class emap_impl
         /* compute area flow */
         if ( j == 0 || !node_data.multioutput_match[0] )
         {
-          uint8_t current_phase = node_data.best_supergate[0] == nullptr ? 1 : 0;
+          uint8_t current_phase = node_data.best_gate[0] == nullptr ? 1 : 0;
           old_flow_sum += node_data.flows[current_phase];
         }
         uint8_t old_phase = node_data.phase[phase[j]];
@@ -3289,7 +3284,7 @@ class emap_impl
         uint8_t mapped_phase = phase[j];
         node_data.multioutput_match[mapped_phase] = true;
 
-        node_data.best_supergate[mapped_phase] = &gate;
+        node_data.best_gate[mapped_phase] = &gate;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j];
@@ -3301,7 +3296,7 @@ class emap_impl
         /* select opposite phase */
         mapped_phase ^= 1;
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = nullptr;
+        node_data.best_gate[mapped_phase] = nullptr;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
@@ -3349,13 +3344,13 @@ class emap_impl
     for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
     {
       uint32_t node_index = tuple_data[j].node_index;
-      uint8_t selected_phase = node_match[node_index].best_supergate[0] == nullptr ? 1 : 0;
+      uint8_t selected_phase = node_match[node_index].best_gate[0] == nullptr ? 1 : 0;
 
       if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
         /* match is always single output here */
         auto const& cut = cuts[node_index][node_match[node_index].best_cut[0]];
-        uint8_t use_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t use_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         best_exact_area[j] = cut_deref<SwitchActivity>( cut, ntk.index_to_node( node_index ), use_phase );
 
         /* mapping a non referenced phase */
@@ -3375,7 +3370,7 @@ class emap_impl
 
       if ( node_match[node_index].map_refs[0] || node_match[node_index].map_refs[1] )
       {
-        uint8_t use_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t use_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         auto const& best_cut = cuts[node_index][node_match[node_index].best_cut[use_phase]];
         cut_ref<SwitchActivity>( best_cut, ntk.index_to_node( node_index ), use_phase );
       }
@@ -3514,7 +3509,7 @@ class emap_impl
 
         /* write data */
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = &gate;
+        node_data.best_gate[mapped_phase] = &gate;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j];
@@ -3524,7 +3519,7 @@ class emap_impl
         /* select opposite phase */
         mapped_phase ^= 1;
         node_data.multioutput_match[mapped_phase] = true;
-        node_data.best_supergate[mapped_phase] = nullptr;
+        node_data.best_gate[mapped_phase] = nullptr;
         node_data.best_cut[mapped_phase] = cut_index[j];
         node_data.phase[mapped_phase] = pin_phase[j];
         node_data.arrival[mapped_phase] = arrival[j] + lib_inv_delay;
@@ -3681,7 +3676,7 @@ class emap_impl
 
     if ( node_data.same_match && ( node_data.map_refs[0] || node_data.map_refs[1] ) )
     {
-      uint8_t use_phase = node_data.best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t use_phase = node_data.best_gate[0] != nullptr ? 0 : 1;
       auto const& best_cut = cuts[index][node_data.best_cut[use_phase]];
       cut_deref<SwitchActivity>( best_cut, n, use_phase );
     }
@@ -3873,7 +3868,7 @@ class emap_impl
         }
 
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
-        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -3946,7 +3941,7 @@ class emap_impl
       if ( node_match[leaf].same_match )
       {
         /* Add inverter area if it is used only by the current gate and leaf node is implemented in the opposite phase */
-        if ( --node_match[leaf].map_refs[leaf_phase] == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( --node_match[leaf].map_refs[leaf_phase] == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -4053,7 +4048,7 @@ class emap_impl
         }
 
         /* Add inverter area if not present yet and leaf node is implemented in the opposite phase */
-        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_supergate[leaf_phase] == nullptr )
+        if ( node_match[leaf].map_refs[leaf_phase]++ == 0u && node_match[leaf].best_gate[leaf_phase] == nullptr )
         {
           if constexpr ( SwitchActivity )
             count += switch_activity[leaf];
@@ -4196,7 +4191,7 @@ class emap_impl
       /* add inverter at PI if needed */
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -4223,7 +4218,7 @@ class emap_impl
         }
       }
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       /* add used cut */
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
@@ -4314,7 +4309,7 @@ class emap_impl
       /* add inverter at PI if needed */
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -4341,7 +4336,7 @@ class emap_impl
         }
       }
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       /* add used cut */
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
@@ -4413,7 +4408,7 @@ class emap_impl
   {
     auto const& node_data = node_match[index];
     auto const& best_cut = cuts[index][node_data.best_cut[phase]];
-    auto const& gate = node_data.best_supergate[phase]->root;
+    auto const& gate = node_data.best_gate[phase]->root;
 
     /* permutate and negate to obtain the matched gate truth table */
     std::vector<signal<klut_network>> children( gate->num_vars );
@@ -4423,7 +4418,7 @@ class emap_impl
     {
       if ( ctr >= gate->num_vars )
         break;
-      children[node_data.best_supergate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
+      children[node_data.best_gate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
@@ -4474,7 +4469,7 @@ class emap_impl
   {
     auto const& node_data = node_match[index];
     auto const& best_cut = cuts[index][node_data.best_cut[phase]];
-    auto const& gate = node_data.best_supergate[phase]->root;
+    auto const& gate = node_data.best_gate[phase]->root;
 
     /* permutate and negate to obtain the matched gate truth table */
     std::vector<signal<block_network>> children( gate->num_vars );
@@ -4484,7 +4479,7 @@ class emap_impl
     {
       if ( ctr >= gate->num_vars )
         break;
-      children[node_data.best_supergate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
+      children[node_data.best_gate[phase]->permutation[ctr]] = old2new[l][( node_data.phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
@@ -4534,7 +4529,7 @@ class emap_impl
   void create_block_for_gate( cell_view<block_network>& res, block_map& old2new, uint32_t index, unsigned phase, std::vector<uint32_t> const& genlib_to_cell )
   {
     std::vector<standard_cell> const& lib = res.get_library();
-    composed_gate<NInputs> const* local_gate = node_match[index].best_supergate[phase]->root;
+    composed_gate<NInputs> const* local_gate = node_match[index].best_gate[phase]->root;
     standard_cell const& cell = lib[genlib_to_cell.at( local_gate->root->id )];
 
     assert( !local_gate->is_super );
@@ -4549,7 +4544,7 @@ class emap_impl
     {
       if ( ctr >= local_gate->num_vars )
         break;
-      children[node_match[index].best_supergate[phase]->permutation[ctr]] = old2new[l][( node_match[index].phase[phase] >> ctr ) & 1];
+      children[node_match[index].best_gate[phase]->permutation[ctr]] = old2new[l][( node_match[index].phase[phase] >> ctr ) & 1];
       ++ctr;
     }
 
@@ -4565,10 +4560,10 @@ class emap_impl
       {
         uint32_t node_index = tuple_data[j].node_index;
         assert( node_match[node_index].same_match );
-        uint8_t node_phase = node_match[node_index].best_supergate[0] != nullptr ? 0 : 1;
+        uint8_t node_phase = node_match[node_index].best_gate[0] != nullptr ? 0 : 1;
         assert( node_match[node_index].multioutput_match[node_phase] );
 
-        gate const* node_gate = node_match[node_index].best_supergate[node_phase]->root->root;
+        gate const* node_gate = node_match[node_index].best_gate[node_phase]->root->root;
 
         /* wrong output */
         if ( node_gate->id != g.id )
@@ -4588,7 +4583,7 @@ class emap_impl
     for ( uint32_t s : outputs )
     {
       /* add inverted version if used */
-      uint8_t node_phase = node_match[s].best_supergate[0] != nullptr ? 0 : 1;
+      uint8_t node_phase = node_match[s].best_gate[0] != nullptr ? 0 : 1;
       assert( node_match[s].same_match );
 
       /* add the node in the data structure */
@@ -5000,7 +4995,7 @@ class emap_impl
 
       if ( ntk.is_constant( n ) )
       {
-        if ( node_data.best_supergate[0] == nullptr && node_data.best_supergate[1] == nullptr )
+        if ( node_data.best_gate[0] == nullptr && node_data.best_gate[1] == nullptr )
           continue;
       }
       else if ( ntk.is_pi( n ) )
@@ -5014,7 +5009,7 @@ class emap_impl
       if ( !node_data.map_refs[0] && !node_data.map_refs[1] )
         continue;
 
-      unsigned phase = ( node_data.best_supergate[0] != nullptr ) ? 0 : 1;
+      unsigned phase = ( node_data.best_gate[0] != nullptr ) ? 0 : 1;
 
       if ( node_data.same_match || node_data.map_refs[phase] > 0 )
       {

From 406a5decf5a1aecddfd2ca49aeacc285ea7fc564 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Fri, 3 May 2024 19:05:42 +0200
Subject: [PATCH 22/27] Fixing multioutput cut insertion on cutset

---
 include/mockturtle/algorithms/emap.hpp | 266 ++++++++++++-------------
 1 file changed, 126 insertions(+), 140 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index b9bf5875c..f6454ec2b 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -717,20 +717,31 @@ struct node_match_emap
   float flows[2];
 };
 
-union multi_match_data
-{
-  uint64_t data{ 0 };
-  struct
-  {
-    uint64_t in_tfi : 1;
-    uint64_t cut_index : 31;
-    uint64_t node_index : 32;
-  };
-};
-
 template<class Ntk, unsigned CutSize, unsigned NInputs, classification_type Configuration>
 class emap_impl
 {
+private:
+  union multi_match_data
+  {
+    uint64_t data{ 0 };
+    struct
+    {
+      uint64_t in_tfi : 1;
+      uint64_t cut_index : 31;
+      uint64_t node_index : 32;
+    };
+  };
+  union multioutput_info
+  {
+    uint32_t data;
+    struct
+    {
+      unsigned index : 29;
+      unsigned lowest_index : 1;
+      unsigned highest_index : 1;
+      unsigned has_info : 1;
+    };
+  };
 public:
   static constexpr float epsilon = 0.0005;
   static constexpr uint32_t max_cut_num = 20;
@@ -767,10 +778,11 @@ class emap_impl
         ps( ps ),
         st( st ),
         node_match( ntk.size() ),
-        node_tuple_match( ntk.size(), UINT32_MAX ),
+        node_tuple_match( ntk.size() ),
         switch_activity( ps.eswp_rounds ? switching_activity( ntk, ps.switching_activity_patterns ) : std::vector<float>( 0 ) ),
         cuts( ntk.size() )
   {
+    std::memset( node_tuple_match.data(), 0, sizeof( multioutput_info ) * ntk.size() );
     std::tie( lib_inv_area, lib_inv_delay, lib_inv_id ) = library.get_inverter_info();
     std::tie( lib_buf_area, lib_buf_delay, lib_buf_id ) = library.get_buffer_info();
     tmp_visited.reserve( 100 );
@@ -782,10 +794,11 @@ class emap_impl
         ps( ps ),
         st( st ),
         node_match( ntk.size() ),
-        node_tuple_match( ntk.size(), UINT32_MAX ),
+        node_tuple_match( ntk.size() ),
         switch_activity( switch_activity ),
         cuts( ntk.size() )
   {
+    std::memset( node_tuple_match.data(), 0, sizeof( multioutput_info ) * ntk.size() );
     std::tie( lib_inv_area, lib_inv_delay, lib_inv_id ) = library.get_inverter_info();
     std::tie( lib_buf_area, lib_buf_delay, lib_buf_id ) = library.get_buffer_info();
     tmp_visited.reserve( 100 );
@@ -968,7 +981,6 @@ class emap_impl
     /* compute mapping using exact area */
     i = 0;
     compute_required_time( true );
-    reindex_multioutput_data();
     while ( i++ < ps.ela_rounds )
     {
       if ( !compute_mapping_exact_reversed<false>() )
@@ -1005,6 +1017,12 @@ class emap_impl
         continue;
       }
 
+      /* load multi-output cuts and data */
+      if ( ps.map_multioutput && node_tuple_match[index].has_info )
+      {
+        match_multi_add_cuts( n );
+      }
+
       /* match positive phase */
       match_phase<DO_AREA>( n, 0u );
 
@@ -1017,17 +1035,12 @@ class emap_impl
       /* select alternative matches to use */
       select_alternatives<DO_AREA>( n );
 
-      /* load and try a multi-output matches */
-      if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
+      /* try multi-output matches */
+      if constexpr ( DO_AREA )
       {
-        /* continue if matches do not fit in the cut data structure due to bad settings */
-        if ( !match_multi_add_cuts<DO_AREA>( n ) )
-          continue;
-
-        if constexpr ( DO_AREA )
+        if ( ps.map_multioutput && node_tuple_match[index].highest_index )
         {
-          bool multi_success = match_multioutput<DO_AREA>( n );
-          if ( multi_success )
+          if ( match_multioutput<DO_AREA>( n ) )
             multi_node_update<DO_AREA>( n );
         }
       }
@@ -1198,7 +1211,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1285,7 +1298,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1308,7 +1321,7 @@ class emap_impl
         compute_truth_table( index, vcuts, fanin, new_cut );
 
         /* match cut and compute data */
-        compute_cut_data<DO_AREA>( new_cut, n );
+        compute_cut_data( new_cut, n );
 
         if ( ps.remove_dominated_cuts )
           rcuts.insert( new_cut, false, sort );
@@ -1569,7 +1582,7 @@ class emap_impl
     new_cut->function = kitty::extend_to<6>( ntk.node_function( n ) );
 
     /* match cut and compute data */
-    compute_cut_data<DO_AREA>( new_cut, n );
+    compute_cut_data( new_cut, n );
 
     ++cuts_total;
   }
@@ -1618,7 +1631,7 @@ class emap_impl
       /* try a multi-output match */
       if constexpr ( DO_AREA )
       {
-        if ( ps.map_multioutput && node_tuple_match[index] != UINT32_MAX )
+        if ( ps.map_multioutput && node_tuple_match[index].highest_index )
         {
           bool multi_success = match_multioutput<DO_AREA>( n );
           if ( multi_success )
@@ -1724,8 +1737,8 @@ class emap_impl
       /* try to drop one phase */
       match_drop_phase<true, true, SwitchActivity>( *it );
 
-      /* try a multi-output match */
-      if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 )
+      /* try a multi-output match */ /* TODO: fix the required time*/
+      if ( ps.map_multioutput && node_tuple_match[index].lowest_index )
       {
         bool mapped = match_multioutput_exact<SwitchActivity>( *it, true );
 
@@ -3008,24 +3021,6 @@ class emap_impl
     node_data.flows[phase] = ( bg.flow * node_data.est_refs[phase ^ 1] + lib_inv_area ) / node_data.est_refs[phase];
   }
 
-  void reindex_multioutput_data()
-  {
-    /* re-index the multioutput list using the lowest index output instead of the greatest one */
-    if ( !ps.map_multioutput )
-      return;
-
-    for ( auto i = ntk.num_pis(); i < topo_order.size(); ++i )
-    {
-      uint32_t tuple_index = node_tuple_match[i];
-      if ( tuple_index >= UINT32_MAX - 1 )
-        continue;
-
-      multi_match_t const& tuple_data = multi_node_match[tuple_index][0];
-      node_tuple_match[i] = UINT32_MAX - 1; /* arbitrary value to skip the required time propagation */
-      node_tuple_match[tuple_data[0].node_index] = tuple_index;
-    }
-  }
-
   bool initialize_box( node<Ntk> const& n )
   {
     uint32_t index = ntk.node_to_index( n );
@@ -3152,7 +3147,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     /* get the cut */
     auto const& cut0 = cuts[tuple_data[0].node_index][tuple_data[0].cut_index];
@@ -3315,7 +3310,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     /* local values storage */
     std::array<float, max_multioutput_output_size> best_exact_area;
@@ -3537,7 +3532,7 @@ class emap_impl
   void multi_node_update( node<Ntk> const& n )
   {
     uint32_t check_index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )].index][0];
     uint64_t signature = 0;
 
     /* check if a node is in TFI: there is a path of length > 1 */
@@ -3617,7 +3612,7 @@ class emap_impl
   void multi_node_update_exact( node<Ntk> const& n )
   {
     uint32_t check_index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[ntk.node_to_index( n )].index][0];
     uint64_t signature = 0;
 
     /* check if a node is in TFI: there is a path of length > 1 */
@@ -3698,7 +3693,7 @@ class emap_impl
   {
     /* extract outputs tuple */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
 
     for ( int j = max_multioutput_output_size - 1; j >= 0; --j )
     {
@@ -3707,78 +3702,74 @@ class emap_impl
     }
   }
 
-  template<bool DO_AREA>
+  /* Adds the multi-output cut of node `n` to its cut set, together with the
+   * single-output variant of the same cut when that is not already present.
+   * Returns false, after clearing the multi-output info of the whole tuple,
+   * if the cut set has no room left; returns true on success. */
   bool match_multi_add_cuts( node<Ntk> const& n )
   {
+    /* assume a single cut (current version) */
     uint32_t index = ntk.node_to_index( n );
-    auto& matches = multi_node_match[node_tuple_match[index]];
+    multi_match_t& matches = multi_node_match[node_tuple_match[index].index][0];
+
+    /* find the corresponding cut; the search is bounded so that a missing
+     * entry trips the assertion below instead of reading past the tuple */
+    uint32_t cut_p = 0;
+    while ( cut_p < matches.size() && matches[cut_p].node_index != index )
+      ++cut_p;
+
+    assert( cut_p < matches.size() );
+    uint32_t cut_index = matches[cut_p].cut_index;
+    auto& cut = multi_cut_set[cut_index][cut_p];
+    auto single_cut = multi_cut_set[cut_index][cut_p];
+    auto& rcuts = cuts[index];
+
+    /* not enough space in the data structure: abort */
+    if ( rcuts.size() == max_cut_num )
+    {
+      match_multi_add_cuts_remove_entry( matches );
+      return false;
+    }
 
-    /* get the cuts */
-    auto tuple_data_it = matches.begin();
-    while ( tuple_data_it != matches.end() )
+    /* insert single cut variation if unique (for delay preservation) */
+    if ( !rcuts.is_contained( single_cut ) )
     {
-      multi_match_t& tuple_data = *tuple_data_it;
-      uint32_t cut_index = tuple_data[0].cut_index;
-      auto& cut_pair = multi_cut_set[cut_index];
-      bool remove_entry = false;
+      single_cut->pattern_index = 0;
+      compute_cut_data( single_cut, ntk.index_to_node( index ) );
+      rcuts.append_cut( single_cut );
 
-      /* insert multi-output cuts into the standard cut set */
-      for ( auto i = 0; i < max_multioutput_output_size; ++i )
+      /* not enough space in the data structure: abort */
+      if ( rcuts.size() == max_cut_num )
       {
-        uint64_t node_index = tuple_data[i].node_index;
-        auto& cut = cut_pair[i];
-        auto single_cut = cut_pair[i];
-
-        auto& rcuts = cuts[node_index];
-
-        /* not enough space in the data structure: abort */
-        if ( rcuts.size() == max_cut_num )
-        {
-          remove_entry = true;
-          break;
-        }
-
-        /* insert single cut variation if unique (for delay preservation) */
-        if ( !rcuts.is_contained( single_cut ) )
-        {
-          compute_cut_data<DO_AREA>( single_cut, ntk.index_to_node( node_index ) );
-          rcuts.append_cut( single_cut );
-
-          /* not enough space in the data structure: abort */
-          if ( rcuts.size() == max_cut_num )
-          {
-            rcuts.limit( rcuts.size() - 1 );
-            remove_entry = true;
-            break;
-          }
-        }
+        rcuts.limit( rcuts.size() - 1 );
+        match_multi_add_cuts_remove_entry( matches );
+        return false;
+      }
+    }
 
-        /* add multi-output cut */
-        uint32_t num_cuts_pre = rcuts.size();
-        cut->ignore = true;
-        rcuts.append_cut( cut );
+    /* add multi-output cut */
+    uint32_t num_cuts_pre = rcuts.size();
+    cut->ignore = true;
+    rcuts.append_cut( cut );
 
-        uint32_t num_cuts_after = rcuts.size();
-        assert( num_cuts_after == num_cuts_pre + 1 );
+    uint32_t num_cuts_after = rcuts.size();
+    assert( num_cuts_after == num_cuts_pre + 1 );
 
-        rcuts.limit( num_cuts_pre );
+    rcuts.limit( num_cuts_pre );
 
-        /* update tuple data */
-        tuple_data[i].cut_index = num_cuts_pre;
-      }
+    /* update tuple data */
+    matches[cut_p].cut_index = num_cuts_pre;
+    return true;
+  }
 
-      if ( remove_entry )
-        matches.erase( tuple_data_it );
-      else
-        ++tuple_data_it;
+  /* Clears the multi-output info of every node listed in the tuple `matches` */
+  inline void match_multi_add_cuts_remove_entry( multi_match_t const& matches )
+  {
+    /* reset matches */
+    for ( multi_match_data const& entry : matches )
+    {
+      node_tuple_match[entry.node_index].data = 0;
     }
-
-    /* matches do not fit in the data structure, remove multi-output option */
-    if ( matches.empty() )
-      node_tuple_match[index] = UINT32_MAX;
-
-    /* return if the insertion is (partially) successful */
-    return !matches.empty();
   }
 
   inline bool multi_node_update_cut_check( uint32_t index, uint64_t signature, uint8_t phase )
@@ -4233,7 +4217,7 @@ class emap_impl
         }
 
         /* count multioutput gates */
-        if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 && node_data.multioutput_match[phase] )
+        if ( ps.map_multioutput && node_tuple_match[index].lowest_index && node_data.multioutput_match[phase] )
         {
           ++multioutput_count;
         }
@@ -4246,7 +4230,7 @@ class emap_impl
         create_lut_for_gate( res, old2new, index, phase );
 
         /* count multioutput gates */
-        if ( ps.map_multioutput && node_tuple_match[index] < UINT32_MAX - 1 && node_data.multioutput_match[phase] )
+        if ( ps.map_multioutput && node_tuple_match[index].lowest_index && node_data.multioutput_match[phase] )
         {
           ++multioutput_count;
         }
@@ -4346,11 +4330,10 @@ class emap_impl
         {
           assert( node_data.same_match == true );
 
-          if ( node_tuple_match[index] < UINT32_MAX - 1 )
+          if ( node_tuple_match[index].has_info && node_tuple_match[index].lowest_index )
           {
             ++multioutput_count;
             create_block_for_gate( res, old2new, index, phase, genlib_to_cell );
-            /* TODO: implement */
           }
           continue;
         }
@@ -4548,7 +4531,7 @@ class emap_impl
       ++ctr;
     }
 
-    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index]][0];
+    multi_match_t const& tuple_data = multi_node_match[node_tuple_match[index].index][0];
     std::vector<uint32_t> outputs;
     std::vector<kitty::dynamic_truth_table> functions;
 
@@ -4700,7 +4683,6 @@ class emap_impl
 #pragma endregion
 
 #pragma region Cuts and matching utils
-  template<bool DO_AREA>
   void compute_cut_data( cut_t& cut, node<Ntk> const& n )
   {
     cut->delay = std::numeric_limits<float>::max();
@@ -5240,7 +5222,12 @@ class emap_impl
       if constexpr ( OverlapFilter )
       {
         multi_gate_mark_visited( index1, index2, cut1 );
-        node_tuple_match[index2] = multi_node_match.size();
+        node_tuple_match[index1].has_info = 1;
+        node_tuple_match[index1].lowest_index = 1;
+        node_tuple_match[index1].index = multi_node_match.size();
+        node_tuple_match[index2].has_info = 1;
+        node_tuple_match[index2].highest_index = 1;
+        node_tuple_match[index2].index = multi_node_match.size();
       }
       else
       {
@@ -5269,16 +5256,6 @@ class emap_impl
         multi_node_match[insertion_index].push_back( p );
       }
     }
-
-    /* remove indexing for lower index for compatible overlapping cuts */
-    if constexpr ( !OverlapFilter )
-    {
-      for ( auto const& entry : multi_node_match )
-      {
-        multi_match_t const& p = entry[0];
-        node_tuple_match[p[0].node_index] = UINT32_MAX;
-      }
-    }
   }
 
   bool multi_compute_cut_data( std::array<cut_t, max_multioutput_output_size>& cut_tuple )
@@ -5385,24 +5362,30 @@ class emap_impl
   inline bool multi_gate_check_incompatible( uint32_t index1, uint32_t index2, bool& is_new, uint32_t& data_index )
   {
     /* check cut assigned cut outputs, specialized code for 2 outputs */
-    uint32_t current_assignment = node_tuple_match[index1];
-    if ( current_assignment != node_tuple_match[index2] )
-      return true;
+    if ( !node_tuple_match[index1].has_info && !node_tuple_match[index2].has_info )
+      return false;
 
-    /* load data */
-    if ( current_assignment != UINT32_MAX )
+    if ( node_tuple_match[index1].has_info && node_tuple_match[index2].has_info )
     {
+      uint32_t current_assignment = node_tuple_match[index1].index;
+      if ( current_assignment != node_tuple_match[index2].index )
+        return true;
       is_new = false;
       data_index = current_assignment;
+      return false;
     }
 
-    return false;
+    return true;
   }
 
   inline void multi_gate_mark_compatibility( uint32_t index1, uint32_t index2, uint32_t mark_value )
   {
-    node_tuple_match[index1] = mark_value;
-    node_tuple_match[index2] = mark_value;
+    node_tuple_match[index1].has_info = 1;
+    node_tuple_match[index1].lowest_index = 1;
+    node_tuple_match[index1].index = mark_value;
+    node_tuple_match[index2].has_info = 1;
+    node_tuple_match[index2].highest_index = 1;
+    node_tuple_match[index2].index = mark_value;
   }
 
   inline void multi_gate_mark_visited( uint32_t index1, uint32_t index2, multi_cut_t const& cut )
@@ -5579,8 +5562,11 @@ class emap_impl
         ntk.set_visited( g, ntk.trav_id() - 2 );
         if ( i > 0 && n == repr )
         {
-          /* fix cycle: remove multi-output match; TODO: extend for more than 2 outputs */
-          node_tuple_match[ntk.node_to_index( g )] = UINT32_MAX;
+          /* fix cycle: remove multi-output match */
+          choice_ntk.foreach_choice( repr, [&]( auto const& p ) {
+            node_tuple_match[ntk.node_to_index( p )].data = 0;
+            return true;
+          } );
           choice_ntk.remove_choice( g );
           check = true;
         }
@@ -5712,7 +5698,7 @@ class emap_impl
 
   std::vector<node<Ntk>> topo_order;
   node_match_t node_match;
-  std::vector<uint32_t> node_tuple_match;
+  std::vector<multioutput_info> node_tuple_match;
   std::vector<float> switch_activity;
   std::vector<uint64_t> tmp_visited;
 

From f222053606d2e984920264b4e1a999956736ab40 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 6 May 2024 10:35:32 +0200
Subject: [PATCH 23/27] Adding option for removing symmetrical permutations of
 gates for faster mapping

---
 include/mockturtle/utils/tech_library.hpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/mockturtle/utils/tech_library.hpp b/include/mockturtle/utils/tech_library.hpp
index a671e7a25..cb3c83e33 100644
--- a/include/mockturtle/utils/tech_library.hpp
+++ b/include/mockturtle/utils/tech_library.hpp
@@ -109,6 +109,9 @@ struct tech_library_params
   /*! \brief Loads multioutput gates in the library */
   bool load_multioutput_gates{ true };
 
+  /*! \brief Don't load symmetrical permutations of gate pins (drastically speeds up mapping) */
+  bool ignore_symmetries{ false };
+
   /*! \brief Load gates with minimum size only */
   bool load_minimum_size_only{ true };
 
@@ -473,7 +476,7 @@ class tech_library
             if ( sg.root->id == it->root->id )
             {
               /* if already in the library exit, else ignore permutations if with equal delay cost */
-              if ( sg.polarity == it->polarity && sg.tdelay == it->tdelay )
+              if ( sg.polarity == it->polarity && ( _ps.ignore_symmetries || sg.tdelay == it->tdelay ) )
               {
                 to_add = false;
                 break;
@@ -534,7 +537,7 @@ class tech_library
               if ( sg.root->id == it->root->id )
               {
                 /* if already in the library exit, else ignore permutations if with equal delay cost */
-                if ( sg.polarity == it->polarity && sg.tdelay == it->tdelay )
+                if ( sg.polarity == it->polarity && ( _ps.ignore_symmetries || sg.tdelay == it->tdelay ) )
                 {
                   to_add = false;
                   break;

From fe15cd668c52dd3309ac26d1661b487052c6e17c Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 6 May 2024 10:41:37 +0200
Subject: [PATCH 24/27] Updating experiment emap

---
 experiments/emap.cpp                   | 1 +
 include/mockturtle/algorithms/emap.hpp | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/experiments/emap.cpp b/experiments/emap.cpp
index 11d465f8b..55ced8457 100644
--- a/experiments/emap.cpp
+++ b/experiments/emap.cpp
@@ -65,6 +65,7 @@ int main()
   }
 
   tech_library_params tps;
+  tps.ignore_symmetries = false; // set to true to drastically speed up mapping with a minor delay increase
   tps.verbose = true;
   tech_library<9> tech_lib( gates, tps );
 
diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index f6454ec2b..30173aae2 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -742,6 +742,7 @@ class emap_impl
       unsigned has_info : 1;
     };
   };
+
 public:
   static constexpr float epsilon = 0.0005;
   static constexpr uint32_t max_cut_num = 20;
@@ -3710,9 +3711,9 @@ class emap_impl
 
     /* find the corresponding cut */
     uint32_t cut_p = 0;
-    while( matches[cut_p].node_index != index )
+    while ( matches[cut_p].node_index != index )
       ++cut_p;
-    
+
     assert( cut_p < matches.size() );
     uint32_t cut_index = matches[cut_p].cut_index;
     auto& cut = multi_cut_set[cut_index][cut_p];

From bc32f0de62d8e58c87debb13d7625d17335f3484 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 6 May 2024 12:23:30 +0200
Subject: [PATCH 25/27] Bug fix delay for structural matches

---
 include/mockturtle/utils/struct_library.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/mockturtle/utils/struct_library.hpp b/include/mockturtle/utils/struct_library.hpp
index 7787b906b..c239ca946 100644
--- a/include/mockturtle/utils/struct_library.hpp
+++ b/include/mockturtle/utils/struct_library.hpp
@@ -371,6 +371,12 @@ class struct_library
                                   perm,
                                   gate_pol };
 
+        /* permute pin-to-pin delays */
+        for ( uint32_t i = 0; i < gate.num_vars; ++i )
+        {
+          sg.tdelay[i] = gate.tdelay[perm[i]];
+        }
+
         auto& v = _label_to_gate[index_rule.data];
 
         auto it = std::lower_bound( v.begin(), v.end(), sg, [&]( auto const& s1, auto const& s2 ) {

From 8a10355505806c693dd0e996e5d54da3fcd099c4 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 6 May 2024 19:41:36 +0200
Subject: [PATCH 26/27] Fixes and data structure changes

---
 include/mockturtle/algorithms/emap.hpp | 42 +++++++++++---------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 17d34df26..1028d8e70 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -721,26 +721,18 @@ template<class Ntk, unsigned CutSize, unsigned NInputs, classification_type Conf
 class emap_impl
 {
 private:
-  union multi_match_data
+  struct multi_match_data
   {
-    uint64_t data{ 0 };
-    struct
-    {
-      uint64_t in_tfi : 1;
-      uint64_t cut_index : 31;
-      uint64_t node_index : 32;
-    };
+    uint32_t node_index;
+    uint32_t cut_index;
+    bool in_tfi;
   };
-  union multioutput_info
+  struct multioutput_info
   {
-    uint32_t data;
-    struct
-    {
-      unsigned index : 29;
-      unsigned lowest_index : 1;
-      unsigned highest_index : 1;
-      unsigned has_info : 1;
-    };
+    uint32_t index : 29;
+    uint32_t lowest_index : 1;
+    uint32_t highest_index : 1;
+    uint32_t has_info : 1;
   };
 
 public:
@@ -3703,11 +3695,11 @@ class emap_impl
     }
   }
 
-  bool match_multi_add_cuts( node<Ntk> const& n )
+  void match_multi_add_cuts( node<Ntk> const& n )
   {
     /* assume a single cut (current version) */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t& matches = multi_node_match[node_tuple_match[index].index][0];
+    multi_match_t& matches = multi_node_match[node_tuple_match[index].index].at( 0 );
 
     /* find the corresponding cut */
     uint32_t cut_p = 0;
@@ -3724,7 +3716,7 @@ class emap_impl
     if ( rcuts.size() == max_cut_num )
     {
       match_multi_add_cuts_remove_entry( matches );
-      return false;
+      return;
     }
 
     /* insert single cut variation if unique (for delay preservation) */
@@ -3739,7 +3731,7 @@ class emap_impl
       {
         rcuts.limit( rcuts.size() - 1 );
         match_multi_add_cuts_remove_entry( matches );
-        return false;
+        return;
       }
     }
 
@@ -3762,7 +3754,7 @@ class emap_impl
     /* reset matches */
     for ( multi_match_data const& entry : matches )
     {
-      node_tuple_match[entry.node_index].data = 0;
+      node_tuple_match[entry.node_index] = { 0 };
     }
   }
 
@@ -5243,8 +5235,10 @@ class emap_impl
       multi_match_data new_data1, new_data2;
       new_data1.node_index = index1;
       new_data1.cut_index = multi_cut_set.size() - 1;
+      new_data1.in_tfi = false;
       new_data2.node_index = index2;
       new_data2.cut_index = multi_cut_set.size() - 1;
+      new_data2.in_tfi = false;
       multi_match_t p = { new_data1, new_data2 };
 
       /* add cuts to the correct bucket */
@@ -5497,7 +5491,7 @@ class emap_impl
       if ( multi_is_in_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ), cut ) )
       {
         /* if there is a path of length > 1 linking node 1 and 2, save as TFI node */
-        uint32_t in_tfi = multi_is_in_direct_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ) ) ? 0 : 1;
+        bool in_tfi = multi_is_in_direct_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ) );
         for ( auto& match : field )
           match[0].in_tfi = in_tfi;
         /* add a TFI dependency */
@@ -5565,7 +5559,7 @@ class emap_impl
         {
           /* fix cycle: remove multi-output match */
           choice_ntk.foreach_choice( repr, [&]( auto const& p ) {
-            node_tuple_match[ntk.node_to_index( p )].data = 0;
+            node_tuple_match[ntk.node_to_index( p )] = { 0 };
             return true;
           } );
           choice_ntk.remove_choice( g );

From c13cd02917c0805dac157fee123a8ce50966e079 Mon Sep 17 00:00:00 2001
From: aletempiac <alessandro.tempia@gmail.com>
Date: Mon, 6 May 2024 19:51:57 +0200
Subject: [PATCH 27/27] Revert data structure changes from the previous commit

---
 include/mockturtle/algorithms/emap.hpp | 36 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/include/mockturtle/algorithms/emap.hpp b/include/mockturtle/algorithms/emap.hpp
index 1028d8e70..2ca21c5b6 100644
--- a/include/mockturtle/algorithms/emap.hpp
+++ b/include/mockturtle/algorithms/emap.hpp
@@ -721,18 +721,26 @@ template<class Ntk, unsigned CutSize, unsigned NInputs, classification_type Conf
 class emap_impl
 {
 private:
-  struct multi_match_data
+  union multi_match_data
   {
-    uint32_t node_index;
-    uint32_t cut_index;
-    bool in_tfi;
+    uint64_t data{ 0 };
+    struct
+    {
+      uint64_t in_tfi : 1;
+      uint64_t cut_index : 31;
+      uint64_t node_index : 32;
+    };
   };
-  struct multioutput_info
+  union multioutput_info
   {
-    uint32_t index : 29;
-    uint32_t lowest_index : 1;
-    uint32_t highest_index : 1;
-    uint32_t has_info : 1;
+    uint32_t data;
+    struct
+    {
+      unsigned index : 29;
+      unsigned lowest_index : 1;
+      unsigned highest_index : 1;
+      unsigned has_info : 1;
+    };
   };
 
 public:
@@ -3699,7 +3707,7 @@ class emap_impl
   {
     /* assume a single cut (current version) */
     uint32_t index = ntk.node_to_index( n );
-    multi_match_t& matches = multi_node_match[node_tuple_match[index].index].at( 0 );
+    multi_match_t& matches = multi_node_match[node_tuple_match[index].index][0];
 
     /* find the corresponding cut */
     uint32_t cut_p = 0;
@@ -3754,7 +3762,7 @@ class emap_impl
     /* reset matches */
     for ( multi_match_data const& entry : matches )
     {
-      node_tuple_match[entry.node_index] = { 0 };
+      node_tuple_match[entry.node_index].data = 0;
     }
   }
 
@@ -5235,10 +5243,8 @@ class emap_impl
       multi_match_data new_data1, new_data2;
       new_data1.node_index = index1;
       new_data1.cut_index = multi_cut_set.size() - 1;
-      new_data1.in_tfi = false;
       new_data2.node_index = index2;
       new_data2.cut_index = multi_cut_set.size() - 1;
-      new_data2.in_tfi = false;
       multi_match_t p = { new_data1, new_data2 };
 
       /* add cuts to the correct bucket */
@@ -5491,7 +5497,7 @@ class emap_impl
       if ( multi_is_in_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ), cut ) )
       {
         /* if there is a path of length > 1 linking node 1 and 2, save as TFI node */
-        bool in_tfi = multi_is_in_direct_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ) );
+        uint32_t in_tfi = multi_is_in_direct_tfi( ntk.index_to_node( index2 ), ntk.index_to_node( index1 ) ) ? 0 : 1;
         for ( auto& match : field )
           match[0].in_tfi = in_tfi;
         /* add a TFI dependency */
@@ -5559,7 +5565,7 @@ class emap_impl
         {
           /* fix cycle: remove multi-output match */
           choice_ntk.foreach_choice( repr, [&]( auto const& p ) {
-            node_tuple_match[ntk.node_to_index( p )] = { 0 };
+            node_tuple_match[ntk.node_to_index( p )].data = 0;
             return true;
           } );
           choice_ntk.remove_choice( g );