From ab830a5c0ccd159bc5c5b5418bdf1829b7a30c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 19 Sep 2024 15:37:43 +0100 Subject: [PATCH 1/5] Introduce SIMD-dependent pipelining into the result reduction. --- finn-rtllib/mvu/mvu_4sx4u.sv | 70 +++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 1f6e97281e..f8a9258408 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -123,12 +123,14 @@ module mvu_4sx4u #( endfunction : init_leave_loads // Pipeline for last indicator flag - logic [1:5] L = '0; + // Depth: 3 cycles for DSP + external SIMD reduction + localparam int unsigned PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1); + logic [1:PIPELINE_DEPTH] L = '0; always_ff @(posedge clk) begin if(rst) L <= '0; - else if(en) L <= { last, L[1:4] }; + else if(en) L <= { last, L[1:PIPELINE_DEPTH-1] }; end - assign vld = L[5]; + assign vld = L[PIPELINE_DEPTH]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned PIPE_COUNT = (PE+3)/4; @@ -504,10 +506,13 @@ module mvu_4sx4u #( end : genSIMD - // Stage #4: Cross-SIMD Reduction + // Stage #4: Potentially Multiple Cycles of Cross-SIMD Reduction + // - binary reduction trees with SIMD leave nodes for both the core lane outputs and the spill accumulation + // - balanced tree construction with all fully occupied levels pipelined // Count leaves reachable from each node localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam int unsigned HI_NODE_REGISTERED = 2**($clog2(SIMD+1)-1)-2; uwire signed [ACCU_WIDTH-1:0] up4; uwire signed [ HI_WIDTH_MAX-1:0] hi4[3]; @@ -525,8 +530,17 @@ module mvu_4sx4u #( for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); - assign tree[n] = s; + typedef logic signed [$clog2(1+LEAVE_LOAD[n]):0] sum_t; + uwire sum_t s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + if((0 < n) && (n <= HI_NODE_REGISTERED)) begin + sum_t S = 'x; + always_ff @(posedge clk) begin + if(rst) S <= 'x; + else if(en) S <= s; // advance only with en, in step with L + end + assign tree[n] = S; + end + else assign tree[n] = s; end // High Sideband Accumulation @@ -534,7 +548,7 @@ module mvu_4sx4u #( always_ff @(posedge clk) begin if(rst) Hi4 <= 0; else if(en) begin - automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); + automatic logic signed [HI_WIDTH:0] h = $signed(L[PIPELINE_DEPTH-1]? 
0 : Hi4) + $signed(tree[0]); assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); $stop; @@ -555,22 +569,36 @@ module mvu_4sx4u #( // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - logic [ROOT_WIDTH-1:0] Lo4 = 0; - always_ff @(posedge clk) begin - if(rst) Lo4 <= 0; - else if(en) Lo4 <= tree[0]; - end + if(SIMD == 1) begin : genReg + // Just slide in a balancing register + logic [ROOT_WIDTH-1:0] R = 'x; + always_ff @(posedge clk) begin + if(rst) R <= 'x; + else if(en) R <= p3[0][OFFSETS[i]+:LO_WIDTH]; // advance only with en, in step with L + end + assign tree[0] = R; + end : genReg + else begin : genTree + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + if(n <= HI_NODE_REGISTERED) begin + logic [NODE_WIDTH-1:0] S = 'x; + always_ff @(posedge clk) begin + if(rst) S <= 'x; + else if(en) S <= s; // advance only with en, in step with L + end + assign tree[n] = S; + end + else assign tree[n] = s; + end + end : genTree - if(i == 3) assign up4 = Lo4; - else assign lo4[i] = Lo4; + if(i == 3) assign up4 = tree[0]; + else assign lo4[i] = tree[0]; end : genLo end From a7f29e0552780cb8f48f3caddd248adb8b999e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 
30 Sep 2024 07:59:09 +0100 Subject: [PATCH 2/5] 4-BIT DSP-MVU: Allow very wide
lanes requiring even a high-lane extension. --- finn-rtllib/mvu/mvu_4sx4u.sv | 59 +- finn-rtllib/mvu/tb/mvu_4x4_accu_tb.dat | 2304 +++++++++++++++++ finn-rtllib/mvu/tb/mvu_4x4_accu_tb.sv | 162 ++ .../{mvu_accu_tb.dat => mvu_8x8_accu_tb.dat} | 0 .../tb/{mvu_accu_tb.sv => mvu_8x8_accu_tb.sv} | 6 +- 5 files changed, 2508 insertions(+), 23 deletions(-) create mode 100644 finn-rtllib/mvu/tb/mvu_4x4_accu_tb.dat create mode 100644 finn-rtllib/mvu/tb/mvu_4x4_accu_tb.sv rename finn-rtllib/mvu/tb/{mvu_accu_tb.dat => mvu_8x8_accu_tb.dat} (100%) rename finn-rtllib/mvu/tb/{mvu_accu_tb.sv => mvu_8x8_accu_tb.sv} (98%) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 1f6e97281e..1f9ed00d22 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -80,35 +80,47 @@ module mvu_4sx4u #( * - Internal lane widths differ, at most, by a single bit. * - The rightmost lane (#0) has the maximum internal width. * - The leftmost lane (#3) extends into the wide DSP accumulation path and - * is constrained by ACCU_WIDTH rather than the next lane. It doesn't have - * an external high extension. + * is typically constrained by ACCU_WIDTH rather than the next lane. If so, + * it doesn't have an external high extension. * - The one but leftmost lane (#2) has the minimum internal width and, hence, * the macimum external high extension. */ typedef int unsigned lane_offset_v[4:0]; function lane_offset_v sliceLanes(); + automatic lane_offset_v res; unique case(VERSION) 1: begin - return NARROW_WEIGHTS? + res = NARROW_WEIGHTS? lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } : lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported end 2: begin - return NARROW_WEIGHTS? + res = NARROW_WEIGHTS? 
lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 }; end endcase + if(res[4] > 48) res[4] = 48; + return res; endfunction : sliceLanes localparam lane_offset_v OFFSETS = sliceLanes(); + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width function int unsigned lo_width(input int unsigned i); return OFFSETS[i+1] - OFFSETS[i]; endfunction : lo_width function int unsigned hi_width(input int unsigned i); - return 1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD); + automatic int unsigned lw = lo_width(i); + return ACCU_WIDTH <= lw? + 0 : + 1 + ($clog2(SIMD) < ACCU_WIDTH-lw? + ACCU_WIDTH-lw : + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD) + ); endfunction : hi_width - localparam int unsigned LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0]; + localparam int unsigned LO_WIDTH_MAX = lo_width(3); localparam int unsigned HI_WIDTH_MAX = hi_width(2); localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath @@ -139,7 +151,7 @@ module mvu_4sx4u #( localparam int unsigned PE_REM = 4*(c+1) - PE_END; uwire [47:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD][3]; + uwire signed [ 1:0] h3[SIMD][4]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly @@ -500,6 +512,16 @@ module mvu_4sx4u #( for(genvar i = 0; i < 3; i++) begin assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1]; end + // Overflow out of high lane + logic PZ = 0; + always_ff @(posedge clk) begin + if(rst) PZ <= 0; + else if(en) PZ <= L[3]? 0 : pp[$left(pp)]; + end + assign h3[s][3] = + ( PZ && !pp[$left(pp)-:2])? +1 : + (!PZ && &pp[$left(pp)-:2])? -1 : 0; + assign p3[s] = pp; end : genSIMD @@ -509,17 +531,16 @@ module mvu_4sx4u #( // Count leaves reachable from each node localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [ HI_WIDTH_MAX-1:0] hi4[3]; - uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3]; + uwire signed [HI_WIDTH_MAX-1:0] hi4[4]; + uwire [LO_WIDTH_MAX-1:0] lo4[4]; for(genvar i = 0; i < 4; i++) begin // Conclusive high part accumulation - if(i < 3) begin : genHi - if(i < PE_REM) assign hi4[i] = '0; + if(i < PE_REM) assign hi4[i] = 0; + else begin : genHi + localparam int unsigned HI_WIDTH = hi_width(i); + if(HI_WIDTH == 0) assign hi4[i] = 0; else begin - localparam int unsigned HI_WIDTH = hi_width(i); - // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; @@ -543,7 +564,6 @@ module mvu_4sx4u #( end end assign hi4[i] = Hi4; - end end : genHi @@ -553,12 +573,12 @@ module mvu_4sx4u #( localparam int unsigned LO_WIDTH = lo_width(i); // Adder Tree across all SIMD low contributions - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end @@ -569,8 +589,7 @@ module mvu_4sx4u #( else if(en) Lo4 <= tree[0]; end - if(i == 3) assign up4 = Lo4; - else assign lo4[i] = Lo4; + assign lo4[i] = Lo4; end : genLo end @@ -580,7 +599,7 @@ module mvu_4sx4u #( always_ff @(posedge clk) begin if(rst) Res5 <= '{ default: 0 }; else if(en) begin - 
Res5[3] <= up4 - hi4[2]; + Res5[3] <= $signed({ hi4[3], {(lo_width(3)){1'b0}} }) + $signed({ 1'b0, lo4[3] }) - hi4[2]; Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] }); diff --git a/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.dat new file mode 100644 index 0000000000..ea37bdc7b0 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.dat @@ -0,0 +1,2304 @@ +e +2 +5 +d +7 +6 +f +9 +f +d +e +2 +6 +0 +4 +d +e +4 +4 +1 +f +6 +3 +2 +a +3 +8 +3 +a +c +1 +e +7 +1 +f +f +1 +9 +1 +2 +e +8 +7 +f +3 +3 +4 +0 +e +d +2 +5 +3 +c +f +a +4 +b +9 +6 +2 +2 +4 +b +4 +8 +4 +4 +4 +5 +d +c +f +d +f +a +d +e +e +a +8 +7 +0 +e +4 +3 +9 +a +f +7 +5 +c +0 +c +d +4 +9 +d +9 +9 +a +6 +0 +4 +0 +e +2 +6 +e +7 +6 +2 +4 +5 +0 +7 +a +8 +9 +3 +0 +0 +5 +2 +0 +4 +2 +a +3 +0 +d +a +0 +f +0 +4 +9 +f +2 +2 +4 +d +4 +1 +d +8 +a +e +5 +8 +0 +9 +5 +6 +d +4 +3 +e +c +4 +5 +6 +6 +7 +2 +0 +7 +a +c +d +b +7 +d +7 +2 +e +5 +2 +3 +9 +2 +1 +1 +2 +b +d +f +1 +0 +2 +6 +b +4 +0 +4 +2 +e +2 +0 +8 +6 +a +6 +e +4 +8 +4 +9 +0 +e +2 +8 +2 +9 +6 +3 +1 +3 +4 +9 +c +6 +d +6 +b +f +d +1 +f +d +d +4 +1 +3 +e +8 +b +6 +8 +1 +a +0 +f +4 +e +4 +b +c +2 +f +d +7 +8 +4 +6 +1 +5 +3 +1 +e +4 +e +2 +3 +9 +9 +1 +8 +b +f +9 +f +9 +4 +8 +e +8 +8 +9 +b +2 +0 +2 +0 +3 +6 +7 +0 +1 +a +2 +5 +0 +c +1 +7 +3 +4 +3 +7 +e +e +0 +5 +6 +a +d +e +0 +f +b +7 +7 +f +d +a +2 +c +a +2 +7 +7 +6 +9 +e +e +f +b +a +e +9 +3 +0 +0 +6 +d +e +f +0 +7 +6 +c +b +9 +8 +a +9 +a +a +3 +e +7 +1 +b +5 +7 +1 +2 +8 +0 +d +a +1 +4 +b +c +4 +8 +f +9 +6 +6 +6 +a +6 +4 +9 +d +e +b +a +8 +7 +e +6 +e +b +0 +8 +b +9 +d +5 +0 +4 +9 +3 +d +2 +3 +1 +7 +7 +5 +4 +c +2 +a +8 +a +4 +7 +c +1 +5 +a +5 +6 +b +4 +9 +9 +f +2 +a +1 +6 +6 +d +5 +d +d +a +6 +6 +2 +8 +f +3 +c +c +4 +0 +d +0 +4 +3 +a +c +2 +9 +7 +2 +f +8 +8 +8 +b +4 +b +b +d +c +c 
+b +5 +f +a +0 +3 +1 +d +0 +0 +2 +6 +1 +7 +a +e +4 +2 +4 +3 +9 +2 +9 +e +b +f +7 +4 +1 +4 +f +4 +c +5 +b +8 +8 +f +5 +4 +0 +f +d +3 +d +d +6 +f +2 +5 +4 +1 +d +0 +b +4 +d +b +a +7 +c +2 +3 +3 +2 +f +4 +7 +5 +2 +2 +9 +a +8 +e +3 +e +b +3 +6 +1 +f +5 +2 +8 +a +5 +e +8 +e +d +e +c +1 +9 +e +8 +3 +4 +c +7 +4 +d +2 +8 +d +b +e +3 +a +3 +7 +6 +c +c +4 +2 +d +0 +4 +3 +e +d +6 +7 +9 +7 +3 +0 +6 +b +f +0 +6 +2 +2 +f +2 +0 +4 +0 +7 +b +b +5 +5 +a +e +5 +f +7 +5 +4 +e +1 +3 +4 +3 +8 +f +c +8 +c +c +7 +1 +b +4 +a +d +2 +b +4 +5 +7 +8 +a +e +3 +d +8 +6 +d +c +d +9 +5 +0 +b +3 +4 +1 +2 +e +6 +d +1 +5 +3 +a +d +f +5 +2 +e +d +a +0 +c +6 +e +8 +1 +2 +0 +d +c +3 +e +d +c +9 +f +7 +d +3 +f +4 +7 +3 +c +2 +f +3 +7 +b +3 +b +f +b +a +e +7 +a +0 +2 +b +5 +4 +8 +8 +5 +6 +1 +c +4 +2 +2 +9 +d +8 +0 +c +b +e +6 +0 +c +b +0 +d +c +c +5 +4 +d +0 +b +d +f +d +6 +c +d +5 +c +3 +5 +6 +3 +0 +7 +7 +5 +7 +c +7 +0 +1 +8 +b +5 +c +c +a +3 +d +1 +c +4 +c +3 +6 +0 +6 +7 +5 +2 +c +9 +2 +a +7 +2 +0 +1 +4 +b +1 +8 +7 +f +c +d +9 +f +5 +2 +d +9 +3 +f +e +e +8 +7 +f +7 +5 +d +b +2 +1 +b +2 +8 +3 +1 +e +7 +8 +8 +2 +4 +b +3 +9 +2 +f +2 +8 +b +3 +9 +e +8 +d +5 +d +8 +b +e +f +6 +4 +d +0 +4 +4 +3 +d +9 +c +c +d +d +4 +b +8 +f +4 +7 +0 +c +6 +4 +0 +a +5 +a +3 +3 +d +2 +2 +d +b +5 +c +a +2 +9 +9 +9 +2 +7 +9 +c +f +3 +b +8 +b +2 +c +2 +7 +c +1 +a +f +9 +5 +3 +8 +8 +d +1 +9 +2 +2 +b +c +1 +0 +9 +8 +7 +a +b +6 +7 +8 +7 +8 +9 +7 +5 +d +4 +2 +e +9 +e +0 +1 +2 +3 +d +5 +c +6 +0 +2 +d +7 +9 +1 +1 +a +3 +7 +9 +b +5 +9 +0 +8 +c +1 +7 +c +b +6 +7 +5 +f +3 +a +9 +d +d +f +6 +e +a +c +9 +a +c +1 +3 +a +9 +8 +2 +9 +4 +6 +8 +2 +c +e +d +6 +7 +1 +9 +a +3 +7 +0 +f +2 +c +0 +4 +b +e +1 +1 +4 +1 +c +6 +6 +1 +d +4 +e +2 +2 +1 +7 +6 +2 +8 +3 +7 +5 +b +6 +a +a +7 +7 +1 +a +a +0 +e +a +c +d +0 +6 +e +6 +0 +f +2 +a +3 +7 +9 +a +7 +5 +6 +6 +f +f +8 +1 +5 +8 +4 +4 +f +4 +2 +e +c +f +c +f +9 +0 +7 +2 +3 +0 +2 +1 +a +5 +2 +5 +a +8 +5 +c +6 +0 +9 +f +f +0 +7 +8 +8 +c +c +7 +5 +3 +a +a +4 +a +4 +9 +3 +4 +e +2 +6 +3 +6 +9 +d +f +f +9 +4 +a 
+9 +8 +9 +8 +0 +9 +d +0 +e +7 +9 +3 +8 +e +e +e +e +e +7 +7 +a +a +0 +a +1 +7 +1 +a +1 +3 +e +e +b +8 +c +c +a +b +8 +4 +b +9 +5 +0 +5 +3 +9 +e +7 +d +3 +7 +b +e +d +9 +6 +9 +a +4 +5 +8 +c +e +9 +0 +2 +5 +5 +f +d +8 +d +5 +6 +c +9 +6 +4 +4 +f +2 +9 +e +a +8 +8 +9 +4 +b +8 +4 +5 +4 +f +9 +a +7 +6 +e +f +c +0 +2 +6 +3 +2 +a +3 +7 +9 +3 +e +7 +e +c +f +f +b +0 +4 +0 +c +8 +c +a +9 +e +5 +0 +e +4 +d +6 +0 +d +5 +c +c +a +d +c +f +0 +2 +0 +9 +7 +b +9 +8 +5 +3 +5 +e +4 +2 +7 +0 +2 +c +6 +0 +3 +f +8 +5 +6 +0 +6 +8 +a +3 +3 +0 +4 +d +c +1 +5 +a +6 +b +5 +9 +b +2 +4 +f +d +3 +e +7 +d +c +e +f +3 +d +3 +7 +e +b +f +9 +8 +4 +4 +9 +0 +5 +2 +3 +b +6 +4 +5 +1 +e +8 +a +7 +4 +3 +3 +9 +2 +8 +9 +4 +9 +1 +1 +3 +7 +b +c +8 +d +a +7 +8 +7 +9 +1 +9 +4 +8 +7 +b +3 +6 +1 +7 +8 +f +7 +8 +6 +4 +1 +1 +f +6 +3 +b +1 +9 +5 +c +3 +f +6 +2 +d +1 +1 +c +8 +f +3 +c +f +9 +c +5 +1 +8 +8 +b +f +5 +6 +1 +2 +1 +f +9 +9 +f +2 +7 +3 +7 +7 +c +7 +2 +c +8 +4 +d +f +1 +2 +1 +0 +1 +8 +9 +1 +f +0 +1 +4 +b +8 +f +e +e +0 +6 +2 +e +e +1 +3 +8 +c +0 +7 +4 +0 +4 +b +0 +3 +2 +a +9 +f +6 +7 +4 +5 +4 +2 +6 +0 +8 +1 +1 +e +0 +c +8 +9 +c +4 +b +5 +f +6 +5 +f +b +b +a +2 +c +b +9 +a +f +4 +5 +0 +e +8 +5 +b +8 +9 +b +9 +0 +2 +7 +5 +d +9 +f +3 +2 +f +7 +d +2 +b +6 +d +6 +c +f +5 +d +d +e +d +4 +a +e +b +b +5 +d +1 +1 +3 +2 +2 +9 +7 +0 +3 +e +5 +6 +9 +9 +9 +3 +7 +c +0 +6 +6 +3 +6 +7 +9 +9 +e +4 +9 +f +8 +4 +8 +7 +9 +7 +8 +0 +3 +8 +9 +4 +0 +9 +2 +1 +4 +6 +f +6 +b +2 +d +6 +4 +4 +e +4 +1 +6 +c +4 +7 +2 +6 +2 +9 +a +7 +f +3 +2 +e +0 +8 +8 +b +f +9 +0 +4 +1 +9 +a +b +7 +4 +9 +6 +5 +d +0 +b +1 +4 +0 +6 +f +d +a +c +e +a +9 +9 +c +3 +e +0 +c +8 +d +4 +c +b +1 +9 +b +9 +6 +4 +3 +a +9 +f +7 +8 +d +a +2 +6 +f +6 +4 +0 +4 +0 +2 +2 +5 +2 +c +0 +d +f +6 +7 +2 +d +7 +1 +9 +3 +5 +5 +1 +2 +1 +6 +9 +3 +f +5 +2 +3 +0 +d +3 +d +2 +0 +6 +2 +8 +1 +4 +b +9 +c +4 +f +b +4 +4 +c +0 +0 +7 +1 +0 +b +3 +1 +b +9 +c +b +8 +f +6 +d +a +5 +c +4 +5 +7 +6 +6 +f +6 +6 +5 +3 +3 +5 +e +9 +9 +3 +0 +7 +a +9 +a +f +7 +d +5 +e +d +c +a +5 +c +6 +1 +4 +6 +5 
+3 +1 +7 +b +1 +1 +d +8 +9 +1 +d +e +a +b +3 +a +e +e +a +3 +4 +e +1 +b +6 +a +6 +3 +6 +3 +e +c +1 +8 +2 +f +3 +c +4 +c +8 +f +9 +a +1 +4 +b +8 +a +8 +f +8 +4 +c +a +0 +c +6 +6 +5 +e +3 +8 +7 +6 +c +2 +e +7 +d +a +0 +3 +b +1 +b +6 +e +0 +7 +a +1 +b +2 +4 +d +9 +3 +5 +e +a +7 +4 +a +8 +e +1 +4 +f +c +5 +5 +0 +9 +1 +4 +a +0 +c +1 +e +f +2 +c +9 +2 +d +d +1 +7 +1 +b +d +6 +6 +8 +8 +d +8 +1 +0 +8 +a +f +f +a +0 +1 +c +8 +a +e +7 +d +5 +a +5 +c +0 +c +1 +d +e +e +5 +4 +2 +5 +d +5 +2 +c +1 +f +4 +d +7 +3 +2 +2 +1 +3 +6 +d +2 +b +7 +b +c +b +1 +3 +1 +7 +f +d +a +d +0 +3 +2 +0 +2 +0 +c +3 +a +9 +4 +0 +d +e +8 +e +c +1 +8 +0 +7 +4 +8 +b +c +d +4 +0 +6 +c +6 +0 +7 +5 +c +0 +a +7 +0 +f +d +a +e +3 +d +4 +0 +4 +b +2 +0 +6 +6 +4 +e +d +f +f +8 +4 +f +6 +c +3 +b +a +c +1 +c +b +4 +2 +e +1 +f +e +d +a +d +0 +e +8 +5 +f +8 +0 +c +f +4 +5 +7 +3 +d +7 +7 +1 +7 +3 +0 +0 +c +9 +0 +7 +7 +1 +1 +8 +1 +0 +3 +a +e +f +9 +3 +c +e +6 +3 +e +3 +9 +b +7 +7 +6 +f +d +6 +8 +f +d +e +3 +d +a +8 +b +2 +5 +a +4 +f +d +9 +e +c +e +7 +f +c +0 +e +4 +6 +6 +5 +7 +2 +3 +1 +1 +7 +1 +b +4 +4 +4 +1 +a +c +0 +9 +3 +a +b +3 +f +1 +8 +5 +8 +e +c +8 +8 +f +a +c +b +1 +b +7 +b +2 +e +a +b +e +d +a +1 +f +2 +5 +7 +4 +2 +3 +e +d +5 +2 +3 +a +b +9 +4 +1 +d +d +c +2 +a +1 +8 +8 +d +2 +3 +e +d +1 +9 +0 +f +9 +0 +3 +6 +7 +5 +e +9 +d +7 +a +5 +f +4 +e +a +a +6 +5 +b +f +2 +6 +c +e +4 +d +8 +f +c +9 +d +4 +a +b +e +7 +a +3 +4 +1 +5 +4 +8 +d +8 +2 +f +5 +b +0 +2 +2 +7 +2 +4 +5 +9 +c +7 +6 +9 +5 +e +3 +a +2 +f +d +7 +8 +e +0 +2 +1 +c +6 +5 +f +8 +8 +8 +2 diff --git a/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.sv new file mode 100644 index 0000000000..e587b8b594 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_4x4_accu_tb.sv @@ -0,0 +1,162 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
+ *****************************************************************************/ + +module mvu_4x4_accu_tb; + + localparam IS_MVU = 1; + localparam COMPUTE_CORE = "mvu_4sx4u_dsp48e2"; + localparam PUMPED_COMPUTE = 0; + localparam MW = 128; + localparam MH = 18; + localparam PE = 1; + localparam SIMD = 1; + localparam ACTIVATION_WIDTH = 4; + localparam WEIGHT_WIDTH = 4; + localparam NARROW_WEIGHTS = 0; + localparam SIGNED_ACTIVATIONS = 0; + localparam SEGMENTLEN = 1; + localparam FORCE_BEHAVIORAL = 0; + + // Safely deducible parameters + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8; + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; + initial $readmemh("mvu_4x4_accu_tb.dat", WeightMem); + + // Shared Input Feed + logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; + logic in_TVALID[2]; + uwire in_TREADY[2]; + initial begin + in_TDATA = 'x; + in_TVALID = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(523*MW) begin + automatic logic [ACTIVATION_WIDTH-1:0] a = $urandom(); + in_TDATA <= a; + in_TVALID <= '{ default: 1 }; + fork + begin + @(posedge clk iff in_TREADY[0]); + in_TVALID[0] <= 0; + end + begin + @(posedge clk iff in_TREADY[1]); + in_TVALID[1] <= 0; + end + join + end + + repeat(MH*MW) @(posedge clk); + $display("Test completed."); + $finish; + end + + // DUTs + localparam int unsigned ACCU_WIDTHS[2] = '{ 16, 32 }; + int OutQ[2][$]; + for(genvar i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs + localparam int unsigned ACCU_WIDTH = ACCU_WIDTHS[i]; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Private Weight Feed + logic [WEIGHT_STREAM_WIDTH_BA-1:0] weights_TDATA; + logic weights_TVALID; + uwire weights_TREADY; + initial begin + weights_TDATA = 'x; + 
weights_TVALID = 0; + @(posedge clk iff !rst); + + weights_TVALID <= 1; + forever begin + for(int unsigned i = 0; i < MH*MW; i++) begin + weights_TDATA <= WeightMem[i]; + @(posedge clk iff weights_TREADY); + end + end + end + + // Private Output Capture into Queue + uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0] out_TDATA; + uwire out_TVALID; + uwire out_TREADY = !rst; + always_ff @(posedge clk iff !rst) begin + if(out_TVALID) OutQ[i].push_back(out_TDATA); + end + + // Actual DUT Instance + mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) dut ( + .ap_clk(clk), + .ap_clk2x(1'b0), + .ap_rst_n(!rst), + .s_axis_weights_tdata(weights_TDATA), + .s_axis_weights_tvalid(weights_TVALID), + .s_axis_weights_tready(weights_TREADY), + .s_axis_input_tdata(in_TDATA), + .s_axis_input_tvalid(in_TVALID[i]), + .s_axis_input_tready(in_TREADY[i]), + .m_axis_output_tdata(out_TDATA), + .m_axis_output_tvalid(out_TVALID), + .m_axis_output_tready(out_TREADY) + ); + end : genDUTs + + // Output Equivalence Checker + always_ff @(posedge clk) begin + if(OutQ[0].size && OutQ[1].size) begin + automatic int unsigned y0 = OutQ[0].pop_front(); + automatic int unsigned y1 = OutQ[1].pop_front(); + assert(y0 == y1) else begin + $error("Output Mismatch: %0d vs. 
%0d", y0, y1); + $stop; + end + end + end + +endmodule : mvu_4x4_accu_tb diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_8x8_accu_tb.dat similarity index 100% rename from finn-rtllib/mvu/tb/mvu_accu_tb.dat rename to finn-rtllib/mvu/tb/mvu_8x8_accu_tb.dat diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_8x8_accu_tb.sv similarity index 98% rename from finn-rtllib/mvu/tb/mvu_accu_tb.sv rename to finn-rtllib/mvu/tb/mvu_8x8_accu_tb.sv index ceeb31194c..8c1d79cbb2 100644 --- a/finn-rtllib/mvu/tb/mvu_accu_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_8x8_accu_tb.sv @@ -31,7 +31,7 @@ * @brief Testbench for MVU core compute kernel. *****************************************************************************/ -module mvu_accu_tb; +module mvu_8x8_accu_tb; localparam IS_MVU = 1; localparam COMPUTE_CORE = "mvu_8sx8u_dsp48"; @@ -61,7 +61,7 @@ module mvu_accu_tb; end logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; - initial $readmemh("mvu_accu_tb.dat", WeightMem); + initial $readmemh("mvu_8x8_accu_tb.dat", WeightMem); // Shared Input Feed logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; @@ -159,4 +159,4 @@ module mvu_accu_tb; end end -endmodule : mvu_accu_tb +endmodule : mvu_8x8_accu_tb From 4c3a93cd90d0f65f532bb51ade7e6e2b11febf1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Tue, 15 Oct 2024 16:19:21 +0100 Subject: [PATCH 3/5] Adopt DSP58 support. Broaden testbench coverage. Work around Verilator limitations. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 193 +++++++++++++++--- finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv | 301 +++++++++++++++++------------ finn-rtllib/mvu/tb/mvu_dsp58_tb.sv | 33 ++++ 3 files changed, 379 insertions(+), 148 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 1f9ed00d22..9d515e08e5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -39,6 +39,7 @@ module mvu_4sx4u #( int unsigned ACCU_WIDTH, int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS + // Allowed versions - 1: DSP48E1, 2: DSP48E2, 3: DSP58 bit SIGNED_ACTIVATIONS = 0, bit NARROW_WEIGHTS = 0, // Weights from [-7:7] rather than [-8:7] bit FORCE_BEHAVIORAL = 0 @@ -58,7 +59,8 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - // for verilator always use behavioral code + + // For Verilator: always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || @@ -66,10 +68,15 @@ module mvu_4sx4u #( FORCE_BEHAVIORAL; //----------------------------------------------------------------------- - // Determine Lane Configuration + // Determine version-specific constraints + typedef enum { DSP48E1 = 1, DSP48E2 = 2, DSP58 = 3 } dsp_version_e; + localparam int unsigned A_WIDTH = 25 + 2*(VERSION > 1); // Width of A datapath + localparam int unsigned B_WIDTH = 18 + 6*(VERSION > 2); // Width of B datapath + localparam int unsigned P_WIDTH = VERSION == DSP58? 58 : 48; // Width of P datapath + initial begin - if(!NARROW_WEIGHTS && (VERSION == 1)) begin - $error("%m: Need NARROW_WEIGHTS for DSP48E1."); + if(!NARROW_WEIGHTS && (VERSION == DSP48E1)) begin + $error("%m: Need NARROW_WEIGHTS for %s.", DSP48E1.name); $finish; end end @@ -88,19 +95,19 @@ module mvu_4sx4u #( typedef int unsigned lane_offset_v[4:0]; function lane_offset_v sliceLanes(); automatic lane_offset_v res; + unique case(VERSION) - 1: begin + DSP48E1: begin res = NARROW_WEIGHTS? 
- lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } : + lane_offset_v'{ ACCU_WIDTH+21 > P_WIDTH? P_WIDTH : ACCU_WIDTH+21, 21, 14, 7, 0 } : lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported end - 2: begin + DSP48E2, DSP58: begin res = NARROW_WEIGHTS? - lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : - lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 }; + lane_offset_v'{ ACCU_WIDTH+23 > P_WIDTH? P_WIDTH : ACCU_WIDTH+23, 23, 16, 8, 0 } : + lane_offset_v'{ ACCU_WIDTH+22 > P_WIDTH? P_WIDTH : ACCU_WIDTH+22, 22, 15, 8, 0 }; end endcase - if(res[4] > 48) res[4] = 48; return res; endfunction : sliceLanes localparam lane_offset_v OFFSETS = sliceLanes(); @@ -123,8 +130,6 @@ module mvu_4sx4u #( localparam int unsigned LO_WIDTH_MAX = lo_width(3); localparam int unsigned HI_WIDTH_MAX = hi_width(2); - localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath - // Compute the count of decendents for all nodes in the reduction trees. typedef int unsigned leave_load_t[2*SIMD-1]; function leave_load_t init_leave_loads(); @@ -150,15 +155,15 @@ module mvu_4sx4u #( localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); localparam int unsigned PE_REM = 4*(c+1) - PE_END; - uwire [47:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD][4]; + uwire [P_WIDTH-1:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][4]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [29:0] aa; - logic [26:0] dd; - logic [ 1:0] xx[3:1]; + uwire [B_WIDTH-1:0] bb = { {(B_WIDTH-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [A_WIDTH-1:0] aa; + logic [A_WIDTH-1:0] dd; + logic [1:0] xx[3:1]; if(1) begin : blkVectorize uwire signed [3:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin @@ -202,7 +207,7 @@ module mvu_4sx4u #( end end : blkVectorize - uwire [47:0] pp; + uwire [P_WIDTH-1:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B @@ -210,7 +215,7 @@ module mvu_4sx4u #( if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [17:0] B1 = 0; + logic signed [B_WIDTH-1:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -223,7 +228,7 @@ module mvu_4sx4u #( end // Stage #2: Multiply - logic signed [45:0] M2 = 0; + logic signed [A_WIDTH+B_WIDTH-1:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -234,7 +239,7 @@ module mvu_4sx4u #( end // Stage #3: Accumulate - logic signed [47:0] P3 = 0; + logic signed [P_WIDTH-1:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 
0 : P3); @@ -247,7 +252,7 @@ module mvu_4sx4u #( localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; case(VERSION) - 1: DSP48E1 #( + DSP48E1: DSP48E1 #( // Feature Control Attributes: Data Path Selection .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) @@ -311,7 +316,7 @@ module mvu_4sx4u #( .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input // Data: 30-bit (each) input: Data Ports - .A(aa), // 30-bit input: A data input + .A({5'b0, aa}), // 30-bit input: A data input .B(bb), // 18-bit input: B data input .C('x), // 48-bit input: C data input .CARRYIN('0), // 1-bit input: Carry input signal @@ -352,7 +357,7 @@ module mvu_4sx4u #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); - 2: DSP48E2 #( + DSP48E2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -440,13 +445,147 @@ module mvu_4sx4u #( .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports - .A(aa), // 34-bit input: A data + .A({3'b0, aa}), // 30-bit input: A data + .B(bb), // 18-bit input: B data + .C('x), // 48-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + 
.CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + DSP58: DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT24"), + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"),// Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of 
AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_NEGATE_INVERTED('0), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for RSTCTRL + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), 
// Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A({7'b0, aa}), // 34-bit input: A data .B(bb), // 24-bit input: B data .C('x), // 58-bit input: C data .CARRYIN('0), // 1-bit input: Carry-in .D(dd), // 27-bit input: D data // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: 
Asynchronous reset for all registers .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG .CEAD(en), // 1-bit input: Clock enable for ADREG @@ -482,7 +621,7 @@ module mvu_4sx4u #( .RSTP(rst) // 1-bit input: Reset for PREG ); default: initial begin - $error("Unknown version DSP48E%0d.", VERSION); + $error("Unknown DSP version."); $finish; end endcase diff --git a/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv b/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv index 783218e08c..a169fb5176 100644 --- a/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_3sx3u_tb.sv @@ -1,16 +1,56 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + module mvu_3sx3u_tb; - localparam int unsigned ROUNDS = 157; + localparam int unsigned ROUNDS = 59; - localparam int unsigned MH = 32; + localparam int unsigned MH = 48; localparam int unsigned MW = 60; - localparam int unsigned PE = 1; - localparam int unsigned SIMD = 1; localparam int unsigned ACTIVATION_WIDTH = 3; localparam int unsigned WEIGHT_WIDTH = 3; localparam int unsigned ACCU_WIDTH = 16; + //----------------------------------------------------------------------- + // Random Weights + typedef logic signed [WEIGHT_WIDTH-1:0] weights_t[MH][MW]; + function weights_t calc_WEIGHTS(); + automatic weights_t ret; + std::randomize(ret); + return ret; + endfunction : calc_WEIGHTS + weights_t WEIGHTS = calc_WEIGHTS(); //----------------------------------------------------------------------- // Global Control @@ -24,142 +64,161 @@ module mvu_3sx3u_tb; end //----------------------------------------------------------------------- - // DUT - logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; - logic s_axis_weights_tvalid; - uwire s_axis_weights_tready; - - logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; - logic s_axis_input_tvalid; - uwire s_axis_input_tready; - - uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata; - uwire m_axis_output_tvalid; - logic m_axis_output_tready; - - 
mvu_vvu_axi #( - .IS_MVU(1), - .COMPUTE_CORE("mvu_4sx4u_dsp48e2"), - .MH(MH), .MW(MW), - .PE(PE), .SIMD(SIMD), - - .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), - .ACCU_WIDTH(ACCU_WIDTH) - //int unsigned SEGMENTLEN = 0, - //bit FORCE_BEHAVIORAL = 0, - ) dut ( - .ap_clk(clk), .ap_clk2x('x), .ap_rst_n(!rst), - .s_axis_weights_tdata, .s_axis_weights_tvalid, .s_axis_weights_tready, - .s_axis_input_tdata, .s_axis_input_tvalid, .s_axis_input_tready, - .m_axis_output_tdata, .m_axis_output_tvalid, .m_axis_output_tready - ); - - //----------------------------------------------------------------------- - // Stimuli - - //- Infinite Weight Feed ------------ - typedef logic signed [WEIGHT_WIDTH-1:0] weights_t[MH][MW]; - function weights_t calc_WEIGHTS(); - automatic weights_t ret; - std::randomize(ret); - return ret; - endfunction : calc_WEIGHTS - weights_t WEIGHTS = calc_WEIGHTS(); + // DUTs of Different Geometries + localparam int unsigned N = 6; + typedef struct { + int unsigned pe; + int unsigned simd; + } cfg_t; + localparam cfg_t TESTS[N] = '{ + cfg_t'{ pe: 1, simd: 1 }, + cfg_t'{ pe: 2, simd: 1 }, + cfg_t'{ pe: 1, simd: 2 }, + cfg_t'{ pe: 6, simd: 6 }, + cfg_t'{ pe: 12, simd: 3 }, + cfg_t'{ pe: 4, simd: 30 } + }; + bit [N-1:0] done = 0; + always_comb begin + if(&done) begin + $display("All tests completed."); + $finish; + end + end - initial begin - s_axis_weights_tdata = 'x; - s_axis_weights_tvalid = 0; - @(posedge clk iff !rst); - - forever begin - for(int unsigned h = 0; h < MH; h+=PE) begin - for(int unsigned w = 0; w < MW; w+=SIMD) begin - for(int unsigned pe = 0; pe < PE; pe++) begin - for(int unsigned simd = 0; simd < SIMD; simd++) begin - s_axis_weights_tdata[pe][simd] <= WEIGHTS[h+pe][w+simd]; + for(genvar test = 0; test < N; test++) begin : genTests + localparam int unsigned PE = TESTS[test].pe; + localparam int unsigned SIMD = TESTS[test].simd; + + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic 
s_axis_weights_tvalid; + uwire s_axis_weights_tready; + + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid; + uwire s_axis_input_tready; + + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata; + uwire m_axis_output_tvalid; + logic m_axis_output_tready; + + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_4sx4u_dsp48e2"), + .MH(MH), .MW(MW), + .PE(PE), .SIMD(SIMD), + + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH) + //int unsigned SEGMENTLEN = 0, + //bit FORCE_BEHAVIORAL = 0, + ) dut ( + .ap_clk(clk), .ap_clk2x('x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid, .s_axis_weights_tready, + .s_axis_input_tdata, .s_axis_input_tvalid, .s_axis_input_tready, + .m_axis_output_tdata, .m_axis_output_tvalid, .m_axis_output_tready + ); + + //----------------------------------------------------------------------- + // Stimuli + + //- Infinite Weight Feed ------------ + initial begin + s_axis_weights_tdata = 'x; + s_axis_weights_tvalid = 0; + @(posedge clk iff !rst); + + forever begin + for(int unsigned h = 0; h < MH; h+=PE) begin + for(int unsigned w = 0; w < MW; w+=SIMD) begin + for(int unsigned pe = 0; pe < PE; pe++) begin + for(int unsigned simd = 0; simd < SIMD; simd++) begin + s_axis_weights_tdata[pe][simd] <= WEIGHTS[h+pe][w+simd]; + end end + s_axis_weights_tvalid <= 1; + @(posedge clk iff s_axis_weights_tready); + s_axis_weights_tvalid <= 0; + s_axis_weights_tdata <= 'x; end - s_axis_weights_tvalid <= 1; - @(posedge clk iff s_axis_weights_tready); - s_axis_weights_tvalid <= 0; - s_axis_weights_tdata <= 'x; end end end - end - //- Input Feed and Reference Computation - typedef logic [PE-1:0][ACCU_WIDTH-1:0] outvec_t; - outvec_t Q_ref[$] = {}; + //- Input Feed and Reference Computation + typedef logic [PE-1:0][ACCU_WIDTH-1:0] outvec_t; + outvec_t Q_ref[$] = {}; + + initial begin + s_axis_input_tdata = 'x; + s_axis_input_tvalid = 0; + @(posedge clk iff !rst); + + 
repeat(ROUNDS) begin : blkRounds + automatic logic [MH-1:0][ACCU_WIDTH-1:0] accus = '{ default: 0 }; + + for(int unsigned w = 0; w < MW; w+=SIMD) begin : blkSF + for(int unsigned simd = 0; simd < SIMD; simd++) begin : blkSIMD + automatic logic [ACTIVATION_WIDTH-1:0] act = $urandom(); + for(int unsigned h = 0; h < MH; h++) begin : blkMH + automatic logic signed [ACCU_WIDTH-1:0] prod = WEIGHTS[h][w+simd] * $signed({1'b0, act}); + accus[h] += prod; + end : blkMH + s_axis_input_tdata[simd] <= act; + end : blkSIMD + s_axis_input_tvalid <= 1; + @(posedge clk iff s_axis_input_tready); + s_axis_input_tvalid <= 0; + s_axis_input_tdata <= 'x; + end : blkSF + + for(int unsigned h = 0; h < MH; h+=PE) begin + Q_ref.push_back(accus[h+:PE]); + end - initial begin - s_axis_input_tdata = 'x; - s_axis_input_tvalid = 0; - @(posedge clk iff !rst); - - repeat(ROUNDS) begin : blkRounds - automatic logic [MH-1:0][ACCU_WIDTH-1:0] accus = '{ default: 0 }; - - for(int unsigned w = 0; w < MW; w+=SIMD) begin : blkSF - for(int unsigned simd = 0; simd < SIMD; simd++) begin : blkSIMD - automatic logic [ACTIVATION_WIDTH-1:0] act = $urandom(); - for(int unsigned h = 0; h < MH; h++) begin : blkMH - automatic logic signed [ACCU_WIDTH-1:0] prod = WEIGHTS[h][w+simd] * $signed({1'b0, act}); - accus[h] += prod; - end : blkMH - s_axis_input_tdata[simd] <= act; - end : blkSIMD - s_axis_input_tvalid <= 1; - @(posedge clk iff s_axis_input_tready); - s_axis_input_tvalid <= 0; - s_axis_input_tdata <= 'x; - end : blkSF - - for(int unsigned h = 0; h < MH; h+=PE) begin - Q_ref.push_back(accus[h+:PE]); - end + end : blkRounds + end - end : blkRounds - end + //- Output Checker + initial begin + automatic int timeout = 0; - //- Output Checker - initial begin - automatic int timeout = 0; + m_axis_output_tready = 0; + @(posedge clk iff !rst); - m_axis_output_tready = 0; - @(posedge clk iff !rst); + m_axis_output_tready <= 1; + while(timeout < MW/SIMD+16) begin + @(posedge clk); + if(!m_axis_output_tvalid) timeout++; 
+ else begin + automatic outvec_t exp; - m_axis_output_tready <= 1; - while(timeout < MW/SIMD+16) begin - @(posedge clk); - if(!m_axis_output_tvalid) timeout++; - else begin - automatic outvec_t exp; + assert(Q_ref.size()) else begin + $error("Spurious output."); + $stop; + end - assert(Q_ref.size()) else begin - $error("Spurious output."); - $stop; - end + exp = Q_ref.pop_front(); + assert(m_axis_output_tdata === exp) else begin + $error("Mismatched output %p instead of %p.", m_axis_output_tdata, exp); + $stop; + end - exp = Q_ref.pop_front(); - assert(m_axis_output_tdata === exp) else begin - $error("Mismatched output %p instead of %p.", m_axis_output_tdata, exp); - $stop; + timeout = 0; end + end + m_axis_output_tready <= 0; - timeout = 0; + assert(Q_ref.size() == 0) else begin + $error("Missing output."); + $stop; end - end - m_axis_output_tready <= 0; - assert(Q_ref.size() == 0) else begin - $error("Missing output."); - $stop; + $display("Completed PExSIMD = %0dx%0d.", PE, SIMD); + done[test] = 1; end - $display("Test completed."); - $finish; - end + end : genTests endmodule : mvu_3sx3u_tb diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv index 108980c497..af505cf0de 100644 --- a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -1,3 +1,36 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + module mvu_dsp58_tb; localparam int unsigned N = 1000; From e38606c32dc75e30db9421c95b167aa41e4495f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 16 Oct 2024 07:59:49 +0100 Subject: [PATCH 4/5] Another attempt to please PyVerilator. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 54 ++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 9d515e08e5..07a00a8751 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -76,7 +76,7 @@ module mvu_4sx4u #( initial begin if(!NARROW_WEIGHTS && (VERSION == DSP48E1)) begin - $error("%m: Need NARROW_WEIGHTS for %s.", DSP48E1.name); + $error("%m: Need NARROW_WEIGHTS for DSP48E1."); $finish; end end @@ -92,31 +92,31 @@ module mvu_4sx4u #( * - The one but leftmost lane (#2) has the minimum internal width and, hence, * the macimum external high extension. */ + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width typedef int unsigned lane_offset_v[4:0]; - function lane_offset_v sliceLanes(); - automatic lane_offset_v res; - + function int unsigned lane_offset(input int unsigned i); + automatic lane_offset_v offsets; + automatic int unsigned res; unique case(VERSION) DSP48E1: begin - res = NARROW_WEIGHTS? - lane_offset_v'{ ACCU_WIDTH+21 > P_WIDTH? P_WIDTH : ACCU_WIDTH+21, 21, 14, 7, 0 } : + offsets = NARROW_WEIGHTS? + lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } : lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported end DSP48E2, DSP58: begin - res = NARROW_WEIGHTS? - lane_offset_v'{ ACCU_WIDTH+23 > P_WIDTH? P_WIDTH : ACCU_WIDTH+23, 23, 16, 8, 0 } : - lane_offset_v'{ ACCU_WIDTH+22 > P_WIDTH? P_WIDTH : ACCU_WIDTH+22, 22, 15, 8, 0 }; + offsets = NARROW_WEIGHTS? + lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : + lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 }; end endcase + res = offsets[i]; + if(res > P_WIDTH) res = P_WIDTH; return res; - endfunction : sliceLanes - localparam lane_offset_v OFFSETS = sliceLanes(); - - function int unsigned sum_width(input int unsigned n, input int unsigned w); - return w <= 16? 
$clog2(1 + n*(2**w - 1)) : w + $clog2(n); - endfunction : sum_width + endfunction : lane_offset function int unsigned lo_width(input int unsigned i); - return OFFSETS[i+1] - OFFSETS[i]; + return lane_offset(i+1) - lane_offset(i); endfunction : lo_width function int unsigned hi_width(input int unsigned i); automatic int unsigned lw = lo_width(i); @@ -140,7 +140,9 @@ module mvu_4sx4u #( endfunction : init_leave_loads // Pipeline for last indicator flag +/* verilator lint_off LITENDIAN */ logic [1:5] L = '0; +/* verilator lint_on LITENDIAN */ always_ff @(posedge clk) begin if(rst) L <= '0; else if(en) L <= { last, L[1:4] }; @@ -190,8 +192,8 @@ module mvu_4sx4u #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - automatic int unsigned ofs = OFFSETS[pe + PE_REM]; - dd[ofs+:3] = ww[pe]; + automatic int unsigned ofs = lane_offset(pe + PE_REM); + dd[ofs+:3] = ww[pe][2:0]; assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin $warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment."); end @@ -228,7 +230,8 @@ module mvu_4sx4u #( end // Stage #2: Multiply - logic signed [A_WIDTH+B_WIDTH-1:0] M2 = 0; + localparam int unsigned M_WIDTH = A_WIDTH + B_WIDTH; + logic signed [M_WIDTH-1:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -642,14 +645,14 @@ module mvu_4sx4u #( X1 <= xx; X2 <= X1; foreach(X3[i]) begin - X3[i] <= X2[i] + (L[3]? 2'h0 : pp[OFFSETS[i]+:2]); + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[lane_offset(i)+:2]); end end end // Derive actual cross-lane overflows for(genvar i = 0; i < 3; i++) begin - assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1]; + assign h3[s][i] = pp[lane_offset(i+1)+:2] - X3[i+1]; end // Overflow out of high lane logic PZ = 0; @@ -694,12 +697,12 @@ module mvu_4sx4u #( always_ff @(posedge clk) begin if(rst) Hi4 <= 0; else if(en) begin - automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 
0 : Hi4) + $signed(tree[0]); + automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? {(HI_WIDTH+1){1'b0}} : {Hi4[$left(Hi4)], Hi4}) + $signed(tree[0]); assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); $stop; end - Hi4 <= h; + Hi4 <= h[HI_WIDTH-1:0]; end end assign hi4[i] = Hi4; @@ -714,7 +717,10 @@ module mvu_4sx4u #( // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; + for(genvar s = 0; s < SIMD; s++) begin + uwire [P_WIDTH-1:0] p = p3[s]; + assign tree[SIMD-1+s] = p[lane_offset(i)+:LO_WIDTH]; + end for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); From 49419c3191ab385cef4768e711bcf513cdf626af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 23 Oct 2024 15:56:50 +0100 Subject: [PATCH 5/5] Fixing reset of arithmetically relevant pipeline register. --- finn-rtllib/mvu/mvu_4sx4u.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index f8a9258408..4d7afd0f21 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -533,9 +533,9 @@ module mvu_4sx4u #( typedef logic signed [$clog2(1+LEAVE_LOAD[n]):0] sum_t; uwire sum_t s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); if((0 < n) && (n <= HI_NODE_REGISTERED)) begin - sum_t S = 'x; + sum_t S = 0; always_ff @(posedge clk) begin - if(rst) S <= 'x; + if(rst) S <= 0; else S <= s; end assign tree[n] = S; @@ -550,7 +550,7 @@ module mvu_4sx4u #( else if(en) begin automatic logic signed [HI_WIDTH:0] h = $signed(L[PIPELINE_DEPTH-1]? 
0 : Hi4) + $signed(tree[0]); assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin - $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + $error("%m [%0d:%0d]: Accumulation overflow for ACCU_WIDTH=%0d", c, i, ACCU_WIDTH); $stop; end Hi4 <= h;