Skip to content

Commit

Permalink
Merge branch 'dev' into bugfix/verification_step_name
Browse files Browse the repository at this point in the history
  • Loading branch information
hannahxy13 committed Jun 21, 2024
2 parents c228ce8 + 25c1c99 commit 9afa88b
Show file tree
Hide file tree
Showing 83 changed files with 1,584 additions and 1,086 deletions.
1 change: 1 addition & 0 deletions .github/workflows/quicktest-dev-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ jobs:
export FINN_ROOT=$(pwd)
export FINN_BUILD_DIR=/tmp/finn_gha
export FINN_INST_NAME=finn_gha
export FINN_SKIP_XRT_DOWNLOAD=1
./run-docker.sh quicktest
21 changes: 15 additions & 6 deletions docker/Dockerfile.finn
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ FROM ubuntu:jammy-20230126
LABEL maintainer="Jakoba Petri-Koenig <[email protected]>, Yaman Umuroglu <[email protected]>"

ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"
ARG SKIP_XRT
ARG LOCAL_XRT

WORKDIR /workspace

Expand Down Expand Up @@ -78,15 +80,19 @@ RUN cd verilator && \
make install

# install XRT
RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
RUN rm /tmp/$XRT_DEB_VERSION.deb
RUN if [ -z "$LOCAL_XRT" ] && [ -z "$SKIP_XRT" ];then \
wget -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' "https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb" -O /tmp/$XRT_DEB_VERSION.deb; fi

COPY requirements.txt $XRT_DEB_VERSION.* /tmp/

RUN if [ -z "$SKIP_XRT" ];then \
apt install -y /tmp/$XRT_DEB_VERSION.deb && \
rm /tmp/$XRT_DEB_VERSION.deb; fi

# versioned Python package requirements for FINN compiler
# these are given in requirements.txt
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN rm requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN rm /tmp/requirements.txt

# install PyTorch
RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
Expand Down Expand Up @@ -126,6 +132,9 @@ RUN pip install tokenize-rt==4.2.1
# pyverilator
RUN pip install tclwrapper==0.0.1

# ensure that we have the right setuptools version
RUN pip install setuptools==68.2.2

# extra environment variables for FINN compiler
ENV VIVADO_IP_CACHE "/tmp/vivado_ip_cache"

Expand Down
2 changes: 1 addition & 1 deletion docker/finn_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then
source $XILINX_XRT/setup.sh
gecho "Found XRT at $XILINX_XRT"
else
recho "XRT not found on $XILINX_XRT, did the installation fail?"
recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
exit -1
fi
else
Expand Down
2 changes: 1 addition & 1 deletion fetch-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2"
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
Expand Down
9 changes: 5 additions & 4 deletions finn-rtllib/fifo/hdl/Q_srl.v
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256)
parameter width = 16; // - width of data (i_d, o_d)

parameter addrwidth = $clog2(depth);
localparam countwidth = $clog2(depth + 1);
localparam addrwidth = $clog2(depth);

input clock;
input reset;
Expand All @@ -89,10 +90,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
input o_r; // - output stream ready
wire o_b; // - output stream back-pressure

output [addrwidth:0] count; // - output number of elems in queue
output [addrwidth:0] maxcount; // - maximum observed count since reset
output [countwidth-1:0] count; // - output number of elems in queue
output [countwidth-1:0] maxcount; // - maximum observed count since reset

reg [addrwidth:0] maxcount_reg; // - maximum count seen until now
reg [countwidth-1:0] maxcount_reg; // - maximum count seen until now
reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address
// for data output
reg shift_en_; // - SRL16 shift enable
Expand Down
160 changes: 114 additions & 46 deletions finn-rtllib/mvu/mvu_4sx4u.sv
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ module mvu_4sx4u #(
int unsigned SIMD,
int unsigned ACCU_WIDTH,

int unsigned VERSION = 1,
int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
bit SIGNED_ACTIVATIONS = 0,
bit NARROW_WEIGHTS = 0, // Weights from [-7:7] rather than [-8:7]
bit FORCE_BEHAVIORAL = 0
)(
// Global Control
Expand All @@ -62,6 +63,55 @@ module mvu_4sx4u #(
`endif
FORCE_BEHAVIORAL;

//-----------------------------------------------------------------------
// Determine Lane Configuration
initial begin
if(!NARROW_WEIGHTS && (VERSION == 1)) begin
$error("%m: Need NARROW_WEIGHTS for DSP48E1.");
$finish;
end
end

/**
* Lane Slicing
* Assumptions:
* - Internal lane widths differ, at most, by a single bit.
* - The rightmost lane (#0) has the maximum internal width.
* - The leftmost lane (#3) extends into the wide DSP accumulation path and
* is constrained by ACCU_WIDTH rather than the next lane. It doesn't have
* an external high extension.
* - The second lane from the left (#2) has the minimum internal width and, hence,
* the maximum external high extension.
*/
typedef int unsigned lane_offset_v[4:0];
// Selects the lane offset table for the configured VERSION and weight range.
// Entries are listed from index 4 down to index 0: indices 0..3 are the bit
// offsets of lanes #0..#3, index 4 is the upper bound of lane #3
// (its offset plus ACCU_WIDTH).
function lane_offset_v sliceLanes();
	unique case(VERSION)
	1: begin
		// Version 1 has no layout for full-range weights: all-zero placeholder.
		if(!NARROW_WEIGHTS)  return  lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported
		return  lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 };
	end
	2: begin
		// Full-range weights use the tighter 22/15/8 slicing.
		if(!NARROW_WEIGHTS)  return  lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 };
		return  lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 };
	end
	endcase
endfunction : sliceLanes
localparam lane_offset_v OFFSETS = sliceLanes();

// Internal (low) width of lane i: the distance between its offset and the
// next entry of the OFFSETS table.
function int unsigned lo_width(input int unsigned i);
	lo_width = OFFSETS[i+1] - OFFSETS[i];
endfunction : lo_width
// Width used for the high-part accumulator of lane i, derived from the bits
// of ACCU_WIDTH not covered by the lane's low width plus headroom for SIMD.
function int unsigned hi_width(input int unsigned i);
	automatic int unsigned lo = lo_width(i);
	return 1 + $clog2(2**(ACCU_WIDTH-lo-1)+SIMD);
endfunction : hi_width
localparam int unsigned LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0];
localparam int unsigned HI_WIDTH_MAX = hi_width(2);

localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath

// Compute the count of descendants for all nodes in the reduction trees.
typedef int unsigned leave_load_t[2*SIMD-1];
function leave_load_t init_leave_loads();
automatic leave_load_t res;
Expand All @@ -79,16 +129,14 @@ module mvu_4sx4u #(
assign vld = L[5];

// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets

localparam int unsigned PIPE_COUNT = (PE+3)/4;
for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes

localparam int unsigned PE_BEG = 4*c;
localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1);
localparam int unsigned PE_REM = 4*(c+1) - PE_END;

uwire [57:0] p3[SIMD];
uwire [47:0] p3[SIMD];
uwire signed [ 1:0] h3[SIMD][3];
for(genvar s = 0; s < SIMD; s++) begin : genSIMD

Expand All @@ -98,10 +146,10 @@ module mvu_4sx4u #(
logic [26:0] dd;
logic [ 1:0] xx[3:1];
if(1) begin : blkVectorize
uwire [3:0] ww[PE_END - PE_BEG];
uwire signed [3:0] ww[PE_END - PE_BEG];
for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
assign ww[pe] = w[PE_BEG + pe][s];
if(pe) begin
if(pe > 0) begin
if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
`ifndef VERILATOR
else begin
Expand All @@ -123,8 +171,19 @@ module mvu_4sx4u #(
dd = '0;
aa = '0;
for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin
dd[D[pe + PE_REM]+:3] = ww[pe];
aa[D[pe + PE_REM]+ 3] = ww[pe][3];
automatic int unsigned ofs = OFFSETS[pe + PE_REM];
dd[ofs+:3] = ww[pe];
assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin
$warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment.");
end

// The signs of the weights are generally put on the subtracted A port.
// However, when coinciding with the actual sign bit position of the
// multiplier input path, it also goes onto the D input. This prevents
// sign extensions that may happen when a DSP primitive is auto-promoted
// to a newer generation.
if(ofs+3 == A_WIDTH-1) dd[ofs+3] = ww[pe][3];
else aa[ofs+3] = ww[pe][3];
end
end
end : blkVectorize
Expand All @@ -135,14 +194,15 @@ module mvu_4sx4u #(
// rst can be only applied to AD and zero only to B
// with the same effect as zeroing both.
if(BEHAVIORAL) begin : genBehav

// Stage #1: Input Refine
logic signed [17:0] B1 = 0;
always_ff @(posedge clk) begin
if(zero) B1 <= 0;
else if(en) B1 <= bb;
end

logic signed [26:0] AD1 = 0;
logic signed [A_WIDTH-1:0] AD1 = 0;
always_ff @(posedge clk) begin
if(rst) AD1 <= 0;
else if(en) AD1 <= dd - aa;
Expand Down Expand Up @@ -429,14 +489,14 @@ module mvu_4sx4u #(
X1 <= xx;
X2 <= X1;
foreach(X3[i]) begin
X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
X3[i] <= X2[i] + (L[3]? 2'h0 : pp[OFFSETS[i]+:2]);
end
end
end

// Derive actual cross-lane overflows
for(genvar i = 0; i < 3; i++) begin
assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1];
end
assign p3[s] = pp;

Expand All @@ -445,48 +505,59 @@ module mvu_4sx4u #(
// Stage #4: Cross-SIMD Reduction

// Count leaves reachable from each node
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop

uwire signed [ACCU_WIDTH -1:0] up4;
uwire signed [ACCU_WIDTH -8:0] hi4[3];
uwire [$clog2(SIMD)+7:0] lo4[3];
uwire signed [ACCU_WIDTH-1:0] up4;
uwire signed [ HI_WIDTH_MAX-1:0] hi4[3];
uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3];
for(genvar i = 0; i < 4; i++) begin
localparam int unsigned LO_WIDTH = D[i+1] - D[i];
localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH;

// Conclusive high part accumulation
if(i >= PE_REM && i < 3) begin : genHi
// Adder Tree across all SIMD high contributions, each from [-1:1]
uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
assign tree[n] = s;
end
if(i < 3) begin : genHi
if(i < PE_REM) assign hi4[i] = '0;
else begin
localparam int unsigned HI_WIDTH = hi_width(i);

// Adder Tree across all SIMD high contributions, each from [-1:1]
uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
assign tree[n] = s;
end

// High Sideband Accumulation
logic signed [HI_WIDTH-1:0] Hi4 = 0;
always_ff @(posedge clk) begin
if(rst) Hi4 <= 0;
else if(en) begin
automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]);
assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin
$error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH);
$stop;
end
Hi4 <= h;
end
end
assign hi4[i] = Hi4;

// High Sideband Accumulation
logic signed [HI_WIDTH-1:0] Hi4 = 0;
always_ff @(posedge clk) begin
if(rst) Hi4 <= 0;
else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]);
end
assign hi4[i] = Hi4;
end : genHi
else if (i < 3) begin : genHiZero
assign hi4[i] = '0;
end : genHiZero

// Conclusive low part accumulation
if(i >= PE_REM) begin : blkLo
// Conclusive low part accumulation (all unsigned arithmetic)
if(i < PE_REM) assign lo4[i] = '0;
else begin : genLo
localparam int unsigned LO_WIDTH = lo_width(i);

// Adder Tree across all SIMD low contributions
localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2];
assign tree[n] = s;
end

Expand All @@ -498,10 +569,7 @@ module mvu_4sx4u #(

if(i == 3) assign up4 = Lo4;
else assign lo4[i] = Lo4;
end : blkLo
else begin : blkLoZero
assign lo4[i] = '0;
end : blkLoZero
end : genLo

end

Expand All @@ -511,9 +579,9 @@ module mvu_4sx4u #(
if(rst) Res5 <= '{ default: 0 };
else if(en) begin
Res5[3] <= up4 - hi4[2];
Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] });
end
end

Expand Down
Loading

0 comments on commit 9afa88b

Please sign in to comment.