Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tracking issue for implementing wide multiply #94

Open
gussmith23 opened this issue Aug 3, 2024 · 8 comments
Open

Tracking issue for implementing wide multiply #94

gussmith23 opened this issue Aug 3, 2024 · 8 comments

Comments

@gussmith23
Copy link
Owner

No description provided.

@gussmith23
Copy link
Owner Author

2024-08-03

I'm starting on wide multiply.

Smallest test case that uses just 2 DSPs is 16x32. I mean, there's definitely a smaller one, but that's small enough.

I'll just post it here:

// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
// --------------------------------------------------------------------------------
// Tool Version: Vivado v.2022.1 (lin64) Build 3526262 Mon Apr 18 15:47:01 MDT 2022
// Date        : Mon Jul 22 07:00:25 2024
// Host        : aoi running 64-bit Ubuntu 18.04.6 LTS
// Command     : write_verilog compiled.v -force
// Design      : multiplier
// Purpose     : This is a Verilog netlist of the current design or from a specific cell of the design. The output is an
//               IEEE 1364-2001 compliant Verilog HDL file that contains netlist information obtained from the input
//               design files.
// Device      : xc7z020clg484-1
// --------------------------------------------------------------------------------
`timescale 1 ps / 1 ps

(* use_dsp = "yes" *) 
(* STRUCTURAL_NETLIST = "yes" *)
module multiplier
   (a,
    b,
    out);
  input [15:0]a;
  input [31:0]b;
  output [63:0]out;

  wire \<const0> ;
  wire \<const1> ;
  wire GND_2;
  wire VCC_2;
  wire [15:0]a;
  wire [31:0]b;
  wire [47:0]\^out ;
  wire out__0_n_106;
  wire out__0_n_107;
  wire out__0_n_108;
  wire out__0_n_109;
  wire out__0_n_110;
  wire out__0_n_111;
  wire out__0_n_112;
  wire out__0_n_113;
  wire out__0_n_114;
  wire out__0_n_115;
  wire out__0_n_116;
  wire out__0_n_117;
  wire out__0_n_118;
  wire out__0_n_119;
  wire out__0_n_120;
  wire out__0_n_121;
  wire out__0_n_122;
  wire out__0_n_123;
  wire out__0_n_124;
  wire out__0_n_125;
  wire out__0_n_126;
  wire out__0_n_127;
  wire out__0_n_128;
  wire out__0_n_129;
  wire out__0_n_130;
  wire out__0_n_131;
  wire out__0_n_132;
  wire out__0_n_133;
  wire out__0_n_134;
  wire out__0_n_135;
  wire out__0_n_136;
  wire out__0_n_137;
  wire out__0_n_138;
  wire out__0_n_139;
  wire out__0_n_140;
  wire out__0_n_141;
  wire out__0_n_142;
  wire out__0_n_143;
  wire out__0_n_144;
  wire out__0_n_145;
  wire out__0_n_146;
  wire out__0_n_147;
  wire out__0_n_148;
  wire out__0_n_149;
  wire out__0_n_150;
  wire out__0_n_151;
  wire out__0_n_152;
  wire out__0_n_153;
  wire out__0_n_58;
  wire out__0_n_59;
  wire out__0_n_60;
  wire out__0_n_61;
  wire out__0_n_62;
  wire out__0_n_63;
  wire out__0_n_64;
  wire out__0_n_65;
  wire out__0_n_66;
  wire out__0_n_67;
  wire out__0_n_68;
  wire out__0_n_69;
  wire out__0_n_70;
  wire out__0_n_71;
  wire out__0_n_72;
  wire out__0_n_73;
  wire out__0_n_74;
  wire out__0_n_75;
  wire out__0_n_76;
  wire out__0_n_77;
  wire out__0_n_78;
  wire out__0_n_79;
  wire out__0_n_80;
  wire out__0_n_81;
  wire out__0_n_82;
  wire out__0_n_83;
  wire out__0_n_84;
  wire out__0_n_85;
  wire out__0_n_86;
  wire out__0_n_87;
  wire out__0_n_88;
  wire out__1_n_58;
  wire out__1_n_59;
  wire out__1_n_60;
  wire out__1_n_61;
  wire out__1_n_62;
  wire out__1_n_63;
  wire out__1_n_64;
  wire out__1_n_65;
  wire out__1_n_66;
  wire out__1_n_67;
  wire out__1_n_68;
  wire out__1_n_69;
  wire out__1_n_70;
  wire out__1_n_71;
  wire out__1_n_72;
  wire out__1_n_73;
  wire out__1_n_74;

  assign out[63] = \<const0> ;
  assign out[62] = \<const0> ;
  assign out[61] = \<const0> ;
  assign out[60] = \<const0> ;
  assign out[59] = \<const0> ;
  assign out[58] = \<const0> ;
  assign out[57] = \<const0> ;
  assign out[56] = \<const0> ;
  assign out[55] = \<const0> ;
  assign out[54] = \<const0> ;
  assign out[53] = \<const0> ;
  assign out[52] = \<const0> ;
  assign out[51] = \<const0> ;
  assign out[50] = \<const0> ;
  assign out[49] = \<const0> ;
  assign out[48] = \<const0> ;
  assign out[47:0] = \^out [47:0];
  GND GND
       (.G(\<const0> ));
  GND GND_1
       (.G(GND_2));
  VCC VCC
       (.P(\<const1> ));
  VCC VCC_1
       (.P(VCC_2));
  (* METHODOLOGY_DRC_VIOS = "{SYNTH-13 {cell *THIS*}}" *) 
  DSP48E1 #(
    .ACASCREG(0),
    .ADREG(1),
    .ALUMODEREG(0),
    .AREG(0),
    .AUTORESET_PATDET("NO_RESET"),
    .A_INPUT("DIRECT"),
    .BCASCREG(0),
    .BREG(0),
    .B_INPUT("DIRECT"),
    .CARRYINREG(0),
    .CARRYINSELREG(0),
    .CREG(1),
    .DREG(1),
    .INMODEREG(0),
    .MASK(48'h3FFFFFFFFFFF),
    .MREG(0),
    .OPMODEREG(0),
    .PATTERN(48'h000000000000),
    .PREG(0),
    .SEL_MASK("MASK"),
    .SEL_PATTERN("PATTERN"),
    .USE_DPORT("FALSE"),
    .USE_MULT("MULTIPLY"),
    .USE_PATTERN_DETECT("NO_PATDET"),
    .USE_SIMD("ONE48")) 
    out__0
       (.A({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,b[16:0]}),
        .ACIN({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .ALUMODE({\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .B({\<const0> ,\<const0> ,a}),
        .BCIN({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .C({VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2}),
        .CARRYCASCIN(\<const0> ),
        .CARRYIN(\<const0> ),
        .CARRYINSEL({\<const0> ,\<const0> ,\<const0> }),
        .CEA1(\<const0> ),
        .CEA2(\<const0> ),
        .CEAD(\<const0> ),
        .CEALUMODE(\<const0> ),
        .CEB1(\<const0> ),
        .CEB2(\<const0> ),
        .CEC(\<const0> ),
        .CECARRYIN(\<const0> ),
        .CECTRL(\<const0> ),
        .CED(\<const0> ),
        .CEINMODE(\<const0> ),
        .CEM(\<const0> ),
        .CEP(\<const0> ),
        .CLK(\<const0> ),
        .D({GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2}),
        .INMODE({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .MULTSIGNIN(\<const0> ),
        .OPMODE({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const1> ,\<const0> ,\<const1> }),
        .P({out__0_n_58,out__0_n_59,out__0_n_60,out__0_n_61,out__0_n_62,out__0_n_63,out__0_n_64,out__0_n_65,out__0_n_66,out__0_n_67,out__0_n_68,out__0_n_69,out__0_n_70,out__0_n_71,out__0_n_72,out__0_n_73,out__0_n_74,out__0_n_75,out__0_n_76,out__0_n_77,out__0_n_78,out__0_n_79,out__0_n_80,out__0_n_81,out__0_n_82,out__0_n_83,out__0_n_84,out__0_n_85,out__0_n_86,out__0_n_87,out__0_n_88,\^out [16:0]}),
        .PCIN({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .PCOUT({out__0_n_106,out__0_n_107,out__0_n_108,out__0_n_109,out__0_n_110,out__0_n_111,out__0_n_112,out__0_n_113,out__0_n_114,out__0_n_115,out__0_n_116,out__0_n_117,out__0_n_118,out__0_n_119,out__0_n_120,out__0_n_121,out__0_n_122,out__0_n_123,out__0_n_124,out__0_n_125,out__0_n_126,out__0_n_127,out__0_n_128,out__0_n_129,out__0_n_130,out__0_n_131,out__0_n_132,out__0_n_133,out__0_n_134,out__0_n_135,out__0_n_136,out__0_n_137,out__0_n_138,out__0_n_139,out__0_n_140,out__0_n_141,out__0_n_142,out__0_n_143,out__0_n_144,out__0_n_145,out__0_n_146,out__0_n_147,out__0_n_148,out__0_n_149,out__0_n_150,out__0_n_151,out__0_n_152,out__0_n_153}),
        .RSTA(\<const0> ),
        .RSTALLCARRYIN(\<const0> ),
        .RSTALUMODE(\<const0> ),
        .RSTB(\<const0> ),
        .RSTC(\<const0> ),
        .RSTCTRL(\<const0> ),
        .RSTD(\<const0> ),
        .RSTINMODE(\<const0> ),
        .RSTM(\<const0> ),
        .RSTP(\<const0> ));
  (* METHODOLOGY_DRC_VIOS = "{SYNTH-13 {cell *THIS*}}" *) 
  DSP48E1 #(
    .ACASCREG(0),
    .ADREG(1),
    .ALUMODEREG(0),
    .AREG(0),
    .AUTORESET_PATDET("NO_RESET"),
    .A_INPUT("DIRECT"),
    .BCASCREG(0),
    .BREG(0),
    .B_INPUT("DIRECT"),
    .CARRYINREG(0),
    .CARRYINSELREG(0),
    .CREG(1),
    .DREG(1),
    .INMODEREG(0),
    .MASK(48'h3FFFFFFFFFFF),
    .MREG(0),
    .OPMODEREG(0),
    .PATTERN(48'h000000000000),
    .PREG(0),
    .SEL_MASK("MASK"),
    .SEL_PATTERN("PATTERN"),
    .USE_DPORT("FALSE"),
    .USE_MULT("MULTIPLY"),
    .USE_PATTERN_DETECT("NO_PATDET"),
    .USE_SIMD("ONE48")) 
    out__1
       (.A({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,a}),
        .ACIN({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .ALUMODE({\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .B({\<const0> ,\<const0> ,\<const0> ,b[31:17]}),
        .BCIN({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .C({VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2,VCC_2}),
        .CARRYCASCIN(\<const0> ),
        .CARRYIN(\<const0> ),
        .CARRYINSEL({\<const0> ,\<const0> ,\<const0> }),
        .CEA1(\<const0> ),
        .CEA2(\<const0> ),
        .CEAD(\<const0> ),
        .CEALUMODE(\<const0> ),
        .CEB1(\<const0> ),
        .CEB2(\<const0> ),
        .CEC(\<const0> ),
        .CECARRYIN(\<const0> ),
        .CECTRL(\<const0> ),
        .CED(\<const0> ),
        .CEINMODE(\<const0> ),
        .CEM(\<const0> ),
        .CEP(\<const0> ),
        .CLK(\<const0> ),
        .D({GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2,GND_2}),
        .INMODE({\<const0> ,\<const0> ,\<const0> ,\<const0> ,\<const0> }),
        .MULTSIGNIN(\<const0> ),
        .OPMODE({\<const1> ,\<const0> ,\<const1> ,\<const0> ,\<const1> ,\<const0> ,\<const1> }),
        .P({out__1_n_58,out__1_n_59,out__1_n_60,out__1_n_61,out__1_n_62,out__1_n_63,out__1_n_64,out__1_n_65,out__1_n_66,out__1_n_67,out__1_n_68,out__1_n_69,out__1_n_70,out__1_n_71,out__1_n_72,out__1_n_73,out__1_n_74,\^out [47:17]}),
        .PCIN({out__0_n_106,out__0_n_107,out__0_n_108,out__0_n_109,out__0_n_110,out__0_n_111,out__0_n_112,out__0_n_113,out__0_n_114,out__0_n_115,out__0_n_116,out__0_n_117,out__0_n_118,out__0_n_119,out__0_n_120,out__0_n_121,out__0_n_122,out__0_n_123,out__0_n_124,out__0_n_125,out__0_n_126,out__0_n_127,out__0_n_128,out__0_n_129,out__0_n_130,out__0_n_131,out__0_n_132,out__0_n_133,out__0_n_134,out__0_n_135,out__0_n_136,out__0_n_137,out__0_n_138,out__0_n_139,out__0_n_140,out__0_n_141,out__0_n_142,out__0_n_143,out__0_n_144,out__0_n_145,out__0_n_146,out__0_n_147,out__0_n_148,out__0_n_149,out__0_n_150,out__0_n_151,out__0_n_152,out__0_n_153}),
        .RSTA(\<const0> ),
        .RSTALLCARRYIN(\<const0> ),
        .RSTALUMODE(\<const0> ),
        .RSTB(\<const0> ),
        .RSTC(\<const0> ),
        .RSTCTRL(\<const0> ),
        .RSTD(\<const0> ),
        .RSTINMODE(\<const0> ),
        .RSTM(\<const0> ),
        .RSTP(\<const0> ));
endmodule

First thing to notice is that PCOUT/PCIN are being used.
It's actually just an optimization, I think. It's just the P port. So we could start with just using the output of the P port.

How's it being used? I'm assuming it's beign used as an intermediate output that's then added to another intermediate product to produce the final result.

Ok, seems not all that crazy. a*b0 + a*b1, ish.

@gussmith23
Copy link
Owner Author

2024-08-04

Working on multiply smoketest. Plan is this:

  • Rewrites to split up multiply
  • Rewrite to convert multiply add to a DSP

I'm hoping that's all I need.

Again, there's the optimization using PCIN instead of C. We should get that going ASAP, or at least be thinking about how to capture it.

Here's one way to rewrite:

#lang rosette

(define bw 12)
(define half-bw (/ bw 2))
(define-symbolic a (bitvector half-bw))
(define-symbolic b (bitvector bw))
(define b0 (extract (- half-bw 1) 0 b))
(define b1 (extract (- bw 1) half-bw b))

(define spec (bvmul (zero-extend a (bitvector bw)) b))

(define rewritten
  (bvadd
   (bvmul (zero-extend a (bitvector bw)) (zero-extend b0 (bitvector bw)))
   (bvshl (bvmul (zero-extend a (bitvector bw)) (zero-extend b1 (bitvector bw))) (bv half-bw bw))))

(verify (assert (bveq spec rewritten)))

That's not quite how Vivado implements. I suspect because it's using a different feature of the DSP.

Yeah, see:

             3'b101 : qz_o_mux <= {{17{pcin_in[47]}}, pcin_in[47:17]};

in the DSP model.
That's sign extending the top bits of PCIN. Basically, we don't have to sum the bottom bits of the a*b0 multiplication part, because those bits are the final value.

For now I think my rewriting will also work.

2024-08-11

Something might be up with the

ah, here's the problem: the verilog outputting function needs to take roots as an argument. those things will become the outputs.

Ok, done.

Next thing up: Add rewrite to handle DSP with three inputs.

@gussmith23
Copy link
Owner Author

2024-08-12

Ok so now the issue i'm running into is this:
When extracting a spec for a node, you have to get the spec that the sketch node was originally instantiated for. That is, a node might have a 3-arg MAC or a 2-arg MUL in it. if you instantiated a DSP sketch on the 3-arg MAC, you have to be able to get it back.

  • When you initially rewrite it, can you store the expression that it fired on?
  • When you extract a spec, maybe extract multiple different specs?
  • This also might be an argument for using the indirect style, because we could rewrite the AST node instead of the apply node. i think.

This is a somewhat difficult problem. I don't think it's a dealbreaker, but it will require some thought.

I think it might be best to use the indirect approach here.

@gussmith23
Copy link
Owner Author

2024-08-19

I don't know if the issue from last time makes sense. The alleged problem is that we need to get a spec for a 3-arg DSP sketch, but when we get the spec, it only has 2 args. But how could that be?

No yeah, it does make sense. Because the top-level a*b == (some big expression involving subexpr mapped to c)

Maybe we can just force the choice of the three args to the DSP? will that be enough? I don't think so. Consider the spec that's being currently erroneously extracted:

    (Op2 (Mul) (Op1 (ZeroExtend 32) (Var "a" 16)) (Var "b" 32))

If we forced the choice of a, b, and the expression mapped to c, we could still legally pick all of these nodes, as those other nodes come further down the chain.

I think the quick hack is to use an iteratively refining extraction that biases towards ensuring the specific nodes are extracted.
Then I think the longer term solution is to use the indirect approach, so we can match on ASTs exactly. We may need to think that through a bit more b/c it might not actually be true.
The other option is to put in a primitive to add the expression to a queue after matching it. but even then I don't know if that's possible, because I'm not sure if you get the full match.

Algorithm for iteratively extracting:
Map nodes to the number of desired-to-be-extracted nodes that they contain. This number should be strictly increasing (if there are no loops, but unclear what will happen with loops.)
While there are updates to be dealt with (always run once):
We need to recalculate those values. For each eclass, re-choose based on new numbers. For each node, update based on new eclass choices.

Ok, that seems to be working. I'm now getting all three modules instantiated in the egraph. Now just struggling with extraction.

what's wrong in extraction? It's saying wires aren't yet implemented.

I swear I encountered this already: namely that we do need to support extracting to wires, because sometimes wires are placeholders for outputs. When did we encounter this? Why isn't it fixed?

Hm, I'm not actually sure that yet applies here though. I don't think we should be extracting wires.

I think it might have to do with how we convert the expression back to Churchroad commands?

    ; inputs
    (IsPort "" "a" (Input) v22-1724085807299992000)
    (union v22-1724085807299992000 (Wire "v0-1724085797870886000" 16))
    (IsPort "" "b" (Input) v23-1724085807299992000)
    (union v23-1724085807299992000 (Op1 (Extract 15 0) (Wire "v1-1724085797870886000" 32)))
    (IsPort "" "c" (Input) v24-1724085807299992000)
    (union v24-1724085807299992000 (Op2 (Shl) (Op2 (Mul) (Op1 (ZeroExtend 32) (Wire "v0-1724085797870886000" 16)) (Op1 (ZeroExtend 32) (Op1 (Extract 31 16) (Wire "v1-1724085797870886000" 32)))) (Op0 (BV 16 32))))

Seems like the A input is a wire when it shouldn't be.

Is there nothing else in that eclass? how?

In the meeting, Zach was confused about the whole spec extraction problem. Fundamentally, he was confused as to why we needed a free variable.

Here's an attempt at a clean explanation.

The top level problem looks ROUGHLY (ignoring shifts) like

a (16 bits) * b (32 bits) == a*bhi + a*blo

Which maps like this:

DSP1(a, bhi, DSP0(a, blo))

where DSP0 is programmed to implement 2-input mul and DSP1 is programmed to implement 3-input multiply accumulate.

Synthesizing configurations for DSP0 and DSP1 such that

a * b == DSP1(a, bhi, DSP0(a, blo))

is where we run into solver issues w/ multiplication. Thus, our strategy is to rewrite the LHS:

a * b == (roughly) a*bhi + a*blo == DSP1(a, bhi, DSP0(a, blo))

Now, instead of sending the full a * b query above, we can send two queries:
query 1 will be:

a*blo == DSP0(a, blo)

But now, what should query 2 be?

Option 1 is this:

a * b == DSP1(a, bhi, a*blo)

Essentially, ignore the rewritten LHS and use the original a * b spec. However, this should run into the same solver issues w/r/t reasoning about multiplication. I haven't tested it, though, so I can't say for certain.

Option 2 is this:

a*bhi + a*blo == DSP1(a, bhi, a*blo)

That is, use the rewritten LHS.

Lastly, option 3 is this:

a*bhi + c == DSP1(a, bhi, c)

Specifically, take option 2, and rewrite the a*blo expression (which is the same on both sides) into a fresh variable, c.

In the meeting on Monday, I stated that, when calling Lakeroad to configure DSP1, we need to force the extraction of a spec looking like the LHS of Option 2 or Option 3. Zach's question in meeting was, essentially, why can't the spec look like a * b? I failed to answer clearly, but I'm hoping this makes the reason more clear: it's because, if we used a * b as our spec, we wouldn't sidestep the reasoning-about-multiplication issue that is the whole reason we're doing egraph rewriting in the first place!

There's another question, about whether to use Option 2 or Option 3 -- I only really realized Option 2 was an option while typing this up. I'll give it a try.

@gussmith23
Copy link
Owner Author

2024-08-21

Coming back to the problem at hand. Input a is getting unioned with a wire. Why? And is nothing else in that eclass with the wire?

How do we pick what a, b, c are unioned with?

The problem is a familiar one in the end.

we have these things defined:

    ; P_0[31:0]
    (let v2-1724272697812159000 (Wire "v2-1724272697812159000" 32))
    ; P_0[47:32]
    (let v3-1724272697812159000 (Wire "v3-1724272697812159000" 16))

Then later we do:

    ; { \P_0[47:32] \P_0[31:0] }
    (let v9-1724272697812159000 (Op2 (Concat) v2-1724272697812159000 v3-1724272697812159000))

I thought this was the issue we fixed by splicing.

We could potentially fix this with rewrites, too.

I am trying out the rewrites, but these introduce a cycle that then leads to a stack overflow.

I really feel like there should be a way to fix this via Yosys.

Can I get the problematic Verilog?

Posted a question here: YosysHQ/yosys#4557

@gussmith23
Copy link
Owner Author

2024-08-25

The Yosys folks told me in a meeting that there wasn't a pass to do what I'm asking for.

Talking to them, though, I realized I could probably do it myself.

I think the thought was this: in the churchroad backend, just compile concatenations differently. Always give them a new name.

Can I describe the problem really simply?

In Verilog, you can assign to a concatenation: assign {w0, w1} = ....
When we translate this to egglog, it results in unioning the RHS expression with the concatenation of the w0 and w1 eclasses. However, this doesn't actually have any effect on w0 and w1 themselves, unless we include a rewrite that says that, if x = something and x = {w0,w1}, then w0 = extract(something) and w1 = extract(something). If we then add that rewrite in, we get cycles in the egraph, because if you want to extract w0 e.g., it can be written in terms of itself e.g. w0 = extract({w0, w1}) = extract({extract({w0, w1}), w1}) = ...

Interestingly, we've extracted an infinite loop on the structural Verilog we extract.
Oh, stupidly, it's in a debug message, too. So this might literally go away if we just get rid of the debug message.

OK, we're getting a final extracted expression, but it's a bit weird:

    module top(
      
      input [32-1:0] b,
      
      output [32-1:0] out,
      output [32-1:0] out,
      output [32-1:0] out,
    );
      assign out = wire_57;
      assign out = wire_335;
      assign out = wire_414;
      logic [32-1:0] wire_414 = wire_499[31:0];
      logic [48-1:0] wire_499;
      localparam [5*8:0] wire_181 = "XOR12";
      localparam [5*8:0] wire_179 = "FALSE";
      localparam [5*8:0] wire_177 = "ONE48";
      localparam [9*8:0] wire_175 = "NO_PATDET";
      localparam [8*8:0] wire_173 = "MULTIPLY";
      localparam [7*8:0] wire_171 = "PATTERN";
      localparam [4*8:0] wire_169 = "MASK";
      localparam [1*8:0] wire_167 = "A";
      localparam [9-1:0] wire_165 = 9'd0;
      localparam [32-1:0] wire_456 = 32'd1;
      localparam [1*8:0] wire_161 = "B";
      localparam [6*8:0] wire_159 = "DIRECT";
      localparam [5*8:0] wire_157 = "RESET";
      localparam [8*8:0] wire_155 = "NO_RESET";
      localparam [2*8:0] wire_153 = "AD";
      localparam [32-1:0] wire_151 = 32'd0;
      localparam [48-1:0] wire_453 = 48'd0;
      localparam [9-1:0] wire_450 = 9'd309;
      localparam [5-1:0] wire_449 = 5'd0;
      logic [27-1:0] wire_400 = wire_412[26:0];
      logic [32-1:0] wire_412 = wire_396[31:0];
      logic [33-1:0] wire_396 = wire_398[32:0];
      logic [34-1:0] wire_398 = wire_364[33:0];
      logic [35-1:0] wire_364 = wire_366[34:0];
      logic [36-1:0] wire_366 = wire_368[35:0];
      logic [37-1:0] wire_368 = wire_370[36:0];
      logic [38-1:0] wire_370 = wire_372[37:0];
      logic [39-1:0] wire_372 = wire_374[38:0];
      logic [40-1:0] wire_374 = wire_376[39:0];
      logic [41-1:0] wire_376 = wire_378[40:0];
      logic [42-1:0] wire_378 = wire_380[41:0];
      logic [43-1:0] wire_380 = wire_382[42:0];
      logic [44-1:0] wire_382 = wire_384[43:0];
      logic [45-1:0] wire_384 = wire_386[44:0];
      logic [46-1:0] wire_386 = wire_388[45:0];
      logic [47-1:0] wire_388 = wire_390[46:0];
      localparam [1-1:0] wire_448 = 1'd1;
      localparam [3-1:0] wire_447 = 3'd6;
      localparam [1-1:0] wire_446 = 1'd0;
      logic [48-1:0] wire_390 = { wire_402, wire_388 };
      logic [1-1:0] wire_402 = wire_368[36:36];
      localparam [18-1:0] wire_445 = 18'd0;
      logic [18-1:0] wire_392 = { wire_429, wire_410 };
      logic [16-1:0] wire_410 = wire_392[15:0];
      localparam [2-1:0] wire_429 = 2'd0;
      localparam [4-1:0] wire_444 = 4'd0;
      localparam [30-1:0] wire_443 = 30'd0;
      logic [30-1:0] wire_394 = { wire_431, wire_408 };
      logic [16-1:0] wire_408 = wire_394[15:0];
      localparam [14-1:0] wire_431 = 14'd0;
      logic [32-1:0] wire_335 = wire_360[31:0];
      logic [48-1:0] wire_360;
      localparam [9-1:0] wire_353 = 9'd53;
      localparam [27-1:0] wire_351 = 27'd0;
      logic [18-1:0] wire_323 = { wire_429, wire_333 };
      logic [16-1:0] wire_333 = wire_13[31:16];
      logic [32-1:0] wire_13 = b;
      localparam [4-1:0] wire_345 = 4'd1;
      logic [32-1:0] wire_57 = wire_306[31:0];
      logic [48-1:0] wire_306;
      
    
      DSP48E2 #(
        .USE_MULT(wire_173),
        .AUTORESET_PATDET(wire_155),
        .IS_RSTINMODE_INVERTED(wire_446),
        .PREADDINSEL(wire_167),
        .IS_RSTM_INVERTED(wire_446),
        .SEL_PATTERN(wire_171),
        .IS_RSTB_INVERTED(wire_446),
        .IS_CLK_INVERTED(wire_446),
        .BMULTSEL(wire_161),
        .OPMODEREG(wire_151),
        .ACASCREG(wire_151),
        .INMODEREG(wire_151),
        .USE_PATTERN_DETECT(wire_175),
        .IS_CARRYIN_INVERTED(wire_446),
        .CARRYINSELREG(wire_151),
        .MASK(wire_453),
        .IS_OPMODE_INVERTED(wire_165),
        .IS_RSTALUMODE_INVERTED(wire_446),
        .IS_RSTP_INVERTED(wire_446),
        .A_INPUT(wire_159),
        .USE_WIDEXOR(wire_179),
        .IS_RSTC_INVERTED(wire_446),
        .ADREG(wire_151),
        .DREG(wire_151),
        .ALUMODEREG(wire_151),
        .BCASCREG(wire_151),
        .IS_INMODE_INVERTED(wire_449),
        .IS_RSTD_INVERTED(wire_446),
        .IS_ALUMODE_INVERTED(wire_444),
        .RND(wire_453),
        .MREG(wire_151),
        .CARRYINREG(wire_151),
        .AUTORESET_PRIORITY(wire_157),
        .CREG(wire_151),
        .IS_RSTALLCARRYIN_INVERTED(wire_446),
        .AREG(wire_151),
        .BREG(wire_151),
        .IS_RSTA_INVERTED(wire_446),
        .PATTERN(wire_453),
        .PREG(wire_151),
        .IS_RSTCTRL_INVERTED(wire_446),
        .SEL_MASK(wire_169),
        .B_INPUT(wire_159),
        .USE_SIMD(wire_177),
        .XORSIMD(wire_181),
        .AMULTSEL(wire_153)
    ) module_354 (
        .A(wire_394),
        .ACIN(wire_443),
        .ALUMODE(wire_345),
        .B(wire_323),
        .BCIN(wire_445),
        .C(wire_453),
        .CARRYCASCIN(wire_446),
        .CARRYIN(wire_446),
        .CARRYINSEL(wire_447),
        .CEA1(wire_448),
        .CEA2(wire_448),
        .CEAD(wire_448),
        .CEALUMODE(wire_448),
        .CEB1(wire_448),
        .CEB2(wire_448),
        .CEC(wire_448),
        .CECARRYIN(wire_448),
        .CECTRL(wire_448),
        .CED(wire_448),
        .CEINMODE(wire_448),
        .CEM(wire_448),
        .CEP(wire_448),
        .CLK(wire_446),
        .D(wire_351),
        .INMODE(wire_449),
        .MULTSIGNIN(wire_446),
        .OPMODE(wire_353),
        .PCIN(wire_453),
        .RSTA(wire_446),
        .RSTALLCARRYIN(wire_446),
        .RSTALUMODE(wire_446),
        .RSTB(wire_446),
        .RSTC(wire_446),
        .RSTCTRL(wire_446),
        .RSTD(wire_446),
        .RSTINMODE(wire_446),
        .RSTM(wire_446),
        .RSTP(wire_446),
        .P(wire_360));
      DSP48E2 #(
        .MASK(wire_453),
        .MREG(wire_151),
        .ADREG(wire_151),
        .IS_RSTCTRL_INVERTED(wire_446),
        .ALUMODEREG(wire_151),
        .PREADDINSEL(wire_167),
        .ACASCREG(wire_151),
        .BCASCREG(wire_151),
        .BMULTSEL(wire_161),
        .IS_RSTINMODE_INVERTED(wire_446),
        .BREG(wire_151),
        .SEL_MASK(wire_169),
        .SEL_PATTERN(wire_171),
        .CREG(wire_151),
        .USE_PATTERN_DETECT(wire_175),
        .AMULTSEL(wire_153),
        .USE_SIMD(wire_177),
        .AREG(wire_151),
        .IS_ALUMODE_INVERTED(wire_444),
        .IS_OPMODE_INVERTED(wire_165),
        .IS_RSTC_INVERTED(wire_446),
        .PREG(wire_151),
        .IS_RSTB_INVERTED(wire_446),
        .CARRYINREG(wire_151),
        .RND(wire_453),
        .B_INPUT(wire_159),
        .IS_INMODE_INVERTED(wire_449),
        .IS_RSTP_INVERTED(wire_446),
        .USE_WIDEXOR(wire_179),
        .CARRYINSELREG(wire_151),
        .A_INPUT(wire_159),
        .DREG(wire_151),
        .IS_RSTM_INVERTED(wire_446),
        .IS_CARRYIN_INVERTED(wire_446),
        .INMODEREG(wire_151),
        .OPMODEREG(wire_151),
        .USE_MULT(wire_173),
        .IS_RSTD_INVERTED(wire_446),
        .XORSIMD(wire_181),
        .AUTORESET_PRIORITY(wire_157),
        .IS_CLK_INVERTED(wire_446),
        .AUTORESET_PATDET(wire_155),
        .IS_RSTA_INVERTED(wire_446),
        .PATTERN(wire_453),
        .IS_RSTALLCARRYIN_INVERTED(wire_446),
        .IS_RSTALUMODE_INVERTED(wire_446)
    ) module_102 (
        .A(wire_394),
        .ACIN(wire_443),
        .ALUMODE(wire_345),
        .B(wire_392),
        .BCIN(wire_445),
        .C(wire_453),
        .CARRYCASCIN(wire_446),
        .CARRYIN(wire_446),
        .CARRYINSEL(wire_447),
        .CEA1(wire_448),
        .CEA2(wire_448),
        .CEAD(wire_448),
        .CEALUMODE(wire_448),
        .CEB1(wire_448),
        .CEB2(wire_448),
        .CEC(wire_448),
        .CECARRYIN(wire_448),
        .CECTRL(wire_448),
        .CED(wire_448),
        .CEINMODE(wire_448),
        .CEM(wire_448),
        .CEP(wire_448),
        .CLK(wire_446),
        .D(wire_351),
        .INMODE(wire_449),
        .MULTSIGNIN(wire_446),
        .OPMODE(wire_353),
        .PCIN(wire_453),
        .RSTA(wire_446),
        .RSTALLCARRYIN(wire_446),
        .RSTALUMODE(wire_446),
        .RSTB(wire_446),
        .RSTC(wire_446),
        .RSTCTRL(wire_446),
        .RSTD(wire_446),
        .RSTINMODE(wire_446),
        .RSTM(wire_446),
        .RSTP(wire_446),
        .P(wire_306));
      DSP48E2 #(
        .IS_RSTB_INVERTED(wire_446),
        .IS_RSTCTRL_INVERTED(wire_446),
        .ALUMODEREG(wire_151),
        .IS_RSTC_INVERTED(wire_446),
        .IS_RSTINMODE_INVERTED(wire_446),
        .CARRYINSELREG(wire_456),
        .RND(wire_453),
        .PATTERN(wire_453),
        .XORSIMD(wire_181),
        .IS_RSTM_INVERTED(wire_446),
        .ADREG(wire_151),
        .A_INPUT(wire_159),
        .BREG(wire_151),
        .DREG(wire_151),
        .AUTORESET_PRIORITY(wire_157),
        .ACASCREG(wire_151),
        .CREG(wire_151),
        .IS_RSTALLCARRYIN_INVERTED(wire_446),
        .IS_RSTP_INVERTED(wire_446),
        .MASK(wire_453),
        .IS_INMODE_INVERTED(wire_449),
        .MREG(wire_151),
        .IS_RSTA_INVERTED(wire_446),
        .BCASCREG(wire_151),
        .IS_RSTALUMODE_INVERTED(wire_446),
        .B_INPUT(wire_159),
        .OPMODEREG(wire_151),
        .PREADDINSEL(wire_167),
        .SEL_PATTERN(wire_171),
        .USE_PATTERN_DETECT(wire_175),
        .USE_MULT(wire_173),
        .CARRYINREG(wire_151),
        .BMULTSEL(wire_161),
        .USE_SIMD(wire_177),
        .INMODEREG(wire_151),
        .IS_CLK_INVERTED(wire_446),
        .SEL_MASK(wire_169),
        .IS_RSTD_INVERTED(wire_446),
        .PREG(wire_151),
        .IS_ALUMODE_INVERTED(wire_444),
        .AREG(wire_151),
        .USE_WIDEXOR(wire_179),
        .IS_CARRYIN_INVERTED(wire_446),
        .AMULTSEL(wire_153),
        .AUTORESET_PATDET(wire_155),
        .IS_OPMODE_INVERTED(wire_165)
    ) module_454 (
        .A(wire_394),
        .ACIN(wire_443),
        .ALUMODE(wire_444),
        .B(wire_392),
        .BCIN(wire_445),
        .C(wire_390),
        .CARRYCASCIN(wire_446),
        .CARRYIN(wire_446),
        .CARRYINSEL(wire_447),
        .CEA1(wire_448),
        .CEA2(wire_448),
        .CEAD(wire_448),
        .CEALUMODE(wire_448),
        .CEB1(wire_448),
        .CEB2(wire_448),
        .CEC(wire_448),
        .CECARRYIN(wire_448),
        .CECTRL(wire_448),
        .CED(wire_448),
        .CEINMODE(wire_448),
        .CEM(wire_448),
        .CEP(wire_448),
        .CLK(wire_446),
        .D(wire_400),
        .INMODE(wire_449),
        .MULTSIGNIN(wire_446),
        .OPMODE(wire_450),
        .PCIN(wire_453),
        .RSTA(wire_446),
        .RSTALLCARRYIN(wire_446),
        .RSTALUMODE(wire_446),
        .RSTB(wire_446),
        .RSTC(wire_446),
        .RSTCTRL(wire_446),
        .RSTD(wire_446),
        .RSTINMODE(wire_446),
        .RSTM(wire_446),
        .RSTP(wire_446),
        .P(wire_499));
    endmodule

Why are there three DSPs? I think this is doable with two. That question is a little less important to me right now, we can worry about finding more optimal designs once it's correct.

Why are there three outputs?

Where is our input a?

Okay so now we're extracting cycles, which isn't a problem until you realize the generated Verilog is useless. So we should figure out how to prevent cycles.

One thought: soemtimes shr/shls are ok in structural when they're shifts by constants. How do we encode that into the extractor? One thing we cdould do would be to rewrite shifts by constants.

Temporary solution is to compile shls, but assert that they are shls by constants. Taht's a hacky but conservative soln. because the compiler shouldn't care whether they're shls by constants, but i care right now b/c i don't want to start accidentally allowing shifts into our structural verilog

working now! Woohoo! working on an even wider mul now.

assume all multiplies are full-size, i.e. nbit*mbit = n+m bit

a * b = {ahi,alo}*b = (ahi*b)<<n + alo*b

similar for b, so if we apply both rules we get

a * b = {ahi,alo}*b = (ahi*b)<<n + alo*b
      = (ahi*{bhi,blo})<<n + alo*{bhi,blo}
      = ( (ahi*bhi)<<n + ahi*blo)<<n + (alo*bhi)<<n + alo*blo
      = (ahi*bhi)<<2n + (ahi*blo)<<n + (alo*bhi)<<n + alo*blo

so that's two multiplication rules (one for each side) plus a rule that distributes a shift plus a rule that simplifies shifts.

@gussmith23
Copy link
Owner Author

gussmith23 commented Aug 28, 2024

2024-08-26

Thinking about how to implement even wider mul. The interesting thing is that our representation of multiply preserves width. In the DSP we're really aiming to implement n*n->2n, but we have n*n->n as our representation. This is annoying for us because it means we need our DSP rewrites to bake in zero/sign extension somehow, because we're specifically looking for 16x16->32-ish things.

Can we just ignore the fact that our muls are n*n->2n? Can we just write our rewrites over n*n->n type muls, and maybe have a few rewrites that make things work?

How do we rewrite n*n->n muls?

I think it's similar to what we had before.

a:n * b:n 
  = {ahi:(n/2),alo:(n/2)}*b:n
  = (ahi:n*b:n)<<(n/2) + alo:n*b:n

Note that we're actually adding in the zero/sign extensions here, when we cast ahi/alo (which are n/2 bits) up to n bits. In our case, if a and b are already zero-extended up to n bits (e.g. a and b are zero-extended from 16 bit numbers because we're trying to do a full-width multiply), then we know the upper bits of both are zero, and an analysis can then delete the multiply.

If we have a rewrite doing the same for the second argument, then we should eventually get only multiplies which are sign- or zero-extended, which we can then simplify.

That is, if we then do the equivalent second-arg rewrite:

a:n * b:n 
  = {ahi:(n/2),alo:(n/2)}*b:n
  = (n'(ahi:(n/2))*b:n)<<(n/2) + n'(alo:(n/2))*b:n
  = (n'(ahi:(n/2))*{bhi,blo}:n)<<(n/2) + n'(alo:(n/2))*b:n
  = ((n'(ahi:(n/2))*n'(bhi))<<(n/2) +  n'(ahi)*n'(blo))<<(n/2) + n'(alo:(n/2))*b:n
  = ((n'(ahi:(n/2))*n'(bhi))<<(n/2) +  n'(ahi)*n'(blo))<<(n/2) 
      + (n'(alo)*n'(bhi))<<(n/2) + n'(alo)*n'(blo)
  = (n'(ahi:(n/2))*n'(bhi))<<n + (n'(ahi)*n'(blo))<<(n/2) 
      + (n'(alo)*n'(bhi))<<(n/2) + n'(alo)*n'(blo)

There's a quick and dirty way to implement this: just implement this exact rewrite as-is. Easy!

2024-08-27

Every time I go to start implementing this, it feels a bit more complicated than expected.

Also, random thought: does SMTLIB choose to use nxn=n (rather than nxn=2n) multiplication so that they don't have to worry about signedness? maybe not. i don't think you need to care about signedness if the two inputs are the same length, right?

gussmith23 added a commit that referenced this issue Aug 28, 2024
@gussmith23
Copy link
Owner Author

2024-09-01

Need to just get this transformation done. So how do we do it?

I wrote before (plus annotations i'm writing rn):

a:n * b:n 
  = {ahi:(n/2),alo:(n/2)}*b:n
  = (n'(ahi:(n/2))*b:n)<<(n/2) + n'(alo:(n/2))*b:n
  (above two steps by mul splitting rewrite on first arg)
  = (n'(ahi:(n/2))*{bhi,blo}:n)<<(n/2) + n'(alo:(n/2))*b:n
  = ((n'(ahi:(n/2))*n'(bhi))<<(n/2) +  n'(ahi)*n'(blo))<<(n/2) + n'(alo:(n/2))*b:n
  (above two steps by mul splitting rewrite on second arg in parens)
  = ((n'(ahi:(n/2))*n'(bhi))<<(n/2) +  n'(ahi)*n'(blo))<<(n/2) 
      + (n'(alo)*n'(bhi))<<(n/2) + n'(alo)*n'(blo)
  = (n'(ahi:(n/2))*n'(bhi))<<n + (n'(ahi)*n'(blo))<<(n/2) 
      + (n'(alo)*n'(bhi))<<(n/2) + n'(alo)*n'(blo)
  (above two steps by mul splitting on remaining `b`, plus distribution of shift)

So primarily we need to implement

  • Mul splitting on left and right
  • Shift distribution
  • Analyses to detect "real" bitwidth

2024-09-02

Made progress yesterday and today. Getting to the point where I can finally make at least one call out to Lakeroad on the wide multiply example. However, now I'm running into an issue: the spec looks a bit odd. Not odd, maybe, but not what we want.

[2024-09-02T16:22:47Z INFO  churchroad] Calling Lakeroad with spec:
    (Op2 (Mul) (Op1 (ZeroExtend 64) (Op1 (Extract 31 16) (Op1 (ZeroExtend 64) (Var "a" 32)))) (Op1 (ZeroExtend 64) (Op1 (Extract 15 0) (Op1 (ZeroExtend 64) (Var "b" 32)))))
    and sketch:
    PrimitiveInterfaceDSP
[2024-09-02T16:22:47Z DEBUG churchroad] a expr: (Op1 (ZeroExtend 64) (Op1 (Extract 31 16) (Op1 (ZeroExtend 64) (Var "a" 32))))
[2024-09-02T16:22:47Z DEBUG churchroad] b expr: (Op1 (ZeroExtend 64) (Op1 (Extract 15 0) (Op1 (ZeroExtend 64) (Var "b" 32))))

Seems to me like the problem is that the inputs a and b are too wide.

I fixed that by putting an extract on top of them when instantiating the PrimitiveInterfaceDSP.

Now the problem is that the output is too wide (64 bits).

I feel like that might be due to our rewrite being overly aggressive about keeping bitwidths wide. Like, when we split up the mult, we should absolutely be able to shrink the overall bitwidth eventually instead of keeping it at 64.

~~

some time later...

I've gotten pretty far now. The issue I'm running into now is that the ensure extract extractor can fail---it can potentially not find ways to extract the desired subexprs, seemingly. I gotta double chekc that that's what's actually happening.

Unclear what the issue is. Seems like the nodes I'm trying to force extraction of are somehow not reachable from any expression nodes. They're children of HasType/RealBitwidth/DSP nodes, but not children of any other nodes somehow. Disabling subsumes didn't fix it.

Okay, everything's working okay -- next need to implement mapping to DSP3, as the Add in the top level node is still unimplemented.

2024-09-03

So close. One thing that's weird, though, is that our

Okay, finally got Churchroad calling out to Lakeroad for a 3-arg DSP. Synth problem is timing out. Here's the generated file (with command at the top)

// /Users/gus/lakeroad/bin/main.rkt --architecture xilinx-ultrascale-plus --verilog-module-filepath tmp.v --top-module-name top --verilog-module-out-signal out:48 --input-signal a:16 --input-signal b:16 --template dsp --pipeline-depth 0 --out-format verilog --timeout 120 --input-signal c:48

module top(
  
  input [16-1:0] a,
  input [16-1:0] b,
  input [48-1:0] c,
  
  output [48-1:0] out,
);
  assign out = wire_Expr_84;
  logic [48-1:0] wire_Expr_84 = (wire_Expr_82+wire_Expr_83);
  logic [48-1:0] wire_Expr_83 = 48'(wire_Expr_115);
  logic [48-1:0] wire_Expr_115 = c;
  logic [48-1:0] wire_Expr_82 = wire_Expr_57[47:0];
  logic [64-1:0] wire_Expr_57 = (wire_Expr_48*wire_Expr_34);
  logic [64-1:0] wire_Expr_34 = 64'(wire_Expr_112);
  logic [16-1:0] wire_Expr_112 = b;
  logic [64-1:0] wire_Expr_48 = 64'(wire_Expr_110);
  logic [16-1:0] wire_Expr_110 = a;
endmodule

I'm wondering if it's the wide mul in there that's the problem.

Interesting, if i shrink it down to 16 bits (i.e. all signals 16 bits) it still times out on bitwuzla, but yices gets it.

So what bitwidth here is the problem? the add bitwidth? the mul bitwidth?

This still times out:

/*
/Users/gus/lakeroad/bin/lakeroad-portfolio.py --cvc5 --bitwuzla --yices  --architecture xilinx-ultrascale-plus --verilog-module-filepath tmp.v --top-module-name top --verilog-module-out-signal out:48 --input-signal a:16 --input-signal b:16 --template dsp --pipeline-depth 0 --out-format verilog --timeout 120 --input-signal c:48
 */

module top(
  
  input [16-1:0] a,
  input [16-1:0] b,
  input [48-1:0] c,
  
  output [48-1:0] out
);
  assign out = a*b + c;
endmodule

But when output and c are 40 bits, it doesn't.

Terminates at 45. Seems to spin at 46.

Weird -- is this really just the limit of the solvers? Or is there something more complicated (or less complicated, more stupid) happening?

Regardless, I think it's probably time to use the internal shift feature inside the DSP, rather than trying to use a full wide add.

Next thing to do is to add a unit test to Lakeraod to see if we can use the shift feature.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant