diff --git a/software/glasgow/applet/interface/qspi_controller/__init__.py b/software/glasgow/applet/interface/qspi_controller/__init__.py index 4fcdca35e..a5f7729ef 100644 --- a/software/glasgow/applet/interface/qspi_controller/__init__.py +++ b/software/glasgow/applet/interface/qspi_controller/__init__.py @@ -32,8 +32,10 @@ def elaborate(self, platform): m = Module() m.submodules.qspi = qspi = QSPIController(self._ports, use_ddr_buffers=True, - sample_delay_half_clocks = self._sample_delay_half_clocks) + max_sample_delay_half_clocks=self._sample_delay_half_clocks, + min_divisor=self._divisor) m.d.comb += qspi.divisor.eq(self._divisor) + m.d.comb += qspi.sample_delay_half_clocks.eq(self._sample_delay_half_clocks) o_fifo = self._out_fifo.stream i_fifo = self._in_fifo.stream diff --git a/software/glasgow/gateware/iostream.py b/software/glasgow/gateware/iostream.py index fb579f10c..b4d431996 100644 --- a/software/glasgow/gateware/iostream.py +++ b/software/glasgow/gateware/iostream.py @@ -5,7 +5,7 @@ from glasgow.gateware.ports import PortGroup -__all__ = ["IOStreamer"] +__all__ = ["IOStreamerTop"] def _filter_ioshape(direction, ioshape): @@ -65,71 +65,91 @@ def elaborate(self, platform): return m +def LaneLayout(actual_layout, /, *, meta_layout=0): + return data.StructLayout({ + "actual": actual_layout, + "meta": meta_layout, + }) -class IOStreamer(wiring.Component): - """I/O buffer to stream adapter. +def MetaLayoutWithTag(*, tag_layout, meta_layout=0): + return data.StructLayout({ + "inner_meta": meta_layout, + "tag": tag_layout, + "last": 1, + }) - This adapter instantiates I/O buffers for a port (FF or DDR) and connects them to a pair of - streams, one for the outputs of the buffers and one for the inputs. Whenever an `o_stream` - transfer occurs, the state of the output is updated _t1_ cycles later; if `o_stream.p.i_en` - is set, then _t2_ cycles later, a payload with the data captured at the same time as - the outputs were updated appears on `i_stream.p.i`. 
+def IOOutputActualLayout(ioshape): + return data.StructLayout({ + "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({ + "o": width, + "oe": 1, + })), + "i_en": 1, + }) - Arbitrary ancillary data may be provided with `o_stream` transfers via `o_stream.p.meta`, - and this data will be relayed back as `i_stream.p.meta` with the output-to-input latency - of the buffer. Higher-level protocol engines can use this data to indicate how the inputs - must be processed without needing counters or state machines on a higher level to match - the latency (and, usually, without needing any knowledge of the latency at all). +def IOOutputStreamSignature(ioshape, /, lane_count=2, *, meta_layout=0): + actual_layout = IOOutputActualLayout(ioshape) + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=meta_layout), + lane_count + ) + ) - On reset, output ports have their drivers enabled, and bidirectional ports have them disabled. - All of the signals are deasserted, which could be a low or a high level depending on the port - polarity. 
- """ +def IOInputActualLayout(ioshape): + return data.StructLayout({ + "port": _map_ioshape("i", ioshape, lambda width: data.StructLayout({ + "i": width, + })), + "i_valid": 1, + }) - @staticmethod - def o_stream_signature(ioshape, /, *, ratio=1, meta_layout=0): - return stream.Signature(data.StructLayout({ - "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({ - "o": width if ratio == 1 else data.ArrayLayout(width, ratio), - "oe": 1, - })), - "i_en": 1, - "meta": meta_layout, - })) +def IOInputStreamSignature(ioshape, /, lane_count=2, *, meta_layout=0): + actual_layout = IOInputActualLayout(ioshape) + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=meta_layout), + lane_count + ) + ) - @staticmethod - def i_stream_signature(ioshape, /, *, ratio=1, meta_layout=0): - return stream.Signature(data.StructLayout({ - "port": _map_ioshape("i", ioshape, lambda width: data.StructLayout({ - "i": width if ratio == 1 else data.ArrayLayout(width, ratio), - })), - "meta": meta_layout, - })) - - def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0, sample_delay_half_clocks=0): - if ratio == 1: - assert (sample_delay_half_clocks % 2) == 0 +class IOStreamer(wiring.Component): + def __init__(self, ioshape, ports, /, *, ratio=1, meta_layout=0): assert isinstance(ioshape, (int, dict)) assert ratio in (1, 2) self._ioshape = ioshape - self._ports = ports self._ratio = ratio - self._init = init - self._sample_delay_half_clocks = sample_delay_half_clocks + self._ports = ports super().__init__({ - "o_stream": In(self.o_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)), - "i_stream": Out(self.i_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)), + "o_stream": In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), + "i_stream": Out(IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), }) + self.o_stream.valid = Const(1) + self.o_stream.ready 
= Const(1) + for lane_index in range(ratio): + self.o_stream.p[lane_index].actual.i_en = Const(1) # Must always sample! + # self.i_stream.valid = Const(1) # i_stream is not really valid for the first `latency` cycles after reset + self.i_stream.ready = Const(1) + + def get_latency(self, platform): + # May be platform-dependent in the future + if self._ratio == 1: + return 1 + if self._ratio == 2: + return 2 + def elaborate(self, platform): m = Module() if self._ratio == 1: - buffer_cls, latency = io.FFBuffer, 1 + self._sample_delay_half_clocks // 2 + buffer_cls = io.FFBuffer if self._ratio == 2: - buffer_cls, latency = SimulatableDDRBuffer, 2 + (self._sample_delay_half_clocks // 2) + (self._sample_delay_half_clocks % 2) + buffer_cls = SimulatableDDRBuffer + + latency = self.get_latency(platform) if isinstance(self._ports, io.PortLike): m.submodules.buffer = buffer = buffer_cls("io", self._ports) @@ -139,30 +159,17 @@ def elaborate(self, platform): direction, _width = self._ioshape[name] m.submodules[f"buffer_{name}"] = buffer[name] = buffer_cls(direction, sub_port) - o_latch = Signal(_map_ioshape("o", self._ioshape, lambda width: data.StructLayout({ - "o": width, - "oe": 1, - })), init=self._init) - with m.If(self.o_stream.valid & self.o_stream.ready): + for lane_index in range(self._ratio): for _, buffer_parts, stream_parts in _iter_ioshape("o", self._ioshape, - buffer, self.o_stream.p.port): - m.d.comb += buffer_parts.o.eq(stream_parts.o) - m.d.comb += buffer_parts.oe.eq(stream_parts.oe) - for _, latch_parts, stream_parts in _iter_ioshape("o", self._ioshape, - o_latch, self.o_stream.p.port): - if self._ratio == 1: - m.d.sync += latch_parts.o.eq(stream_parts.o) - else: - m.d.sync += latch_parts.o.eq(stream_parts.o[-1]) - m.d.sync += latch_parts.oe.eq(stream_parts.oe) - with m.Else(): - for _, buffer_parts, latch_parts in _iter_ioshape("o", self._ioshape, - buffer, o_latch): - if self._ratio == 1: - m.d.comb += buffer_parts.o.eq(latch_parts.o) - else: - m.d.comb += 
buffer_parts.o.eq(latch_parts.o.replicate(self._ratio)) - m.d.comb += buffer_parts.oe.eq(latch_parts.oe) + buffer, self.o_stream.p[lane_index].actual.port): + m.d.comb += buffer_parts.o[lane_index].eq(stream_parts.o) + + for name, buffer_parts in _iter_ioshape("o", self._ioshape, + buffer): + oe_any = self.o_stream.p[0].actual.port[name].oe + for lane_index in range(1, self._ratio): + oe_any |= self.o_stream.p[1].actual.port[name].oe + m.d.comb += buffer_parts.oe.eq(oe_any) def delay(value, name): delayed_values = [] @@ -173,149 +180,644 @@ def delay(value, name): delayed_values.append(next_value) return delayed_values - i_en_delays = delay(self.o_stream.valid & self.o_stream.ready & - self.o_stream.p.i_en, name="i_en") - i_en = i_en_delays[-1] - meta = delay(self.o_stream.p.meta, name="meta")[-1] + i_en = delay(Const(1), name="i_en")[-1] # We always output samples, except for `latency` cycles after reset + for lane_index in range(self._ratio): + for name, i_payload_parts, buffer_parts in _iter_ioshape("i", self._ioshape, self.i_stream.p[lane_index].actual.port, buffer): + if self._ratio > 1: + m.d.comb += i_payload_parts.i.eq(buffer_parts.i[lane_index]) + else: + m.d.comb += i_payload_parts.i.eq(buffer_parts.i) + m.d.comb += self.i_stream.p[lane_index].actual.i_valid.eq(1) + m.d.comb += self.i_stream.valid.eq(i_en) + + return m + + +class StreamStretcher(wiring.Component): + """ + This component makes sure that any stream is not allowed to transfer more often + than every `divisor` cycles. If `divisor` is 0 or 1, then the StreamStretcher has + no effect. 
+ """ + def __init__(self, stream_signature, *, divisor_width=16): + super().__init__({ + "i_stream": In(stream_signature), + "o_stream": Out(stream_signature), + "divisor": In(divisor_width), + }) + + def elaborate(self, platform): + m = Module() + timer = Signal.like(self.divisor) + timer_done = Signal() + m.d.comb += timer_done.eq((timer == 0) | (timer == 1)) + + m.d.comb += self.o_stream.p.eq(self.i_stream.p) + m.d.comb += self.o_stream.valid.eq(self.i_stream.valid & timer_done) + m.d.comb += self.i_stream.ready.eq(self.o_stream.ready & timer_done) + + with m.If(timer_done): + with m.If(self.o_stream.ready & self.o_stream.valid): + m.d.sync += timer.eq(self.divisor) + with m.Else(): + m.d.sync += timer.eq(timer - 1) + + return m + + +class IOLatcher(wiring.Component): + """ + This component has an always valid, always ready output stream, + which passes through the "o" and "oe" fields when a transaction + is presented at the input stream, otherwise it keeps repeating the + last transaction, which it memorises. + Other fields such as i_en, and meta are dropped. 
+ """ + def __init__(self, ioshape, /, *, ratio=1, init=None, meta_layout=0): + assert isinstance(ioshape, (int, dict)) + assert ratio in (1, 2) + + self._ioshape = ioshape + self._ratio = ratio + self._init = init + + super().__init__({ + "i_stream": In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), + "o_stream": Out(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), + }) + + self.o_stream.valid = Const(1) + self.o_stream.ready = Const(1) + self.i_stream.ready = Const(1) + + def elaborate(self, platform): + m = Module() + + o_latch = Signal(_map_ioshape("o", self._ioshape, lambda width: data.StructLayout({ + "o": width, + "oe": 1, + })), init=self._init) + with m.If(self.i_stream.valid & self.i_stream.ready): + for lane_index in range(self._ratio): + m.d.comb += self.o_stream.p[lane_index].actual.port.eq(self.i_stream.p[lane_index].actual.port) + + for _, latch_parts, stream_parts in _iter_ioshape("o", self._ioshape, + o_latch, self.i_stream.p[-1].actual.port): + m.d.sync += latch_parts.eq(stream_parts) + + with m.Else(): + for lane_index in range(self._ratio): + for _, simple_stream_parts, latch_parts in _iter_ioshape("o", self._ioshape, + self.o_stream.p[lane_index].actual.port, o_latch): + m.d.comb += simple_stream_parts.eq(latch_parts) + + return m + +class SkidBuffer(wiring.Component): + """ + This component is a generic skid buffer. + It is essentially a `depth` deep FIFO with a stream interface. + """ + def __init__(self, depth, stream_signature): + self._depth = depth + super().__init__({ + "i_stream": In(stream_signature), + "o_stream": Out(stream_signature), + }) + + def elaborate(self, platform): + m = Module() # This skid buffer is organized as a shift register to avoid any uncertainties associated # with the use of an async read memory. On platforms that have LUTRAM, this implementation # may be slightly worse than using LUTRAM, and may have to be revisited in the future. 
- skid = Array(Signal(self.i_stream.payload.shape(), name=f"skid_{stage}") - for stage in range(1 + latency)) - for name, skid_parts, buffer_parts in _iter_ioshape("i", self._ioshape, skid[0].port, buffer): - if self._sample_delay_half_clocks % 2: - m.d.comb += skid_parts.i[1].eq(buffer_parts.i[0]) - i1_delayed = Signal.like(buffer_parts.i[1], name=f"{name}_i1_delayed") - m.d.sync += i1_delayed.eq(buffer_parts.i[1]) - m.d.comb += skid_parts.i[0].eq(i1_delayed) - else: - m.d.comb += skid_parts.i.eq(buffer_parts.i) - m.d.comb += skid[0].meta.eq(meta) - - skid_at = Signal(range(1 + latency)) - - with m.If(i_en): - for n_shift in range(latency): + skid = Array(Signal(self.i_stream.p.shape(), name=f"skid_{stage}") + for stage in range(1 + self._depth)) + + skid_at = Signal(range(1 + self._depth)) + + m.d.comb += skid[0].eq(self.i_stream.p) + + with m.If(self.i_stream.valid): + for n_shift in range(self._depth): m.d.sync += skid[n_shift + 1].eq(skid[n_shift]) - with m.If(i_en & ~self.i_stream.ready): - # m.d.sync += Assert(skid_at != latency) + not_full = Signal() + m.d.comb += not_full.eq(skid_at != self._depth) + + m.d.comb += self.o_stream.p.eq(skid[skid_at]) + m.d.comb += self.o_stream.valid.eq(self.i_stream.valid | (skid_at != 0)) + m.d.comb += self.i_stream.ready.eq(self.o_stream.ready | not_full) + + with m.If(self.i_stream.valid & self.i_stream.ready & ~self.o_stream.ready): m.d.sync += skid_at.eq(skid_at + 1) - with m.Elif((skid_at != 0) & ~i_en & self.i_stream.ready): + with m.Elif(~self.i_stream.valid & self.o_stream.valid & self.o_stream.ready): m.d.sync += skid_at.eq(skid_at - 1) - m.d.comb += self.i_stream.payload.eq(skid[skid_at]) - m.d.comb += self.i_stream.valid.eq(i_en | (skid_at != 0)) - m.d.comb += self.o_stream.ready.eq(self.i_stream.ready | ~((skid_at!=0) | Cat(*i_en_delays).any())) + return m + +class SampleRequestDelayer(wiring.Component): + def __init__(self, /, *, ratio, meta_layout, min_latency, max_sample_delay_half_clocks, min_divisor): + 
self._ratio = ratio + self._min_latency = min_latency + self._max_sample_delay_half_clocks = max_sample_delay_half_clocks + self._min_divisor = min_divisor + self._max_latency_except_hcyc = min_latency + self._max_sample_delay_half_clocks // 2 + + super().__init__({ + "i_en": In(data.ArrayLayout(1, ratio)), + "meta": In(data.ArrayLayout(meta_layout, ratio)), + "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1)), + "i_en_delayed": Out(data.ArrayLayout(1, ratio)), + "meta_delayed": Out(data.ArrayLayout(meta_layout, ratio)), + "reads_in_flight": Out(1), + }) + + def elaborate(self, platform): + m = Module() + + def delay(value, name, cycles): + delayed_values = Array(Signal(value.shape(), name=f"delayed_{name}_{stage}") + for stage in range(cycles)) + m.d.sync += delayed_values[0].eq(value) + for stage in range(1, cycles): + m.d.sync += delayed_values[stage].eq(delayed_values[stage-1]) + return delayed_values + + i_en_delayed_except_half_cyc = Signal.like(self.i_en_delayed) + meta_delayed_except_half_cyc = Signal.like(self.meta_delayed) + reads_in_flight_except_half_cyc = Signal.like(self.reads_in_flight) + + # Following are two implementations: the second one is really simple, using only shift registers, + # while the first one relies on a `min_divisor` setting to use a counter for the first part of the + # delay mechanism. 
+ + # Some statistics using the memory-25x applet: + # divisor=24 sample_delay=0 => simple: 810 ICESTORM_LCs, optimized: 811 ICESTORM_LCs + # divisor=24 sample_delay=1 => simple: 830 ICESTORM_LCs, optimized: 832 ICESTORM_LCs + # divisor=24 sample_delay=2 => simple: 823 ICESTORM_LCs, optimized: 825 ICESTORM_LCs + # divisor=24 sample_delay=3 => simple: 832 ICESTORM_LCs, optimized: 824 ICESTORM_LCs + # divisor=24 sample_delay=6 => simple: 836 ICESTORM_LCs, optimized: 823 ICESTORM_LCs + # divisor=24 sample_delay=12 => simple: 849 ICESTORM_LCs, optimized: 825 ICESTORM_LCs + # divisor=24 sample_delay=24 => simple: 888 ICESTORM_LCs, optimized: 830 ICESTORM_LCs + # divisor=24 sample_delay=36 => simple: 928 ICESTORM_LCs, optimized: 833 ICESTORM_LCs + # divisor=24 sample_delay=47 => simple: 1001 ICESTORM_LCs, optimized: 877 ICESTORM_LCs + # divisor=3 sample_delay=0 => simple: 813 ICESTORM_LCs, optimized: 820 ICESTORM_LCs + # divisor=3 sample_delay=3 => simple: 859 ICESTORM_LCs, optimized: 860 ICESTORM_LCs + # divisor=3 sample_delay=6 => simple: 872 ICESTORM_LCs, optimized: 858 ICESTORM_LCs + # divisor=3 sample_delay=12 => simple: 894 ICESTORM_LCs, optimized: 901 ICESTORM_LCs + # divisor=3 sample_delay=24 => simple: 980 ICESTORM_LCs, optimized: 970 ICESTORM_LCs + # divisor=4 sample_delay=8 => simple: 874 ICESTORM_LCs, optimized: 866 ICESTORM_LCs + # divisor=4 sample_delay=16 => simple: 903 ICESTORM_LCs, optimized: 893 ICESTORM_LCs + # divisor=4 sample_delay=32 => simple: 999 ICESTORM_LCs, optimized: 988 ICESTORM_LCs + # divisor=5 sample_delay=10 => simple: 868 ICESTORM_LCs, optimized: 858 ICESTORM_LCs + # divisor=8 sample_delay=6 => simple: 836 ICESTORM_LCs, optimized: 830 ICESTORM_LCs + # divisor=8 sample_delay=16 => simple: 886 ICESTORM_LCs, optimized: 866 ICESTORM_LCs + # divisor=240 sample_delay=100 => simple: 1114 ICESTORM_LCs, optimized: 826 ICESTORM_LCs + # divisor=240 sample_delay=238 => simple: 1528 ICESTORM_LCs, optimized: 830 ICESTORM_LCs + + #if self._min_divisor 
>= 1: # The optimized implementation works correctly as long as _min_divisor >= 1 + if self._min_divisor >= 4: # however it may not make sense to use it when min_divisor is a low number + # Optimized implementation using a counter as a first-stage delay mechanism + assert self._min_divisor >= 1, "with a divisor of less than 1, the counter logic wouldn't work" + assert self._min_latency >= 1, "with a min latency less then 1, and sample delay of zero, the counter logic wouldn't work" + counting = Signal() + counter = Signal(range(min(self._min_divisor, self._max_latency_except_hcyc))) + i_en_cached = Signal.like(self.i_en) + meta_cached = Signal.like(self.meta) + i_en_delay_chain_input = Signal.like(self.i_en) + + latency_minus_1 = self._min_latency - 1 + self.sample_delay_half_clocks // 2 + + with m.If(counting): + with m.If((counter == self._min_divisor - 1) | + (counter == latency_minus_1)): + m.d.sync += counting.eq(0) + m.d.comb += i_en_delay_chain_input.eq(i_en_cached) + with m.Else(): + m.d.sync += counter.eq(counter + 1) + + with m.If(Signal.cast(self.i_en).any()): + m.d.sync += ( + counting.eq(1), + i_en_cached.eq(self.i_en), + meta_cached.eq(self.meta), + counter.eq(0), + ) + + m.d.comb += ( + i_en_delayed_except_half_cyc.eq(i_en_delay_chain_input), + meta_delayed_except_half_cyc.eq(meta_cached), + reads_in_flight_except_half_cyc.eq(counting), + ) + + if self._max_latency_except_hcyc > self._min_divisor: + delay_chain_cycles = self._max_latency_except_hcyc - self._min_divisor + i_en_delays = delay(i_en_delay_chain_input, name=f"i_en", cycles=delay_chain_cycles) + meta_delays = delay(meta_cached, name=f"meta", cycles=delay_chain_cycles) + + delay_selector = latency_minus_1 - self._min_divisor + + i_en_in_flight_up_to = Array(Signal(1, name=f"i_en_in_flight_{stage}") for stage in range(delay_chain_cycles)) + m.d.comb += i_en_in_flight_up_to[0].eq(Signal.cast(i_en_delays[0]).any()) + for stage in range(1, delay_chain_cycles): + value = 
Signal.cast(i_en_delays[stage]).any() | i_en_in_flight_up_to[stage - 1] + m.d.comb += i_en_in_flight_up_to[stage].eq(value) + + with m.If(latency_minus_1 >= self._min_divisor): + m.d.comb += i_en_delayed_except_half_cyc.eq(i_en_delays[delay_selector]) + m.d.comb += meta_delayed_except_half_cyc.eq(meta_delays[delay_selector]) + m.d.comb += reads_in_flight_except_half_cyc.eq(counting | i_en_in_flight_up_to[delay_selector]) + + else: # Simple shift-register-only based implementation + meta, i_en_delays, i_en = [], [], [] + delay_selector = self._min_latency - 1 + self.sample_delay_half_clocks // 2 + + i_en_delays = delay(self.i_en, name=f"i_en", cycles=self._max_latency_except_hcyc) + meta_delays = delay(self.meta, name=f"meta", cycles=self._max_latency_except_hcyc) + + m.d.comb += i_en_delayed_except_half_cyc.eq(i_en_delays[delay_selector]) + m.d.comb += meta_delayed_except_half_cyc.eq(meta_delays[delay_selector]) + + i_en_in_flight_up_to = Array(Signal(1, name=f"i_en_in_flight_{stage}") for stage in range(self._max_latency_except_hcyc)) + m.d.comb += i_en_in_flight_up_to[0].eq(Signal.cast(i_en_delays[0]).any()) + for stage in range(1, self._max_latency_except_hcyc): + value = Signal.cast(i_en_delays[stage]).any() | i_en_in_flight_up_to[stage - 1] + m.d.comb += i_en_in_flight_up_to[stage].eq(value) + + m.d.comb += reads_in_flight_except_half_cyc.eq(i_en_in_flight_up_to[delay_selector]) + + # Here follows code common to both implementations, that handles half a cycle delays. + # Half-cycle delays are handled as an additional delay step. (The sample payload will + # be combined from two different clock cycles.) 
We're using an additional shift + # register stage to avoid having to calculate a dynamic delay of + # (sample_delay // 2 + sample_delay % 2) + m.d.comb += self.i_en_delayed.eq(i_en_delayed_except_half_cyc) + m.d.comb += self.meta_delayed.eq(meta_delayed_except_half_cyc) + m.d.comb += self.reads_in_flight.eq(reads_in_flight_except_half_cyc) + + if self._ratio == 2: + i_en_hcyc = delay(i_en_delayed_except_half_cyc, name=f"i_en_hcyc", cycles=1)[0] + meta_hcyc = delay(meta_delayed_except_half_cyc, name=f"meta_hcyc", cycles=1)[0] + with m.If(self.sample_delay_half_clocks % 2): + m.d.comb += self.i_en_delayed.eq(i_en_hcyc) + m.d.comb += self.meta_delayed.eq(meta_hcyc) + m.d.comb += self.reads_in_flight.eq(reads_in_flight_except_half_cyc | Signal.cast(i_en_hcyc).any()) + + return m + +class IOStreamerTop(wiring.Component): + """I/O buffer to stream adapter. + + This adapter instantiates I/O buffers for a port (FF or DDR) and connects them to a pair of + streams, one for the outputs of the buffers and one for the inputs. Whenever an `o_stream` + transfer occurs, the state of the output is updated _t1_ cycles later; if `o_stream.p.i_en` + is set, then _t2_ cycles later, a payload with the data captured at the same time as + the outputs were updated appears on `i_stream.p.i`. + + Arbitrary ancillary data may be provided with `o_stream` transfers via `o_stream.p.meta`, + and this data will be relayed back as `i_stream.p.meta` with the output-to-input latency + of the buffer. Higher-level protocol engines can use this data to indicate how the inputs + must be processed without needing counters or state machines on a higher level to match + the latency (and, usually, without needing any knowledge of the latency at all). + + On reset, output ports have their drivers enabled, and bidirectional ports have them disabled. + All of the signals are deasserted, which could be a low or a high level depending on the port + polarity. 
+ """ + + def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0, divisor_width=16, max_sample_delay_half_clocks=0, min_divisor=0): + assert isinstance(ioshape, (int, dict)) + assert ratio in (1, 2) + + self._ioshape = ioshape + self._ports = ports + self._ratio = ratio + self._init = init + self._divisor_width = divisor_width + self._max_sample_delay_half_clocks = max_sample_delay_half_clocks + self._meta_layout = meta_layout + self._min_divisor = min_divisor + + super().__init__({ + "o_stream": In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), + "i_stream": Out(IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)), + "divisor": In(divisor_width), + "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1)), + }) + + def elaborate(self, platform): + m = Module() + + #if self._min_divisor: + # m.d.sync += Assert(self.divisor >= self._min_divisor) + + #if self._ratio == 1: + # m.d.sync += Assert(self.sample_delay_half_clocks % 2 == 0) + + m.submodules.stream_stretcher = stream_stretcher = StreamStretcher( + IOOutputStreamSignature(self._ioshape, lane_count=self._ratio, meta_layout=self._meta_layout), + divisor_width = self._divisor_width) + m.d.comb += stream_stretcher.divisor.eq(self.divisor) + wiring.connect(m, io_streamer=wiring.flipped(self.o_stream), stream_strecher=stream_stretcher.i_stream) + + m.submodules.io_streamer = io_streamer = IOStreamer(self._ioshape, self._ports, ratio=self._ratio, meta_layout=0) + m.submodules.io_latcher = io_latcher = IOLatcher(self._ioshape, ratio=self._ratio, init=self._init, meta_layout=0) + wiring.connect(m, io_latcher=io_latcher.o_stream, io_streamer=io_streamer.o_stream) + for lane_index in range(self._ratio): + m.d.comb += io_latcher.i_stream.p[lane_index].actual.port.eq(stream_stretcher.o_stream.p[lane_index].actual.port) + m.d.comb += io_latcher.i_stream.valid.eq(stream_stretcher.o_stream.valid & stream_stretcher.o_stream.ready) + # ^ 
note: the above makes sure IOLatcher doesn't take a new transaction if we're blocking the input + + min_latency = io_streamer.get_latency(platform) + max_latency = min_latency + self._max_sample_delay_half_clocks // 2 + self._max_sample_delay_half_clocks % 2 + + m.submodules.sample_request_delayer = sample_request_delayer = SampleRequestDelayer(ratio=self._ratio, + meta_layout=self._meta_layout, + min_latency=min_latency, + max_sample_delay_half_clocks=self._max_sample_delay_half_clocks, + min_divisor=self._min_divisor) + m.d.comb += sample_request_delayer.sample_delay_half_clocks.eq(self.sample_delay_half_clocks) + for lane_index in range(self._ratio): + m.d.comb += sample_request_delayer.i_en[lane_index].eq(stream_stretcher.o_stream.valid & + stream_stretcher.o_stream.ready & + stream_stretcher.o_stream.p[lane_index].actual.i_en) + m.d.comb += sample_request_delayer.meta[lane_index].eq(stream_stretcher.o_stream.p[lane_index].meta) + + skid_buffer_depth = max_latency + if self._min_divisor > 1: + # This is an optimisation we can apply if we know at elaboration time that divisor can never be smaller than min_divisor + skid_buffer_depth = (max_latency + self._min_divisor - 1) // self._min_divisor + + m.submodules.skid_buffer = skid_buffer = SkidBuffer( + skid_buffer_depth, + IOInputStreamSignature(self._ioshape, lane_count=self._ratio, meta_layout=self._meta_layout), + ) + m.d.comb += skid_buffer.i_stream.valid.eq(Signal.cast(sample_request_delayer.i_en_delayed).any()) + #with m.If(skid_buffer.i_stream.valid): + # m.d.sync += Assert(skid_buffer.i_stream.ready) + + for lane_index in range(self._ratio): + m.d.comb += skid_buffer.i_stream.p[lane_index].actual.port.eq(io_streamer.i_stream.p[lane_index].actual.port) + + if self._ratio == 2: + with m.If(self.sample_delay_half_clocks % 2): + m.d.comb += skid_buffer.i_stream.p[1].actual.port.eq(io_streamer.i_stream.p[0].actual.port) + i1_delayed = Signal.like(io_streamer.i_stream.p[1].actual.port, name=f"i1_delayed") + 
m.d.sync += i1_delayed.eq(io_streamer.i_stream.p[1].actual.port) + m.d.comb += skid_buffer.i_stream.p[0].actual.port.eq(i1_delayed) + + for lane_index in range(self._ratio): + m.d.comb += skid_buffer.i_stream.p[lane_index].meta.eq(sample_request_delayer.meta_delayed[lane_index]) + m.d.comb += skid_buffer.i_stream.p[lane_index].actual.i_valid.eq(sample_request_delayer.i_en_delayed[lane_index]) + + wiring.connect(m, skid_buffer=skid_buffer.o_stream, io_streamer_top=wiring.flipped(self.i_stream)) + + m.d.comb += stream_stretcher.o_stream.ready.eq(self.i_stream.ready | (~skid_buffer.o_stream.valid & ~sample_request_delayer.reads_in_flight)) + + return m + + +class IO2LaneTo1Lane(wiring.Component): + """ + This component down-converts a 2-lane stream to a 1-lane stream, while adding + information to the metadata, which includes: + tag: the index of the lane the original data belonged to + last: a flag signifying if a second beat is expected in the case of later up-conversion + The `last` field is optionally determined using the supplied is_beat_0_last argument + to the constructor, which must be a function that returns an amaranth expression + """ + @staticmethod + def i_stream_signature(actual_layout, /, *, meta_layout=0): + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=meta_layout), + 2 + ) + ) + + @staticmethod + def o_stream_signature(actual_layout, /, *, meta_layout=0): + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)), + 1 + ) + ) + + def __init__(self, actual_layout, *, meta_layout=0, is_beat_0_last=lambda payload: 0): + self._is_beat_0_last = is_beat_0_last + super().__init__({ + "i_stream": In(self.i_stream_signature(actual_layout, meta_layout=meta_layout)), + "o_stream": Out(self.o_stream_signature(actual_layout, meta_layout=meta_layout)), + }) + + def elaborate(self, platform): + m = Module() + + phase = Signal() + 
m.d.comb += self.o_stream.p[0].actual.eq(self.i_stream.p[phase].actual) + m.d.comb += self.o_stream.p[0].meta.inner_meta.eq(self.i_stream.p[phase].meta) + m.d.comb += self.o_stream.p[0].meta.tag.eq(phase) + m.d.comb += self.o_stream.p[0].meta.last.eq(1) + with m.If((phase == 0) & ~self._is_beat_0_last(self.i_stream.p)): + m.d.comb += self.o_stream.p[0].meta.last.eq(0) + + m.d.comb += self.o_stream.valid.eq(self.i_stream.valid) + with m.If(self.o_stream.ready): + with m.If(phase == 0): + with m.If(self.i_stream.valid): + m.d.sync += phase.eq(1) + + with m.Else(): # phase == 1 + m.d.comb += self.i_stream.ready.eq(1) + m.d.sync += phase.eq(0) return m class IOClocker(wiring.Component): + """ + In case of ratio=1: + This component down-converts (serializes) 2 lanes to 1 lane, while adding metadata to identify which lane each beat belonged to. + In case of ratio=2, divisor=0: + This component adds useless metadata, but is otherwise a pass-through. Adding the metadata is necessary, because divisor is not a compile-time parameter + In case of ratio=2, divisor!=0: + This component down-converts (serializes) 2 lanes to 1 lane, just like in the ratio=1 case above, except, it duplicates the resulting lane, while also making + sure to force `i_en` of the second resulting lane to 0. This means that one `i_en` bit high doesn't result in two samples. + ratio=2, divisor!=0 is designed to behave exactly like ratio=1, divisor!=0, so the read samples associated with output lane 0 are the only ones we care about. + """ @staticmethod - def i_stream_signature(ioshape, /, *, _ratio=1, meta_layout=0): - # Currently the only supported ratio is 1, but this will change in the future for - # interfaces like HyperBus. 
- return stream.Signature(data.StructLayout({ - "bypass": 1, - "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({ - "o": width if _ratio == 1 else data.ArrayLayout(width, _ratio), - "oe": 1, - })), - "i_en": 1, - "meta": meta_layout, - })) + def i_stream_signature(ioshape, /, *, _ratio=2, meta_layout=0): + # Currently the only supported ratio is 2 + return IOOutputStreamSignature(ioshape, lane_count=_ratio, meta_layout=meta_layout) @staticmethod def o_stream_signature(ioshape, /, *, ratio=1, meta_layout=0): - return IOStreamer.o_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout) + return IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)) - def __init__(self, ioshape, *, clock, o_ratio=1, meta_layout=0, divisor_width=16): + def __init__(self, ioshape, *, o_ratio=1, meta_layout=0, divisor_width=16): assert isinstance(ioshape, dict) - assert isinstance(clock, str) assert o_ratio in (1, 2) - assert clock in ioshape - self._clock = clock self._ioshape = ioshape self._o_ratio = o_ratio + self._meta_layout = meta_layout super().__init__({ "i_stream": In(self.i_stream_signature(ioshape, meta_layout=meta_layout)), "o_stream": Out(self.o_stream_signature(ioshape, ratio=o_ratio, meta_layout=meta_layout)), - - # f_clk = f_sync if (o_ratio == 2 and divisor == 0) else f_sync / (2 * max(1, divisor)) "divisor": In(divisor_width), }) def elaborate(self, platform): m = Module() - # Forward the inputs to the outputs as-is. This includes the clock; it is overridden below - # if the clocker is used (not bypassed). 
- for _, i_parts, o_parts in _iter_ioshape("io", self._ioshape, - self.i_stream.p.port, self.o_stream.p.port): - m.d.comb += o_parts.o .eq(i_parts.o.replicate(self._o_ratio)) - m.d.comb += o_parts.oe.eq(i_parts.oe) - m.d.comb += self.o_stream.p.i_en.eq(self.i_stream.p.i_en) - m.d.comb += self.o_stream.p.meta.eq(self.i_stream.p.meta) + m.submodules.io_2to1_lane = io_2to1_lane = IO2LaneTo1Lane(IOOutputActualLayout(self._ioshape), meta_layout=self._meta_layout, + is_beat_0_last = lambda payload: payload[1].actual.i_en==0) phase = Signal() - # If the clocker is used... - with m.If(~self.i_stream.p.bypass): - # ... ignore the clock in the inputs and replace it with the generated one... - if self._o_ratio == 1: - m.d.comb += self.o_stream.p.port[self._clock].o.eq(phase) - if self._o_ratio == 2: - with m.If(self.divisor == 0): - m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(~phase, phase)) - with m.Else(): - m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(phase, phase)) - m.d.comb += self.o_stream.p.port[self._clock].oe.eq(1) - # ... while requesting input sampling only for the rising edge. (Interfaces triggering - # transfers on falling edge will be inverting the clock at the `IOPort` level.) 
- m.d.comb += self.o_stream.p.i_en.eq(self.i_stream.p.i_en & phase) + if self._o_ratio == 1: + wiring.connect(m, ioclocker=wiring.flipped(self.i_stream), io_2to1_lane=io_2to1_lane.i_stream) + wiring.connect(m, ioclocker=wiring.flipped(self.o_stream), io_2to1_lane=io_2to1_lane.o_stream) + if self._o_ratio == 2: + with m.If(self.divisor == 0): + # Just pass-through, we're doing nothing, but adding currently-useless tag metadata + for lane_index in range(self._o_ratio): + m.d.comb += self.o_stream.p[lane_index].actual.eq(self.i_stream.p[lane_index].actual) + m.d.comb += self.o_stream.p[lane_index].meta.inner_meta.eq(self.i_stream.p[lane_index].meta) + m.d.comb += self.o_stream.p[lane_index].meta.tag.eq(lane_index) + m.d.comb += self.o_stream.p[lane_index].meta.last.eq(~self.i_stream.p[1].actual.i_en) + m.d.comb += self.o_stream.valid.eq(self.i_stream.valid) + m.d.comb += self.i_stream.ready.eq(self.o_stream.ready) + with m.Else(): + wiring.connect(m, ioclocker=wiring.flipped(self.i_stream), io_2to1_lane=io_2to1_lane.i_stream) + for olane_index in range(2): + m.d.comb += self.o_stream.p[olane_index].eq(io_2to1_lane.o_stream.p[0]) + m.d.comb += self.o_stream.p[1].actual.i_en.eq(0) # Override i_en, to only sample once + m.d.comb += self.o_stream.valid.eq(io_2to1_lane.o_stream.valid) + m.d.comb += io_2to1_lane.o_stream.ready.eq(self.o_stream.ready) - timer = Signal.like(self.divisor) - with m.If((timer == 0) | (timer == 1)): - # Only produce output when the timer has expired. This ensures that no clock pulse - # exceeds the frequency set by `divisor`, except the ones that bypass the clocker. 
- m.d.comb += self.o_stream.valid.eq(self.i_stream.valid) + return m + + +class IO1LaneTo2Lane(wiring.Component): + """ + This component up-converts a 1-lane stream to a 2-lane stream, using + information from the metadata, to determine which lane to put each beat in: + tag: the index of the lane the data should be put on + last: a flag signifying if a second beat is expected + An output stream transaction occurs only when the last bit is high for an + input-stream transaction. + """ + @staticmethod + def o_stream_signature(actual_layout, /, *, meta_layout=0): + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=meta_layout), + 2 + ) + ) + + @staticmethod + def i_stream_signature(actual_layout, /, *, meta_layout=0): + return stream.Signature( + data.ArrayLayout( + LaneLayout(actual_layout, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)), + 1 + ) + ) + + def __init__(self, actual_layout, *, meta_layout=0): + super().__init__({ + "i_stream": In(self.i_stream_signature(actual_layout, meta_layout=meta_layout)), + "o_stream": Out(self.o_stream_signature(actual_layout, meta_layout=meta_layout)), + }) - with m.FSM(): - with m.State("Falling"): - with m.If(self.i_stream.p.bypass): # Bypass the clocker entirely. - m.d.comb += self.i_stream.ready.eq(self.o_stream.ready) - - with m.Else(): # Produce a falling edge at the output. - # Whenever DDR output is used, with `divisor == 0`, we output a low state - # on the first half of the clock cycle, and a high state on the second half. - # This mode allows clocking the peripheral at the `sync` frequency. 
- # In this case the signal sampled at the rising edge will be output on i[1] - # (if sample_delay was set to zero) - # In all other cases the signal sampled at the rising edge will be output on i[0] - # (if sample_delay was set to zero) - with m.If((self._o_ratio == 2) & (self.divisor == 0)): - m.d.comb += phase.eq(1) - with m.If(self.o_stream.ready): - m.d.comb += self.i_stream.ready.eq(1) - with m.Else(): - m.d.comb += phase.eq(0) - with m.If(self.o_stream.ready & self.i_stream.valid): - m.d.sync += timer.eq(self.divisor) - m.next = "Rising" - - with m.State("Rising"): - m.d.comb += phase.eq(1) - with m.If(self.o_stream.ready): - m.d.comb += self.i_stream.ready.eq(1) - m.d.sync += timer.eq(self.divisor) - m.next = "Falling" + def elaborate(self, platform): + m = Module() + + untagged_istream_lane = Signal.like(self.o_stream.p[0]) + m.d.comb += untagged_istream_lane.actual.eq(self.i_stream.p[0].actual) + m.d.comb += untagged_istream_lane.meta.eq(self.i_stream.p[0].meta.inner_meta) + + phase_0_stored = Signal.like(untagged_istream_lane) + + m.d.comb += self.i_stream.ready.eq(self.o_stream.ready) + with m.If(self.i_stream.valid): + with m.If(self.i_stream.p[0].meta.last): + m.d.comb += self.o_stream.p[self.i_stream.p[0].meta.tag].eq(untagged_istream_lane) + with m.If(self.i_stream.p[0].meta.tag != 0): + m.d.comb += self.o_stream.p[0].eq(phase_0_stored) + m.d.comb += self.o_stream.valid.eq(1) + with m.If(self.i_stream.ready): + m.d.sync += phase_0_stored.eq(0) + with m.Else(): + with m.If(self.i_stream.ready): + m.d.sync += phase_0_stored.eq(untagged_istream_lane) + return m + + +class IOClockerDeframer(wiring.Component): + """ + In case of ratio=1: + This component up-converts (deserializes) 1-lane samples to 2-lane. 
+ In case of ratio=2, divisor=0: + This component is a simple pass-through, doing nothing + In case of ratio=2, divisor!=0: + This component throws away lane[1] of the input, and up-converts (deserializes) lane[0] to 2 lanes + See IO1LaneTo2Lane subcomponent for more details + """ + @staticmethod + def o_stream_signature(ioshape, /, *, _ratio=1, meta_layout=0): + # Currently the only supported ratio is 1, but this will change in the future for + # interfaces like HyperBus. + return IOInputStreamSignature(ioshape, lane_count=2, meta_layout=meta_layout) + + @staticmethod + def i_stream_signature(ioshape, /, *, ratio=1, meta_layout=0): + return IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)) + + def __init__(self, ioshape, *, i_ratio=1, meta_layout=0, divisor_width=16): + assert isinstance(ioshape, dict) + assert i_ratio in (1, 2) + + self._ioshape = ioshape + self._i_ratio = i_ratio + self._meta_layout = meta_layout + + super().__init__({ + "i_stream": In(self.i_stream_signature(ioshape, + ratio=i_ratio, meta_layout=meta_layout)), + "o_stream": Out(self.o_stream_signature(ioshape, + meta_layout=meta_layout)), + "divisor": In(divisor_width), + }) + + def elaborate(self, platform): + m = Module() + + m.submodules.io_1to2_lane = io_1to2_lane = IO1LaneTo2Lane(IOInputActualLayout(self._ioshape), meta_layout=self._meta_layout) + + with m.If((self.divisor == 0) & (self._i_ratio == 2)): + # Just pass-through everything + for lane_index in range(self._i_ratio): + m.d.comb += self.o_stream.p[lane_index].actual.eq(self.i_stream.p[lane_index].actual) + m.d.comb += self.o_stream.p[lane_index].meta.eq(self.i_stream.p[lane_index].meta.inner_meta) + m.d.comb += self.o_stream.valid.eq(self.i_stream.valid) + m.d.comb += self.i_stream.ready.eq(self.o_stream.ready) with m.Else(): - m.d.sync += timer.eq(timer - 1) + m.d.comb += io_1to2_lane.i_stream.valid.eq(self.i_stream.valid) + m.d.comb += 
self.i_stream.ready.eq(io_1to2_lane.i_stream.ready) + m.d.comb += io_1to2_lane.i_stream.p[0].eq(self.i_stream.p[0]) + # ^ `wiring.connect` won't work here in case of i_ratio=2, we're explicitly + # throwing away the second lane here, cause we know IOClocker always sends + # sample requests on lane 0, (when divisor != 0). In case of i_ratio=1, + # this is equivalent to `wiring.connect` + + wiring.connect(m, io_1to2_lane=io_1to2_lane.o_stream, io_clocker_deframer=wiring.flipped(self.o_stream)) return m diff --git a/software/glasgow/gateware/qspi.py b/software/glasgow/gateware/qspi.py index 6a3bb8241..f470f9c60 100644 --- a/software/glasgow/gateware/qspi.py +++ b/software/glasgow/gateware/qspi.py @@ -3,7 +3,7 @@ from amaranth.lib.wiring import In, Out, connect, flipped from .ports import PortGroup -from .iostream import IOStreamer, IOClocker +from .iostream import IOStreamerTop, IOClocker, IOClockerDeframer, MetaLayoutWithTag __all__ = ["QSPIMode", "QSPIEnframer", "QSPIDeframer", "QSPIController"] @@ -57,43 +57,54 @@ def elaborate(self, platform): m.d.comb += self.octets.ready.eq(cycle == 0) m.d.sync += cycle.eq(Mux(self.octets.ready, 0, cycle + 1)) - # When no chip is selected, keep clock in the idle state. The only supported `mode` - # in this case is `QSPIMode.Dummy`, which should be used to deassert CS# at the end of - # a transfer. - m.d.comb += self.frames.p.bypass.eq(self.octets.p.chip == 0) - m.d.comb += self.frames.p.port.sck.o.eq(1) # (for bypass only) - m.d.comb += self.frames.p.port.sck.oe.eq(1) # (for bypass only) + for lane_index in range(2): + m.d.comb += self.frames.p[lane_index].actual.port.sck.oe.eq(1) + + with m.If(self.octets.p.chip == 0): + # When no chip is selected, keep clock in the idle state. The only supported `mode` + # in this case is `QSPIMode.Dummy`, which should be used to deassert CS# at the end of + # a transfer. 
+ m.d.comb += self.frames.p[0].actual.port.sck.o.eq(1) + m.d.comb += self.frames.p[1].actual.port.sck.o.eq(1) + with m.Else(): + m.d.comb += self.frames.p[0].actual.port.sck.o.eq(0) + m.d.comb += self.frames.p[1].actual.port.sck.o.eq(1) rev_data = self.octets.p.data[::-1] # MSB first with m.Switch(self.octets.p.mode): with m.Case(QSPIMode.PutX1, QSPIMode.Swap): - m.d.comb += self.frames.p.port.io0.o.eq(rev_data.word_select(cycle, 1)) - m.d.comb += self.frames.p.port.io0.oe.eq(0b1) - m.d.comb += self.frames.p.i_en.eq(self.octets.p.mode == QSPIMode.Swap) + for lane_index in range(2): + m.d.comb += self.frames.p[lane_index].actual.port.io0.o.eq(rev_data.word_select(cycle, 1)) + m.d.comb += self.frames.p[lane_index].actual.port.io0.oe.eq(0b1) + m.d.comb += self.frames.p[1].actual.i_en.eq(self.octets.p.mode == QSPIMode.Swap) with m.Case(QSPIMode.GetX1): - m.d.comb += self.frames.p.port.io0.oe.eq(0b1) - m.d.comb += self.frames.p.i_en.eq(1) + for lane_index in range(2): + m.d.comb += self.frames.p[lane_index].actual.port.io0.oe.eq(0b1) + m.d.comb += self.frames.p[1].actual.i_en.eq(1) with m.Case(QSPIMode.PutX2): - m.d.comb += Cat(self.frames.p.port.io1.o, - self.frames.p.port.io0.o).eq(rev_data.word_select(cycle, 2)) - m.d.comb += Cat(self.frames.p.port.io1.oe, - self.frames.p.port.io0.oe).eq(0b11) + for lane_index in range(2): + m.d.comb += Cat(self.frames.p[lane_index].actual.port.io1.o, + self.frames.p[lane_index].actual.port.io0.o).eq(rev_data.word_select(cycle, 2)) + m.d.comb += Cat(self.frames.p[lane_index].actual.port.io1.oe, + self.frames.p[lane_index].actual.port.io0.oe).eq(0b11) with m.Case(QSPIMode.GetX2): - m.d.comb += self.frames.p.i_en.eq(1) + m.d.comb += self.frames.p[1].actual.i_en.eq(1) with m.Case(QSPIMode.PutX4): - m.d.comb += Cat(self.frames.p.port.io3.o, - self.frames.p.port.io2.o, - self.frames.p.port.io1.o, - self.frames.p.port.io0.o).eq(rev_data.word_select(cycle, 4)) - m.d.comb += Cat(self.frames.p.port.io3.oe, - self.frames.p.port.io2.oe, - 
self.frames.p.port.io1.oe, - self.frames.p.port.io0.oe).eq(0b1111) + for lane_index in range(2): + m.d.comb += Cat(self.frames.p[lane_index].actual.port.io3.o, + self.frames.p[lane_index].actual.port.io2.o, + self.frames.p[lane_index].actual.port.io1.o, + self.frames.p[lane_index].actual.port.io0.o).eq(rev_data.word_select(cycle, 4)) + m.d.comb += Cat(self.frames.p[lane_index].actual.port.io3.oe, + self.frames.p[lane_index].actual.port.io2.oe, + self.frames.p[lane_index].actual.port.io1.oe, + self.frames.p[lane_index].actual.port.io0.oe).eq(0b1111) with m.Case(QSPIMode.GetX4): - m.d.comb += self.frames.p.i_en.eq(1) - m.d.comb += self.frames.p.port.cs.o.eq((1 << self.octets.p.chip)[1:]) - m.d.comb += self.frames.p.port.cs.oe.eq(1) - m.d.comb += self.frames.p.meta.eq(self.octets.p.mode) + m.d.comb += self.frames.p[1].actual.i_en.eq(1) + for lane_index in range(2): + m.d.comb += self.frames.p[lane_index].actual.port.cs.o.eq((1 << self.octets.p.chip)[1:]) + m.d.comb += self.frames.p[lane_index].actual.port.cs.oe.eq(1) + m.d.comb += self.frames.p[1].meta.eq(self.octets.p.mode) return m @@ -101,7 +112,7 @@ def elaborate(self, platform): class QSPIDeframer(wiring.Component): # meow :3 def __init__(self): super().__init__({ - "frames": In(IOStreamer.i_stream_signature({ + "frames": In(IOClockerDeframer.o_stream_signature({ "io0": ("io", 1), "io1": ("io", 1), "io2": ("io", 1), @@ -118,7 +129,7 @@ def elaborate(self, platform): cycle = Signal(range(8)) m.d.comb += self.frames.ready.eq(~self.octets.valid | self.octets.ready) with m.If(self.frames.valid): - with m.Switch(self.frames.p.meta): + with m.Switch(self.frames.p[1].meta): with m.Case(QSPIMode.GetX1, QSPIMode.Swap): m.d.comb += self.octets.valid.eq(cycle == 7) with m.Case(QSPIMode.GetX2): @@ -129,17 +140,17 @@ def elaborate(self, platform): m.d.sync += cycle.eq(Mux(self.octets.valid, 0, cycle + 1)) data_reg = Signal(8) - with m.Switch(self.frames.p.meta): + with m.Switch(self.frames.p[1].meta): with 
m.Case(QSPIMode.GetX1, QSPIMode.Swap): # note: samples IO1 - m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io1.i, data_reg)) + m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io1.i, data_reg)) with m.Case(QSPIMode.GetX2): - m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io0.i, - self.frames.p.port.io1.i, data_reg)) + m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io0.i, + self.frames.p[1].actual.port.io1.i, data_reg)) with m.Case(QSPIMode.GetX4): - m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io0.i, - self.frames.p.port.io1.i, - self.frames.p.port.io2.i, - self.frames.p.port.io3.i, data_reg)) + m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io0.i, + self.frames.p[1].actual.port.io1.i, + self.frames.p[1].actual.port.io2.i, + self.frames.p[1].actual.port.io3.i, data_reg)) with m.If(self.frames.valid & self.frames.ready): m.d.sync += data_reg.eq(self.octets.p.data) @@ -147,7 +158,7 @@ def elaborate(self, platform): class QSPIController(wiring.Component): - def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_half_clocks=0): + def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, divisor_width=16, max_sample_delay_half_clocks=0, min_divisor=0): assert len(ports.sck) == 1 and ports.sck.direction in (io.Direction.Output, io.Direction.Bidir) assert len(ports.io) == 4 and ports.io.direction == io.Direction.Bidir assert len(ports.cs) >= 1 and ports.cs.direction in (io.Direction.Output, io.Direction.Bidir) @@ -162,7 +173,9 @@ def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_h ) self._ddr = use_ddr_buffers self._chip_count = chip_count - self._sample_delay_half_clocks = sample_delay_half_clocks + self._divisor_width = divisor_width + self._max_sample_delay_half_clocks = max_sample_delay_half_clocks + self._min_divisor = min_divisor super().__init__({ "o_octets": In(stream.Signature(data.StructLayout({ @@ -174,7 +187,8 
@@ def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_h "data": 8 }))), - "divisor": In(16), + "divisor": In(divisor_width), + "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1)) }) def elaborate(self, platform): @@ -194,36 +208,28 @@ def elaborate(self, platform): connect(m, controller=flipped(self.o_octets), enframer=enframer.octets) m.submodules.io_clocker = io_clocker = IOClocker(ioshape, - clock="sck", o_ratio=ratio, meta_layout=QSPIMode) + o_ratio=ratio, meta_layout=QSPIMode, divisor_width=self._divisor_width) connect(m, enframer=enframer.frames, io_clocker=io_clocker.i_stream) m.d.comb += io_clocker.divisor.eq(self.divisor) - m.submodules.io_streamer = io_streamer = IOStreamer(ioshape, self._ports, init={ + m.submodules.io_streamer = io_streamer = IOStreamerTop(ioshape, self._ports, init={ "sck": {"o": 1, "oe": 1}, # Motorola "Mode 3" with clock idling high "cs": {"o": 0, "oe": 1}, # deselected - }, ratio=ratio, meta_layout=QSPIMode, - sample_delay_half_clocks=self._sample_delay_half_clocks) + }, ratio=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=QSPIMode), + divisor_width=self._divisor_width, + max_sample_delay_half_clocks=self._max_sample_delay_half_clocks, + min_divisor=self._min_divisor) connect(m, io_clocker=io_clocker.o_stream, io_streamer=io_streamer.o_stream) + m.d.comb += io_streamer.divisor.eq(self.divisor) + m.d.comb += io_streamer.sample_delay_half_clocks.eq(self.sample_delay_half_clocks) + + m.submodules.io_clocker_deframer = io_clocker_deframer = IOClockerDeframer(ioshape, + i_ratio=ratio, meta_layout=QSPIMode) + connect(m, io_streamer=io_streamer.i_stream, io_clocker_deframer=io_clocker_deframer.i_stream) + m.d.comb += io_clocker_deframer.divisor.eq(self.divisor) m.submodules.deframer = deframer = QSPIDeframer() - m.d.comb += [ # connect() wouldn't work if DDR buffers are used - deframer.frames.p.port.io0.i.eq(io_streamer.i_stream.p.port.io0.i[0]), - 
deframer.frames.p.port.io1.i.eq(io_streamer.i_stream.p.port.io1.i[0]), - deframer.frames.p.port.io2.i.eq(io_streamer.i_stream.p.port.io2.i[0]), - deframer.frames.p.port.io3.i.eq(io_streamer.i_stream.p.port.io3.i[0]), - deframer.frames.p.meta.eq(io_streamer.i_stream.p.meta), - deframer.frames.valid.eq(io_streamer.i_stream.valid), - io_streamer.i_stream.ready.eq(deframer.frames.ready), - ] - - if self._ddr: - with m.If(self.divisor == 0): - m.d.comb += [ - deframer.frames.p.port.io0.i.eq(io_streamer.i_stream.p.port.io0.i[1]), - deframer.frames.p.port.io1.i.eq(io_streamer.i_stream.p.port.io1.i[1]), - deframer.frames.p.port.io2.i.eq(io_streamer.i_stream.p.port.io2.i[1]), - deframer.frames.p.port.io3.i.eq(io_streamer.i_stream.p.port.io3.i[1]), - ] + connect(m, io_clocker_deframer=io_clocker_deframer.o_stream, deframer=deframer.frames) connect(m, deframer=deframer.octets, controller=flipped(self.i_octets)) diff --git a/software/tests/gateware/test_iostream.py b/software/tests/gateware/test_iostream.py index 2b50d982f..6a8fe9202 100644 --- a/software/tests/gateware/test_iostream.py +++ b/software/tests/gateware/test_iostream.py @@ -5,7 +5,7 @@ from amaranth.lib import io from glasgow.gateware.ports import PortGroup -from glasgow.gateware.iostream import IOStreamer +from glasgow.gateware.iostream import IOStreamerTop MAX_SKIDBUFFER_SIZE = 4 @@ -55,7 +55,7 @@ class IOStreamTimeoutError(Exception): class IOStreamTestCase(unittest.TestCase): def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_bits, timeout_clocks=None, iready_comb_path=False): """ - This is a latency-agnostic test, that verifies that the IOStreamer samples the inputs at the same time as the output signals change. + This is a latency-agnostic test, that verifies that the IOStreamerTop samples the inputs at the same time as the output signals change. o_valid_bits: is a string of "1"s and "0"s. Each character refers to one (or more) clock cycles. 
"1" means to send a payload, and "0" means to leave o_stream idle for 1 clock cycle. When sending a payload o_stream is waited upon if it's not ready, so @@ -76,7 +76,7 @@ def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_ - back-pressure on i_stream may result in back-pressure on o_stream, never allowing the full o_valid_bits string to be completely played back, and resulting in a timeout. - the playback of the o_valid_bits string may complete, however it's possible a number of sample requests that are in flight remain stuck - in the IOStreamer, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final + in the IOStreamerTop, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final samples have been lost. To make sure a testcase completes, you will see some testcases have a large number of "1"s in this string past the length of o_valid_bits. @@ -93,7 +93,7 @@ def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_ if timeout_clocks is None: timeout_clocks = len(o_valid_bits) + len(i_ready_bits) + 20 - dut = IOStreamer({ + dut = IOStreamerTop({ "clk_out": ("o", 1), "data_in": ("i", 8), }, ports, meta_layout=4) @@ -118,7 +118,7 @@ async def input_generator_tb(ctx): async def save_expected_sample_values_tb(ctx): """ This testbench looks at the clk_out port and when it sees a positive or negative edge it knows that - IOStreamer is expected to sample the input signal, so the current state of the data_in port + IOStreamerTop is expected to sample the input signal, so the current state of the data_in port becomes one of the expected sampled values. This is saved into expected_sample[] to be compared later, when the sample actually arrives back on i_stream. 
""" @@ -135,7 +135,8 @@ async def i_stream_consumer_tb(ctx): if i_ready_bit == "1" and (not iready_comb_path or ctx.get(dut.i_stream.valid)): payload = await stream_get_maybe(ctx,dut.i_stream) if payload is not None: - actually_sampled.append(payload.port.data_in.i) + actually_sampled.append(payload[0].actual.port.data_in.i) + assert payload[0].actual.i_valid else: await ctx.tick() i_stream_consumer_finished = True @@ -168,15 +169,17 @@ async def main_testbench(ctx): if i_en_bit: expected_samples_count += 1 o_bit ^= 1 - await stream_put(ctx, dut.o_stream, { + await stream_put(ctx, dut.o_stream, [{ "meta": i, - "i_en": i_en_bit, - "port": { - "clk_out": { - "o": o_bit, + "actual": { + "i_en": i_en_bit, + "port": { + "clk_out": { + "o": o_bit, + } } } - }) + }]) else: await ctx.tick() @@ -184,7 +187,7 @@ async def main_testbench(ctx): await ctx.tick() assert len(actually_sampled) == expected_samples_count # This should be checked as well, because a - # possible failure mode is if IOStreamer never generates clock edges. We don't want to end up + # possible failure mode is if IOStreamerTop never generates clock edges. We don't want to end up # comparing two empty lists against eachother. assert actually_sampled == expected_sample, (f"Expected [" + ", ".join(f"0x{s:02x}" for s in expected_sample) + @@ -209,7 +212,7 @@ def test_sdr_input_sampled_correctly(self): def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_bits, timeout_clocks=None, iready_comb_path=False): """ - This is a latency-agnostic test, that verifies that the IOStreamer samples the inputs at the same time as the output signals change. + This is a latency-agnostic test, that verifies that the IOStreamerTop samples the inputs at the same time as the output signals change. o_valid_bits: is a string of "1"s and "0"s. Each character refers to one (or more) internal clock cycles. "1" means to send a payload, and "0" means to leave o_stream idle for 1 clock cycle. 
When sending a payload o_stream is waited upon if it's not ready, so @@ -231,7 +234,7 @@ def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_ - back-pressure on i_stream may result in back-pressure on o_stream, never allowing the full o_valid_bits string to be completely played back, and resulting in a timeout. - the playback of the o_valid_bits string may complete, however it's possible a number of sample requests that are in flight remain stuck - in the IOStreamer, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final + in the IOStreamerTop, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final samples have been lost. To make sure a testcase completes, you will see some testcases have a large number of "1"s in this string past the length of o_valid_bits. @@ -251,7 +254,7 @@ def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_ CLK_PERIOD = 1e-6 - dut = IOStreamer({ + dut = IOStreamerTop({ "clk_out": ("o", 1), "data_in": ("i", 8), }, ports, ratio=2, meta_layout=4) @@ -280,7 +283,7 @@ async def input_generator_tb(ctx): async def save_expected_sample_values_tb(ctx): """ This testbench looks at the clk_out port and when it sees a positive edge it knows that - IOStreamer is expected to sample the input signal, so the current state of the data_in port + IOStreamerTop is expected to sample the input signal, so the current state of the data_in port becomes one of the expected sampled values. This is saved into expected_sample[] to be compared later, when the sample actually arrives back on i_stream. 
""" @@ -298,7 +301,9 @@ async def i_stream_consumer_tb(ctx): if i_ready_bit == "1" and (not iready_comb_path or ctx.get(dut.i_stream.valid)): payload = await stream_get_maybe(ctx,dut.i_stream) if payload is not None: - data = payload.port.data_in.i[0], payload.port.data_in.i[1] + data = payload[0].actual.port.data_in.i, payload[1].actual.port.data_in.i + assert payload[0].actual.i_valid + assert payload[1].actual.i_valid actually_sampled.append(data) else: await ctx.tick() @@ -329,15 +334,30 @@ async def main_testbench(ctx): if o_valid_bit: if i_en_bit: expected_samples_count += 1 - await stream_put(ctx, dut.o_stream, { + await stream_put(ctx, dut.o_stream, [ + { "meta": i, - "i_en": i_en_bit, - "port": { - "clk_out": { - "o": (i_en_bit, 0), + "actual": { + "i_en": i_en_bit, + "port": { + "clk_out": { + "o": i_en_bit, + } } } - }) + }, + { + "meta": i, + "actual": { + "i_en": i_en_bit, + "port": { + "clk_out": { + "o": 0, + } + } + } + }, + ]) else: await ctx.tick() @@ -345,7 +365,7 @@ async def main_testbench(ctx): await ctx.tick() assert len(actually_sampled) == expected_samples_count # This should be checked as well, because a - # possible failure mode is if IOStreamer never generates clock edges. We don't want to end up + # possible failure mode is if IOStreamerTop never generates clock edges. We don't want to end up # comparing two empty lists against eachother. 
assert actually_sampled == expected_sample, (f"Expected [" + @@ -434,45 +454,45 @@ def test_basic(self): ports = PortGroup() ports.data = port = io.SimulationPort("io", 1) - dut = IOStreamer({ + dut = IOStreamerTop({ "data": ("io", 1), }, ports, meta_layout=4) async def testbench(ctx): await ctx.tick() - ctx.set(dut.o_stream.p.port.data.o[0], 1) - ctx.set(dut.o_stream.p.port.data.oe, 0) - ctx.set(dut.o_stream.p.i_en, 1) - ctx.set(dut.o_stream.p.meta, 1) + ctx.set(dut.o_stream.p[0].actual.port.data.o, 1) + ctx.set(dut.o_stream.p[0].actual.port.data.oe, 0) + ctx.set(dut.o_stream.p[0].actual.i_en, 1) + ctx.set(dut.o_stream.p[0].meta, 1) ctx.set(dut.o_stream.valid, 1) ctx.set(dut.i_stream.ready, 1) await ctx.tick() assert ctx.get(port.o[0]) == 1 assert ctx.get(port.oe) == 0 assert ctx.get(dut.i_stream.valid) == 1 - assert ctx.get(dut.i_stream.p.port.data.i[0]) == 0 - assert ctx.get(dut.i_stream.p.meta) == 1 + assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 0 + assert ctx.get(dut.i_stream.p[0].meta) == 1 - ctx.set(dut.o_stream.p.port.data.oe, 1) - ctx.set(dut.o_stream.p.meta, 2) + ctx.set(dut.o_stream.p[0].actual.port.data.oe, 1) + ctx.set(dut.o_stream.p[0].meta, 2) await ctx.tick() assert ctx.get(port.o[0]) == 1 assert ctx.get(port.oe) == 1 assert ctx.get(dut.i_stream.valid) == 1 - assert ctx.get(dut.i_stream.p.port.data.i[0]) == 0 - assert ctx.get(dut.i_stream.p.meta) == 2 + assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 0 + assert ctx.get(dut.i_stream.p[0].meta) == 2 - ctx.set(dut.o_stream.p.meta, 3) + ctx.set(dut.o_stream.p[0].meta, 3) await ctx.tick() assert ctx.get(port.o[0]) == 1 assert ctx.get(port.oe) == 1 assert ctx.get(dut.i_stream.valid) == 1 - assert ctx.get(dut.i_stream.p.port.data.i[0]) == 1 - assert ctx.get(dut.i_stream.p.meta) == 3 + assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 1 + assert ctx.get(dut.i_stream.p[0].meta) == 3 - ctx.set(dut.o_stream.p.port.data.o[0], 0) - ctx.set(dut.o_stream.p.i_en, 0) + 
ctx.set(dut.o_stream.p[0].actual.port.data.o[0], 0) + ctx.set(dut.o_stream.p[0].actual.i_en, 0) await ctx.tick() assert ctx.get(port.o[0]) == 0 assert ctx.get(port.oe) == 1 @@ -495,7 +515,7 @@ def test_skid(self): ports = PortGroup() ports.data = port = io.SimulationPort("io", 4) - dut = IOStreamer({ + dut = IOStreamerTop({ "data": ("io", 4), }, ports, meta_layout=4) @@ -504,31 +524,31 @@ async def testbench(ctx): ctx.set(dut.i_stream.ready, 1) ctx.set(dut.o_stream.valid, 1) - ctx.set(dut.o_stream.p.i_en, 1) - ctx.set(dut.o_stream.p.meta, 0b0101) + ctx.set(dut.o_stream.p[0].actual.i_en, 1) + ctx.set(dut.o_stream.p[0].meta, 0b0101) ctx.set(port.i, 0b0101) await ctx.tick() - assert ctx.get(dut.i_stream.p.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}" - assert ctx.get(dut.i_stream.p.meta) == 0b0101, f"{ctx.get(dut.i_stream.p.meta):#06b}" + assert ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}" + assert ctx.get(dut.i_stream.p[0].meta) == 0b0101, f"{ctx.get(dut.i_stream.p[0].meta):#06b}" - ctx.set(dut.o_stream.p.meta, 0b1111) + ctx.set(dut.o_stream.p[0].meta, 0b1111) ctx.set(port.i, 0b1111) ctx.set(dut.i_stream.ready, 0) await ctx.tick().repeat(10) # The skid buffer should protect the input stream from changes on the input signal - assert ctx.get(dut.i_stream.p.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}" - assert ctx.get(dut.i_stream.p.meta) == 0b0101, f"{ctx.get(dut.i_stream.p.meta):#06b}" + assert ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}" + assert ctx.get(dut.i_stream.p[0].meta) == 0b0101, f"{ctx.get(dut.i_stream.p[0].meta):#06b}" ctx.set(dut.i_stream.ready, 1) await ctx.tick() - assert ctx.get(dut.i_stream.p.port.data.i) == 0b1111, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}" - assert ctx.get(dut.i_stream.p.meta) == 0b1111, f"{ctx.get(dut.i_stream.p.meta):#06b}" + assert 
ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b1111, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}" + assert ctx.get(dut.i_stream.p[0].meta) == 0b1111, f"{ctx.get(dut.i_stream.p[0].meta):#06b}" sim = Simulator(dut) sim.add_clock(1e-6) diff --git a/software/tests/gateware/test_qspi.py b/software/tests/gateware/test_qspi.py index 65e43b488..fa46faeda 100644 --- a/software/tests/gateware/test_qspi.py +++ b/software/tests/gateware/test_qspi.py @@ -181,19 +181,36 @@ async def data_put(*, chip, data, mode): async def testbench_out(ctx): async def bits_get(*, cs, ox, oe, i_en, mode): for cycle, o in enumerate(ox): - expected = { - "bypass": (cs == 0), - "port": { - "sck": {"o": 1, "oe": 1}, - "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1}, - "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1}, - "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1}, - "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1}, - "cs": {"o": cs, "oe": 1}, - }, - "i_en": i_en, - "meta": mode - } + expected = [ + { + "actual": { + "port": { + "sck": {"o": ~cs, "oe": 1}, + "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1}, + "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1}, + "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1}, + "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1}, + "cs": {"o": cs, "oe": 1}, + }, + "i_en": 0, + }, + "meta": 0, + }, + { + "actual": { + "port": { + "sck": {"o": 1, "oe": 1}, + "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1}, + "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1}, + "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1}, + "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1}, + "cs": {"o": cs, "oe": 1}, + }, + "i_en": i_en, + }, + "meta": mode, + }, + ] assert (actual := await stream_get(ctx, dut.frames)) == expected, \ f"(cycle {cycle}) {actual} != {expected}" @@ -232,15 +249,33 @@ def test_qspi_deframer(self): async def testbench_in(ctx): async def bits_put(*, ix, mode): for cycle, i in enumerate(ix): - await stream_put(ctx, dut.frames, { - "port": { - "io0": {"i": (i>>0)&1}, - "io1": {"i": (i>>1)&1}, - "io2": {"i": (i>>2)&1}, - "io3": {"i": (i>>3)&1}, - }, - "meta": mode 
- }) + await stream_put(ctx, dut.frames, + [ + { + "actual": { + "port": { + "io0": {"i": 0}, + "io1": {"i": 0}, + "io2": {"i": 0}, + "io3": {"i": 0}, + }, + "i_valid": 0, + }, + "meta": mode + }, + { + "actual": { + "port": { + "io0": {"i": (i>>0)&1}, + "io1": {"i": (i>>1)&1}, + "io2": {"i": (i>>2)&1}, + "io3": {"i": (i>>3)&1}, + }, + "i_valid": 1, + }, + "meta": mode + }, + ]) await bits_put(ix=[i<<1 for i in [1,0,1,1,1,0,1,0]], mode=QSPIMode.Swap) @@ -290,7 +325,7 @@ def subtest_qspi_controller(self, *, use_ddr_buffers:bool, divisor:int, roundtri ports.io = io.SimulationPort("io", 4) ports.cs = io.SimulationPort("o", 1) - dut = QSPIController(ports, use_ddr_buffers=use_ddr_buffers, sample_delay_half_clocks=sample_delay_half_clocks) + dut = QSPIController(ports, use_ddr_buffers=use_ddr_buffers, max_sample_delay_half_clocks=sample_delay_half_clocks, min_divisor=divisor) async def testbench_controller(ctx): async def ctrl_idle(): @@ -323,6 +358,7 @@ async def ctrl_get(*, mode, count=1): return words if divisor is not None: ctx.set(dut.divisor, divisor) + ctx.set(dut.sample_delay_half_clocks, sample_delay_half_clocks) await ctrl_idle() @@ -435,3 +471,41 @@ def test_qspi_controller_needs_sample_delay_sdr_div0_too_little_turnaround(self) pass else: assert False, "QSPI controller should have failed with too little turnaround time" + + # The Div10 tests are here to verify the optimized sample_request_delayer: + def test_qspi_controller_ddr_div10(self): + self.subtest_qspi_controller(use_ddr_buffers=True, divisor=10) + + def test_qspi_controller_needs_sample_delay_ddr_div10_max_turnaround(self): + for sample_delay_half_clocks in [7, 8, 9, 10, 20]: + divisor = 10 + self.subtest_qspi_controller(use_ddr_buffers=True, + divisor=divisor, + roundtrip_time_s=self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * (0.5 * sample_delay_half_clocks) + self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * divisor * 2 * 0.49, + sample_delay_half_clocks=sample_delay_half_clocks) + + def 
test_qspi_controller_needs_sample_delay_ddr_div10_too_much_turnaround(self): + for sample_delay_half_clocks in [7, 8, 9, 10, 20]: + try: + divisor = 10 + self.subtest_qspi_controller(use_ddr_buffers=True, + divisor=divisor, + roundtrip_time_s=self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * (0.5 * sample_delay_half_clocks) + self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * divisor * 2 * 0.51, + sample_delay_half_clocks = sample_delay_half_clocks) + except AssertionError: + pass + else: + assert False, "QSPI controller should have failed with too much turnaround time" + + def test_qspi_controller_needs_sample_delay_ddr_div10_too_little_turnaround(self): + for sample_delay_half_clocks in [7, 8, 9, 10, 20]: + try: + divisor = 10 + self.subtest_qspi_controller(use_ddr_buffers=True, + divisor=divisor, + roundtrip_time_s=self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * (0.5 * sample_delay_half_clocks) - self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD * divisor * 2 * 0.51, + sample_delay_half_clocks = sample_delay_half_clocks) + except AssertionError: + pass + else: + assert False, "QSPI controller should have failed with too little turnaround time"