diff --git a/software/glasgow/applet/interface/qspi_controller/__init__.py b/software/glasgow/applet/interface/qspi_controller/__init__.py
index 4fcdca35e..a5f7729ef 100644
--- a/software/glasgow/applet/interface/qspi_controller/__init__.py
+++ b/software/glasgow/applet/interface/qspi_controller/__init__.py
@@ -32,8 +32,10 @@ def elaborate(self, platform):
         m = Module()
 
         m.submodules.qspi = qspi = QSPIController(self._ports, use_ddr_buffers=True,
-                                                  sample_delay_half_clocks = self._sample_delay_half_clocks)
+                                                  max_sample_delay_half_clocks=self._sample_delay_half_clocks,
+                                                  min_divisor=self._divisor)
         m.d.comb += qspi.divisor.eq(self._divisor)
+        m.d.comb += qspi.sample_delay_half_clocks.eq(self._sample_delay_half_clocks)
 
         o_fifo  = self._out_fifo.stream
         i_fifo  = self._in_fifo.stream
diff --git a/software/glasgow/gateware/iostream.py b/software/glasgow/gateware/iostream.py
index fb579f10c..b4d431996 100644
--- a/software/glasgow/gateware/iostream.py
+++ b/software/glasgow/gateware/iostream.py
@@ -5,7 +5,7 @@
 from glasgow.gateware.ports import PortGroup
 
 
-__all__ = ["IOStreamer"]
+__all__ = ["IOStreamerTop"]
 
 
 def _filter_ioshape(direction, ioshape):
@@ -65,71 +65,91 @@ def elaborate(self, platform):
 
         return m
 
+def LaneLayout(actual_layout, /, *, meta_layout=0):
+    return data.StructLayout({
+        "actual": actual_layout,
+        "meta": meta_layout,
+    })
 
-class IOStreamer(wiring.Component):
-    """I/O buffer to stream adapter.
+def MetaLayoutWithTag(*, tag_layout, meta_layout=0):
+    return data.StructLayout({
+        "inner_meta": meta_layout,
+        "tag": tag_layout,
+        "last": 1,
+    })
 
-    This adapter instantiates I/O buffers for a port (FF or DDR) and connects them to a pair of
-    streams, one for the outputs of the buffers and one for the inputs. Whenever an `o_stream`
-    transfer occurs, the state of the output is updated _t1_ cycles later; if `o_stream.p.i_en`
-    is set, then _t2_ cycles later, a payload with the data captured at the same time as
-    the outputs were updated appears on `i_stream.p.i`.
+def IOOutputActualLayout(ioshape):
+    return data.StructLayout({
+        "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({
+            "o":  width,
+            "oe": 1,
+        })),
+        "i_en": 1,
+    })
 
-    Arbitrary ancillary data may be provided with `o_stream` transfers via `o_stream.p.meta`,
-    and this data will be relayed back as `i_stream.p.meta` with the output-to-input latency
-    of the buffer. Higher-level protocol engines can use this data to indicate how the inputs
-    must be processed without needing counters or state machines on a higher level to match
-    the latency (and, usually, without needing any knowledge of the latency at all).
+def IOOutputStreamSignature(ioshape, /, lane_count=2, *, meta_layout=0):
+    actual_layout = IOOutputActualLayout(ioshape)
+    return stream.Signature(
+        data.ArrayLayout(
+            LaneLayout(actual_layout, meta_layout=meta_layout),
+            lane_count
+        )
+    )
 
-    On reset, output ports have their drivers enabled, and bidirectional ports have them disabled.
-    All of the signals are deasserted, which could be a low or a high level depending on the port
-    polarity.
-    """
+def IOInputActualLayout(ioshape):
+    return data.StructLayout({
+        "port": _map_ioshape("i", ioshape, lambda width: data.StructLayout({
+            "i":  width,
+        })),
+        "i_valid": 1,
+    })
 
-    @staticmethod
-    def o_stream_signature(ioshape, /, *, ratio=1, meta_layout=0):
-        return stream.Signature(data.StructLayout({
-            "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({
-                "o":  width if ratio == 1 else data.ArrayLayout(width, ratio),
-                "oe": 1,
-            })),
-            "i_en": 1,
-            "meta": meta_layout,
-        }))
+def IOInputStreamSignature(ioshape, /, lane_count=2, *, meta_layout=0):
+    actual_layout = IOInputActualLayout(ioshape)
+    return stream.Signature(
+        data.ArrayLayout(
+            LaneLayout(actual_layout, meta_layout=meta_layout),
+            lane_count
+        )
+    )
 
-    @staticmethod
-    def i_stream_signature(ioshape, /, *, ratio=1, meta_layout=0):
-        return stream.Signature(data.StructLayout({
-            "port": _map_ioshape("i", ioshape, lambda width: data.StructLayout({
-                "i":  width if ratio == 1 else data.ArrayLayout(width, ratio),
-            })),
-            "meta": meta_layout,
-        }))
-
-    def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0, sample_delay_half_clocks=0):
-        if ratio == 1:
-            assert (sample_delay_half_clocks % 2) == 0
+class IOStreamer(wiring.Component):
+    def __init__(self, ioshape, ports, /, *, ratio=1, meta_layout=0):
         assert isinstance(ioshape, (int, dict))
         assert ratio in (1, 2)
 
         self._ioshape = ioshape
-        self._ports   = ports
         self._ratio   = ratio
-        self._init    = init
-        self._sample_delay_half_clocks = sample_delay_half_clocks
+        self._ports   = ports
 
         super().__init__({
-            "o_stream":  In(self.o_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)),
-            "i_stream": Out(self.i_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)),
+            "o_stream":  In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
+            "i_stream": Out(IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
         })
 
+        self.o_stream.valid = Const(1)
+        self.o_stream.ready = Const(1)
+        for lane_index in range(ratio):
+            self.o_stream.p[lane_index].actual.i_en = Const(1) # Must always sample!
+        # self.i_stream.valid = Const(1) # i_stream is not really valid for the first `latency` cycles after reset
+        self.i_stream.ready = Const(1)
+
+    def get_latency(self, platform):
+        # May be platform-dependent in the future
+        if self._ratio == 1:
+            return 1
+        if self._ratio == 2:
+            return 2
+
     def elaborate(self, platform):
         m = Module()
 
         if self._ratio == 1:
-            buffer_cls, latency = io.FFBuffer, 1 + self._sample_delay_half_clocks // 2
+            buffer_cls = io.FFBuffer
         if self._ratio == 2:
-            buffer_cls, latency = SimulatableDDRBuffer, 2 + (self._sample_delay_half_clocks // 2) + (self._sample_delay_half_clocks % 2)
+            buffer_cls = SimulatableDDRBuffer
+
+        latency = self.get_latency(platform)
 
         if isinstance(self._ports, io.PortLike):
             m.submodules.buffer = buffer = buffer_cls("io", self._ports)
@@ -139,30 +159,17 @@ def elaborate(self, platform):
                 direction, _width = self._ioshape[name]
                 m.submodules[f"buffer_{name}"] = buffer[name] = buffer_cls(direction, sub_port)
 
-        o_latch = Signal(_map_ioshape("o", self._ioshape, lambda width: data.StructLayout({
-            "o":  width,
-            "oe": 1,
-        })), init=self._init)
-        with m.If(self.o_stream.valid & self.o_stream.ready):
+        for lane_index in range(self._ratio):
             for _, buffer_parts, stream_parts in _iter_ioshape("o", self._ioshape,
-                    buffer, self.o_stream.p.port):
-                m.d.comb += buffer_parts.o.eq(stream_parts.o)
-                m.d.comb += buffer_parts.oe.eq(stream_parts.oe)
-            for _, latch_parts, stream_parts in _iter_ioshape("o", self._ioshape,
-                    o_latch, self.o_stream.p.port):
-                if self._ratio == 1:
-                    m.d.sync += latch_parts.o.eq(stream_parts.o)
-                else:
-                    m.d.sync += latch_parts.o.eq(stream_parts.o[-1])
-                m.d.sync += latch_parts.oe.eq(stream_parts.oe)
-        with m.Else():
-            for _, buffer_parts, latch_parts in _iter_ioshape("o", self._ioshape,
-                    buffer, o_latch):
-                if self._ratio == 1:
-                    m.d.comb += buffer_parts.o.eq(latch_parts.o)
-                else:
-                    m.d.comb += buffer_parts.o.eq(latch_parts.o.replicate(self._ratio))
-                m.d.comb += buffer_parts.oe.eq(latch_parts.oe)
+                    buffer, self.o_stream.p[lane_index].actual.port):
+                m.d.comb += buffer_parts.o[lane_index].eq(stream_parts.o)
+
+        for name, buffer_parts in _iter_ioshape("o", self._ioshape,
+                    buffer):
+            oe_any = self.o_stream.p[0].actual.port[name].oe
+            for lane_index in range(1, self._ratio):
+                oe_any |= self.o_stream.p[lane_index].actual.port[name].oe
+            m.d.comb += buffer_parts.oe.eq(oe_any)  # driver is enabled if any lane asserts oe
 
         def delay(value, name):
             delayed_values = []
@@ -173,149 +180,644 @@ def delay(value, name):
                 delayed_values.append(next_value)
             return delayed_values
 
-        i_en_delays = delay(self.o_stream.valid & self.o_stream.ready &
-                            self.o_stream.p.i_en, name="i_en")
-        i_en = i_en_delays[-1]
-        meta = delay(self.o_stream.p.meta, name="meta")[-1]
+        i_en = delay(Const(1), name="i_en")[-1] # We always output samples, except for `latency` cycles after reset
+        for lane_index in range(self._ratio):
+            for name, i_payload_parts, buffer_parts in _iter_ioshape("i", self._ioshape, self.i_stream.p[lane_index].actual.port, buffer):
+                if self._ratio > 1:
+                    m.d.comb += i_payload_parts.i.eq(buffer_parts.i[lane_index])
+                else:
+                    m.d.comb += i_payload_parts.i.eq(buffer_parts.i)
+            m.d.comb += self.i_stream.p[lane_index].actual.i_valid.eq(1)
+        m.d.comb += self.i_stream.valid.eq(i_en)
+
+        return m
+
+
+class StreamStretcher(wiring.Component):
+    """
+    This component makes sure that any stream is not allowed to transfer more often
+    than every `divisor` cycles. If `divisor` is 0 or 1, then the StreamStretcher has
+    no effect.
+    """
+    def __init__(self, stream_signature, *, divisor_width=16):
+        super().__init__({
+            "i_stream":  In(stream_signature),
+            "o_stream": Out(stream_signature),
+            "divisor": In(divisor_width),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+        timer = Signal.like(self.divisor)
+        timer_done = Signal()
+        m.d.comb += timer_done.eq((timer == 0) | (timer == 1))
+
+        m.d.comb += self.o_stream.p.eq(self.i_stream.p)
+        m.d.comb += self.o_stream.valid.eq(self.i_stream.valid & timer_done)
+        m.d.comb += self.i_stream.ready.eq(self.o_stream.ready & timer_done)
+
+        with m.If(timer_done):
+            with m.If(self.o_stream.ready & self.o_stream.valid):
+                m.d.sync += timer.eq(self.divisor)
+        with m.Else():
+            m.d.sync += timer.eq(timer - 1)
+
+        return m
+
+
+class IOLatcher(wiring.Component):
+    """
+    This component has an always valid, always ready output stream,
+    which passes through the "o" and "oe" fields when a transaction
+    is presented at the input stream, otherwise it keeps repeating the
+    last transaction, which it memorises.
+    Other fields, such as i_en and meta, are dropped.
+    """
+    def __init__(self, ioshape, /, *, ratio=1, init=None, meta_layout=0):
+        assert isinstance(ioshape, (int, dict))
+        assert ratio in (1, 2)
+
+        self._ioshape = ioshape
+        self._ratio   = ratio
+        self._init    = init
+
+        super().__init__({
+            "i_stream":  In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
+            "o_stream": Out(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
+        })
+
+        self.o_stream.valid = Const(1)
+        self.o_stream.ready = Const(1)
+        self.i_stream.ready = Const(1)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        o_latch = Signal(_map_ioshape("o", self._ioshape, lambda width: data.StructLayout({
+            "o":  width,
+            "oe": 1,
+        })), init=self._init)
+        with m.If(self.i_stream.valid & self.i_stream.ready):
+            for lane_index in range(self._ratio):
+                m.d.comb += self.o_stream.p[lane_index].actual.port.eq(self.i_stream.p[lane_index].actual.port)
+
+            for _, latch_parts, stream_parts in _iter_ioshape("o", self._ioshape,
+                    o_latch, self.i_stream.p[-1].actual.port):
+                m.d.sync += latch_parts.eq(stream_parts)
+
+        with m.Else():
+            for lane_index in range(self._ratio):
+                for _, simple_stream_parts, latch_parts in _iter_ioshape("o", self._ioshape,
+                        self.o_stream.p[lane_index].actual.port, o_latch):
+                    m.d.comb += simple_stream_parts.eq(latch_parts)
+
+        return m
+
+class SkidBuffer(wiring.Component):
+    """
+    This component is a generic skid buffer.
+    It is essentially a `depth` deep FIFO with a stream interface.
+    """
+    def __init__(self, depth, stream_signature):
+        self._depth = depth
+        super().__init__({
+            "i_stream":  In(stream_signature),
+            "o_stream": Out(stream_signature),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
 
         # This skid buffer is organized as a shift register to avoid any uncertainties associated
         # with the use of an async read memory. On platforms that have LUTRAM, this implementation
         # may be slightly worse than using LUTRAM, and may have to be revisited in the future.
-        skid = Array(Signal(self.i_stream.payload.shape(), name=f"skid_{stage}")
-                     for stage in range(1 + latency))
-        for name, skid_parts, buffer_parts in _iter_ioshape("i", self._ioshape, skid[0].port, buffer):
-            if self._sample_delay_half_clocks % 2:
-                m.d.comb += skid_parts.i[1].eq(buffer_parts.i[0])
-                i1_delayed = Signal.like(buffer_parts.i[1], name=f"{name}_i1_delayed")
-                m.d.sync += i1_delayed.eq(buffer_parts.i[1])
-                m.d.comb += skid_parts.i[0].eq(i1_delayed)
-            else:
-                m.d.comb += skid_parts.i.eq(buffer_parts.i)
-        m.d.comb += skid[0].meta.eq(meta)
-
-        skid_at = Signal(range(1 + latency))
-
-        with m.If(i_en):
-            for n_shift in range(latency):
+        skid = Array(Signal(self.i_stream.p.shape(), name=f"skid_{stage}")
+                     for stage in range(1 + self._depth))
+
+        skid_at = Signal(range(1 + self._depth))
+
+        m.d.comb += skid[0].eq(self.i_stream.p)
+
+        with m.If(self.i_stream.valid):
+            for n_shift in range(self._depth):
                 m.d.sync += skid[n_shift + 1].eq(skid[n_shift])
 
-        with m.If(i_en & ~self.i_stream.ready):
-            # m.d.sync += Assert(skid_at != latency)
+        not_full = Signal()
+        m.d.comb += not_full.eq(skid_at != self._depth)
+
+        m.d.comb += self.o_stream.p.eq(skid[skid_at])
+        m.d.comb += self.o_stream.valid.eq(self.i_stream.valid | (skid_at != 0))
+        m.d.comb += self.i_stream.ready.eq(self.o_stream.ready | not_full)
+
+        with m.If(self.i_stream.valid & self.i_stream.ready & ~self.o_stream.ready):
             m.d.sync += skid_at.eq(skid_at + 1)
-        with m.Elif((skid_at != 0) & ~i_en & self.i_stream.ready):
+        with m.Elif(~self.i_stream.valid & self.o_stream.valid & self.o_stream.ready):
             m.d.sync += skid_at.eq(skid_at - 1)
 
-        m.d.comb += self.i_stream.payload.eq(skid[skid_at])
-        m.d.comb += self.i_stream.valid.eq(i_en | (skid_at != 0))
-        m.d.comb += self.o_stream.ready.eq(self.i_stream.ready | ~((skid_at!=0) | Cat(*i_en_delays).any()))
+        return m
+
+class SampleRequestDelayer(wiring.Component):
+    def __init__(self, /, *, ratio, meta_layout, min_latency, max_sample_delay_half_clocks, min_divisor):
+        self._ratio = ratio
+        self._min_latency = min_latency
+        self._max_sample_delay_half_clocks = max_sample_delay_half_clocks
+        self._min_divisor = min_divisor
+        self._max_latency_except_hcyc = min_latency + self._max_sample_delay_half_clocks // 2
+
+        super().__init__({
+            "i_en": In(data.ArrayLayout(1, ratio)),
+            "meta": In(data.ArrayLayout(meta_layout, ratio)),
+            "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1)),
+            "i_en_delayed": Out(data.ArrayLayout(1, ratio)),
+            "meta_delayed": Out(data.ArrayLayout(meta_layout, ratio)),
+            "reads_in_flight": Out(1),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+
+        def delay(value, name, cycles):
+            delayed_values = Array(Signal(value.shape(), name=f"delayed_{name}_{stage}")
+                     for stage in range(cycles))
+            m.d.sync += delayed_values[0].eq(value)
+            for stage in range(1, cycles):
+                m.d.sync += delayed_values[stage].eq(delayed_values[stage-1])
+            return delayed_values
+
+        i_en_delayed_except_half_cyc = Signal.like(self.i_en_delayed)
+        meta_delayed_except_half_cyc = Signal.like(self.meta_delayed)
+        reads_in_flight_except_half_cyc = Signal.like(self.reads_in_flight)
+
+        # Following are two implementations: the second one is really simple, using only shift registers,
+        # while the first one relies on a `min_divisor` setting to use a counter for the first part of the
+        # delay mechanism.
+
+        # Some statistics using the memory-25x applet:
+        # divisor=24  sample_delay=0   => simple:  810 ICESTORM_LCs, optimized: 811 ICESTORM_LCs
+        # divisor=24  sample_delay=1   => simple:  830 ICESTORM_LCs, optimized: 832 ICESTORM_LCs
+        # divisor=24  sample_delay=2   => simple:  823 ICESTORM_LCs, optimized: 825 ICESTORM_LCs
+        # divisor=24  sample_delay=3   => simple:  832 ICESTORM_LCs, optimized: 824 ICESTORM_LCs
+        # divisor=24  sample_delay=6   => simple:  836 ICESTORM_LCs, optimized: 823 ICESTORM_LCs
+        # divisor=24  sample_delay=12  => simple:  849 ICESTORM_LCs, optimized: 825 ICESTORM_LCs
+        # divisor=24  sample_delay=24  => simple:  888 ICESTORM_LCs, optimized: 830 ICESTORM_LCs
+        # divisor=24  sample_delay=36  => simple:  928 ICESTORM_LCs, optimized: 833 ICESTORM_LCs
+        # divisor=24  sample_delay=47  => simple: 1001 ICESTORM_LCs, optimized: 877 ICESTORM_LCs
+        # divisor=3   sample_delay=0   => simple:  813 ICESTORM_LCs, optimized: 820 ICESTORM_LCs
+        # divisor=3   sample_delay=3   => simple:  859 ICESTORM_LCs, optimized: 860 ICESTORM_LCs
+        # divisor=3   sample_delay=6   => simple:  872 ICESTORM_LCs, optimized: 858 ICESTORM_LCs
+        # divisor=3   sample_delay=12  => simple:  894 ICESTORM_LCs, optimized: 901 ICESTORM_LCs
+        # divisor=3   sample_delay=24  => simple:  980 ICESTORM_LCs, optimized: 970 ICESTORM_LCs
+        # divisor=4   sample_delay=8   => simple:  874 ICESTORM_LCs, optimized: 866 ICESTORM_LCs
+        # divisor=4   sample_delay=16  => simple:  903 ICESTORM_LCs, optimized: 893 ICESTORM_LCs
+        # divisor=4   sample_delay=32  => simple:  999 ICESTORM_LCs, optimized: 988 ICESTORM_LCs
+        # divisor=5   sample_delay=10  => simple:  868 ICESTORM_LCs, optimized: 858 ICESTORM_LCs
+        # divisor=8   sample_delay=6   => simple:  836 ICESTORM_LCs, optimized: 830 ICESTORM_LCs
+        # divisor=8   sample_delay=16  => simple:  886 ICESTORM_LCs, optimized: 866 ICESTORM_LCs
+        # divisor=240 sample_delay=100 => simple: 1114 ICESTORM_LCs, optimized: 826 ICESTORM_LCs
+        # divisor=240 sample_delay=238 => simple: 1528 ICESTORM_LCs, optimized: 830 ICESTORM_LCs
+
+        #if self._min_divisor >= 1: # The optimized implementation works correctly as long as _min_divisor >= 1
+        if self._min_divisor >= 4:  # however it may not make sense to use it when min_divisor is a low number
+            # Optimized implementation using a counter as a first-stage delay mechanism
+            assert self._min_divisor >= 1, "with a divisor of less than 1, the counter logic wouldn't work"
+            assert self._min_latency >= 1, "with a min latency less then 1, and sample delay of zero, the counter logic wouldn't work"
+            counting = Signal()
+            counter = Signal(range(min(self._min_divisor, self._max_latency_except_hcyc)))
+            i_en_cached = Signal.like(self.i_en)
+            meta_cached = Signal.like(self.meta)
+            i_en_delay_chain_input = Signal.like(self.i_en)
+
+            latency_minus_1 = self._min_latency - 1 + self.sample_delay_half_clocks // 2
+
+            with m.If(counting):
+                with m.If((counter == self._min_divisor - 1) | 
+                          (counter == latency_minus_1)):
+                    m.d.sync += counting.eq(0)
+                    m.d.comb += i_en_delay_chain_input.eq(i_en_cached)
+                with m.Else():
+                    m.d.sync += counter.eq(counter + 1)
+
+            with m.If(Signal.cast(self.i_en).any()):
+                m.d.sync += (
+                    counting.eq(1),
+                    i_en_cached.eq(self.i_en),
+                    meta_cached.eq(self.meta),
+                    counter.eq(0),
+                )
+
+            m.d.comb += (
+                i_en_delayed_except_half_cyc.eq(i_en_delay_chain_input),
+                meta_delayed_except_half_cyc.eq(meta_cached),
+                reads_in_flight_except_half_cyc.eq(counting),
+            )
+
+            if self._max_latency_except_hcyc > self._min_divisor:
+                delay_chain_cycles = self._max_latency_except_hcyc - self._min_divisor
+                i_en_delays = delay(i_en_delay_chain_input, name=f"i_en", cycles=delay_chain_cycles)
+                meta_delays = delay(meta_cached, name=f"meta", cycles=delay_chain_cycles)
+
+                delay_selector = latency_minus_1 - self._min_divisor
+
+                i_en_in_flight_up_to = Array(Signal(1, name=f"i_en_in_flight_{stage}") for stage in range(delay_chain_cycles))
+                m.d.comb += i_en_in_flight_up_to[0].eq(Signal.cast(i_en_delays[0]).any())
+                for stage in range(1, delay_chain_cycles):
+                    value = Signal.cast(i_en_delays[stage]).any() | i_en_in_flight_up_to[stage - 1]
+                    m.d.comb += i_en_in_flight_up_to[stage].eq(value)
+
+                with m.If(latency_minus_1 >= self._min_divisor):
+                    m.d.comb += i_en_delayed_except_half_cyc.eq(i_en_delays[delay_selector])
+                    m.d.comb += meta_delayed_except_half_cyc.eq(meta_delays[delay_selector])
+                    m.d.comb += reads_in_flight_except_half_cyc.eq(counting | i_en_in_flight_up_to[delay_selector])
+
+        else: # Simple shift-register-only based implementation
+            meta, i_en_delays, i_en  = [], [], []
+            delay_selector = self._min_latency - 1 + self.sample_delay_half_clocks // 2
+
+            i_en_delays = delay(self.i_en, name=f"i_en", cycles=self._max_latency_except_hcyc)
+            meta_delays = delay(self.meta, name=f"meta", cycles=self._max_latency_except_hcyc)
+
+            m.d.comb += i_en_delayed_except_half_cyc.eq(i_en_delays[delay_selector])
+            m.d.comb += meta_delayed_except_half_cyc.eq(meta_delays[delay_selector])
+
+            i_en_in_flight_up_to = Array(Signal(1, name=f"i_en_in_flight_{stage}") for stage in range(self._max_latency_except_hcyc))
+            m.d.comb += i_en_in_flight_up_to[0].eq(Signal.cast(i_en_delays[0]).any())
+            for stage in range(1, self._max_latency_except_hcyc):
+                value = Signal.cast(i_en_delays[stage]).any() | i_en_in_flight_up_to[stage - 1]
+                m.d.comb += i_en_in_flight_up_to[stage].eq(value)
+
+            m.d.comb += reads_in_flight_except_half_cyc.eq(i_en_in_flight_up_to[delay_selector])
+
+        # Here follows code common to both implementations, that handles half a cycle delays.
+        # Half-cycle delays are handled as an additional delay step. (The sample payload will
+        # be combined from two different clock cycles.) We're using an additional shift
+        # register stage to avoid having to calculate a dynamic delay of
+        # (sample_delay // 2 + sample_delay % 2)
+        m.d.comb += self.i_en_delayed.eq(i_en_delayed_except_half_cyc)
+        m.d.comb += self.meta_delayed.eq(meta_delayed_except_half_cyc)
+        m.d.comb += self.reads_in_flight.eq(reads_in_flight_except_half_cyc)
+
+        if self._ratio == 2:
+            i_en_hcyc = delay(i_en_delayed_except_half_cyc, name=f"i_en_hcyc", cycles=1)[0]
+            meta_hcyc = delay(meta_delayed_except_half_cyc, name=f"meta_hcyc", cycles=1)[0]
+            with m.If(self.sample_delay_half_clocks % 2):
+                m.d.comb += self.i_en_delayed.eq(i_en_hcyc)
+                m.d.comb += self.meta_delayed.eq(meta_hcyc)
+                m.d.comb += self.reads_in_flight.eq(reads_in_flight_except_half_cyc | Signal.cast(i_en_hcyc).any())
+
+        return m
+
+class IOStreamerTop(wiring.Component):
+    """I/O buffer to stream adapter.
+
+    This adapter instantiates I/O buffers for a port (FF or DDR) and connects them to a pair of
+    streams, one for the outputs of the buffers and one for the inputs. Whenever an `o_stream`
+    transfer occurs, the state of the output is updated _t1_ cycles later; if `o_stream.p.i_en`
+    is set, then _t2_ cycles later, a payload with the data captured at the same time as
+    the outputs were updated appears on `i_stream.p.i`.
+
+    Arbitrary ancillary data may be provided with `o_stream` transfers via `o_stream.p.meta`,
+    and this data will be relayed back as `i_stream.p.meta` with the output-to-input latency
+    of the buffer. Higher-level protocol engines can use this data to indicate how the inputs
+    must be processed without needing counters or state machines on a higher level to match
+    the latency (and, usually, without needing any knowledge of the latency at all).
+
+    On reset, output ports have their drivers enabled, and bidirectional ports have them disabled.
+    All of the signals are deasserted, which could be a low or a high level depending on the port
+    polarity.
+    """
+
+    def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0, divisor_width=16, max_sample_delay_half_clocks=0, min_divisor=0):
+        assert isinstance(ioshape, (int, dict))
+        assert ratio in (1, 2)
+
+        self._ioshape = ioshape
+        self._ports   = ports
+        self._ratio   = ratio
+        self._init    = init
+        self._divisor_width = divisor_width
+        self._max_sample_delay_half_clocks = max_sample_delay_half_clocks
+        self._meta_layout = meta_layout
+        self._min_divisor = min_divisor
+
+        super().__init__({
+            "o_stream":  In(IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
+            "i_stream": Out(IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=meta_layout)),
+            "divisor": In(divisor_width),
+            "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1)),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+
+        #if self._min_divisor:
+        #    m.d.sync += Assert(self.divisor >= self._min_divisor)
+
+        #if self._ratio == 1:
+        #    m.d.sync += Assert(self.sample_delay_half_clocks % 2 == 0)
+
+        m.submodules.stream_stretcher = stream_stretcher = StreamStretcher(
+            IOOutputStreamSignature(self._ioshape, lane_count=self._ratio, meta_layout=self._meta_layout),
+            divisor_width = self._divisor_width)
+        m.d.comb += stream_stretcher.divisor.eq(self.divisor)
+        wiring.connect(m, io_streamer_top=wiring.flipped(self.o_stream), stream_stretcher=stream_stretcher.i_stream)
+
+        m.submodules.io_streamer = io_streamer = IOStreamer(self._ioshape, self._ports, ratio=self._ratio, meta_layout=0)
+        m.submodules.io_latcher = io_latcher = IOLatcher(self._ioshape, ratio=self._ratio, init=self._init, meta_layout=0)
+        wiring.connect(m, io_latcher=io_latcher.o_stream, io_streamer=io_streamer.o_stream)
+        for lane_index in range(self._ratio):
+            m.d.comb += io_latcher.i_stream.p[lane_index].actual.port.eq(stream_stretcher.o_stream.p[lane_index].actual.port)
+        m.d.comb += io_latcher.i_stream.valid.eq(stream_stretcher.o_stream.valid & stream_stretcher.o_stream.ready)
+        #  ^ note: the above makes sure IOLatcher doesn't take a new transaction if we're blocking the input
+
+        min_latency = io_streamer.get_latency(platform)
+        max_latency = min_latency + self._max_sample_delay_half_clocks // 2 + self._max_sample_delay_half_clocks % 2
+
+        m.submodules.sample_request_delayer = sample_request_delayer = SampleRequestDelayer(ratio=self._ratio,
+                                                                                            meta_layout=self._meta_layout,
+                                                                                            min_latency=min_latency,
+                                                                                            max_sample_delay_half_clocks=self._max_sample_delay_half_clocks,
+                                                                                            min_divisor=self._min_divisor)
+        m.d.comb += sample_request_delayer.sample_delay_half_clocks.eq(self.sample_delay_half_clocks)
+        for lane_index in range(self._ratio):
+            m.d.comb += sample_request_delayer.i_en[lane_index].eq(stream_stretcher.o_stream.valid &
+                                                                   stream_stretcher.o_stream.ready &
+                                                                   stream_stretcher.o_stream.p[lane_index].actual.i_en)
+            m.d.comb += sample_request_delayer.meta[lane_index].eq(stream_stretcher.o_stream.p[lane_index].meta)
+
+        skid_buffer_depth = max_latency
+        if self._min_divisor > 1:
+            # This is an optimisation we can apply if we know at elaboration time that divisor can never be smaller than min_divisor
+            skid_buffer_depth = (max_latency + self._min_divisor - 1) // self._min_divisor
+
+        m.submodules.skid_buffer = skid_buffer = SkidBuffer(
+            skid_buffer_depth,
+            IOInputStreamSignature(self._ioshape, lane_count=self._ratio, meta_layout=self._meta_layout),
+        )
+        m.d.comb += skid_buffer.i_stream.valid.eq(Signal.cast(sample_request_delayer.i_en_delayed).any())
+        #with m.If(skid_buffer.i_stream.valid):
+        #    m.d.sync += Assert(skid_buffer.i_stream.ready)
+
+        for lane_index in range(self._ratio):
+            m.d.comb += skid_buffer.i_stream.p[lane_index].actual.port.eq(io_streamer.i_stream.p[lane_index].actual.port)
+
+        if self._ratio == 2:
+            with m.If(self.sample_delay_half_clocks % 2):
+                m.d.comb += skid_buffer.i_stream.p[1].actual.port.eq(io_streamer.i_stream.p[0].actual.port)
+                i1_delayed = Signal.like(io_streamer.i_stream.p[1].actual.port, name=f"i1_delayed")
+                m.d.sync += i1_delayed.eq(io_streamer.i_stream.p[1].actual.port)
+                m.d.comb += skid_buffer.i_stream.p[0].actual.port.eq(i1_delayed)
+
+        for lane_index in range(self._ratio):
+            m.d.comb += skid_buffer.i_stream.p[lane_index].meta.eq(sample_request_delayer.meta_delayed[lane_index])
+            m.d.comb += skid_buffer.i_stream.p[lane_index].actual.i_valid.eq(sample_request_delayer.i_en_delayed[lane_index])
+
+        wiring.connect(m, skid_buffer=skid_buffer.o_stream, io_streamer_top=wiring.flipped(self.i_stream))
+
+        m.d.comb += stream_stretcher.o_stream.ready.eq(self.i_stream.ready | (~skid_buffer.o_stream.valid & ~sample_request_delayer.reads_in_flight))
+
+        return m
+
+
+class IO2LaneTo1Lane(wiring.Component):
+    """
+    This component down-converts a 2-lane stream to a 1-lane stream, while adding
+    information to the metadata, which includes:
+        tag: the index of the lane the original data belonged to
+        last: a flag that is low when a second beat is expected in the case of later up-conversion, and high otherwise
+    The last field is optionally determined using the supplied is_beat_0_last argument
+    to the constructor, which must be a function that returns an amaranth expression
+    """
+    @staticmethod
+    def i_stream_signature(actual_layout, /, *, meta_layout=0):
+        return stream.Signature(
+            data.ArrayLayout(
+                LaneLayout(actual_layout, meta_layout=meta_layout),
+                2
+            )
+        )
+
+    @staticmethod
+    def o_stream_signature(actual_layout, /, *, meta_layout=0):
+        return stream.Signature(
+            data.ArrayLayout(
+                LaneLayout(actual_layout, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)),
+                1
+            )
+        )
+
+    def __init__(self, actual_layout, *, meta_layout=0, is_beat_0_last=lambda payload: 0):
+        self._is_beat_0_last = is_beat_0_last
+        super().__init__({
+            "i_stream":  In(self.i_stream_signature(actual_layout, meta_layout=meta_layout)),
+            "o_stream": Out(self.o_stream_signature(actual_layout, meta_layout=meta_layout)),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+
+        phase = Signal()
+        m.d.comb += self.o_stream.p[0].actual.eq(self.i_stream.p[phase].actual)
+        m.d.comb += self.o_stream.p[0].meta.inner_meta.eq(self.i_stream.p[phase].meta)
+        m.d.comb += self.o_stream.p[0].meta.tag.eq(phase)
+        m.d.comb += self.o_stream.p[0].meta.last.eq(1)
+        with m.If((phase == 0) & ~self._is_beat_0_last(self.i_stream.p)):
+            m.d.comb += self.o_stream.p[0].meta.last.eq(0)
+
+        m.d.comb += self.o_stream.valid.eq(self.i_stream.valid)
+        with m.If(self.o_stream.ready):
+            with m.If(phase == 0):
+                with m.If(self.i_stream.valid):
+                    m.d.sync += phase.eq(1)
+
+            with m.Else(): # phase == 1
+                m.d.comb += self.i_stream.ready.eq(1)
+                m.d.sync += phase.eq(0)
 
         return m
 
 
 class IOClocker(wiring.Component):
+    """
+    In case of ratio=1:
+        This component down-converts (serializes) 2 lanes to 1 lane, while adding metadata to identify which lane each beat belonged to.
+    In case of ratio=2, divisor=0:
+        This component adds useless metadata, but is otherwise a pass-through. Adding the metadata is necessary, because divisor is not a compile-time parameter
+    In case of ratio=2, divisor!=0:
+        This component down-converts (serializes) 2 lanes to 1 lane, just like in the ratio=1 case above, except, it duplicates the resulting lane, while also making
+        sure to force `i_en` of the second resulting lane to 0. This means that one `i_en` bit high doesn't result in two samples.
+        ratio=2, divisor!=0 is designed to behave exactly like ratio=1, divisor!=0, so the read samples associated with output lane 0 are the only ones we care about.
+    """
     @staticmethod
-    def i_stream_signature(ioshape, /, *, _ratio=1, meta_layout=0):
-        # Currently the only supported ratio is 1, but this will change in the future for
-        # interfaces like HyperBus.
-        return stream.Signature(data.StructLayout({
-            "bypass": 1,
-            "port": _map_ioshape("o", ioshape, lambda width: data.StructLayout({
-                "o":  width if _ratio == 1 else data.ArrayLayout(width, _ratio),
-                "oe": 1,
-            })),
-            "i_en": 1,
-            "meta": meta_layout,
-        }))
+    def i_stream_signature(ioshape, /, *, _ratio=2, meta_layout=0):
+        # Currently the only supported ratio is 2
+        return IOOutputStreamSignature(ioshape, lane_count=_ratio, meta_layout=meta_layout)
 
     @staticmethod
     def o_stream_signature(ioshape, /, *, ratio=1, meta_layout=0):
-        return IOStreamer.o_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)
+        return IOOutputStreamSignature(ioshape, lane_count=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout))
 
-    def __init__(self, ioshape, *, clock, o_ratio=1, meta_layout=0, divisor_width=16):
+    def __init__(self, ioshape, *, o_ratio=1, meta_layout=0, divisor_width=16):
         assert isinstance(ioshape, dict)
-        assert isinstance(clock, str)
         assert o_ratio in (1, 2)
-        assert clock in ioshape
 
-        self._clock   = clock
         self._ioshape = ioshape
         self._o_ratio = o_ratio
+        self._meta_layout = meta_layout
 
         super().__init__({
             "i_stream":  In(self.i_stream_signature(ioshape,
                 meta_layout=meta_layout)),
             "o_stream": Out(self.o_stream_signature(ioshape,
                 ratio=o_ratio, meta_layout=meta_layout)),
-
-            # f_clk = f_sync if (o_ratio == 2 and divisor == 0) else f_sync / (2 * max(1, divisor))
             "divisor": In(divisor_width),
         })
 
     def elaborate(self, platform):
         m = Module()
 
-        # Forward the inputs to the outputs as-is. This includes the clock; it is overridden below
-        # if the clocker is used (not bypassed).
-        for _, i_parts, o_parts in _iter_ioshape("io", self._ioshape,
-                self.i_stream.p.port, self.o_stream.p.port):
-            m.d.comb += o_parts.o .eq(i_parts.o.replicate(self._o_ratio))
-            m.d.comb += o_parts.oe.eq(i_parts.oe)
-        m.d.comb += self.o_stream.p.i_en.eq(self.i_stream.p.i_en)
-        m.d.comb += self.o_stream.p.meta.eq(self.i_stream.p.meta)
+        m.submodules.io_2to1_lane = io_2to1_lane = IO2LaneTo1Lane(IOOutputActualLayout(self._ioshape), meta_layout=self._meta_layout,
+                is_beat_0_last = lambda payload: payload[1].actual.i_en==0)
 
         phase = Signal()
-        # If the clocker is used...
-        with m.If(~self.i_stream.p.bypass):
-            # ... ignore the clock in the inputs and replace it with the generated one...
-            if self._o_ratio == 1:
-                m.d.comb += self.o_stream.p.port[self._clock].o.eq(phase)
-            if self._o_ratio == 2:
-                with m.If(self.divisor == 0):
-                    m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(~phase, phase))
-                with m.Else():
-                    m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(phase, phase))
-            m.d.comb += self.o_stream.p.port[self._clock].oe.eq(1)
-            # ... while requesting input sampling only for the rising edge. (Interfaces triggering
-            # transfers on falling edge will be inverting the clock at the `IOPort` level.)
-            m.d.comb += self.o_stream.p.i_en.eq(self.i_stream.p.i_en & phase)
+        if self._o_ratio == 1:
+            wiring.connect(m, ioclocker=wiring.flipped(self.i_stream), io_2to1_lane=io_2to1_lane.i_stream)
+            wiring.connect(m, ioclocker=wiring.flipped(self.o_stream), io_2to1_lane=io_2to1_lane.o_stream)
+        if self._o_ratio == 2:
+            with m.If(self.divisor == 0):
+                # Just pass-through, we're doing nothing, but adding currently-useless tag metadata
+                for lane_index in range(self._o_ratio):
+                    m.d.comb += self.o_stream.p[lane_index].actual.eq(self.i_stream.p[lane_index].actual)
+                    m.d.comb += self.o_stream.p[lane_index].meta.inner_meta.eq(self.i_stream.p[lane_index].meta)
+                    m.d.comb += self.o_stream.p[lane_index].meta.tag.eq(lane_index)
+                    m.d.comb += self.o_stream.p[lane_index].meta.last.eq(~self.i_stream.p[1].actual.i_en)
+                m.d.comb += self.o_stream.valid.eq(self.i_stream.valid)
+                m.d.comb += self.i_stream.ready.eq(self.o_stream.ready)
+            with m.Else():
+                wiring.connect(m, ioclocker=wiring.flipped(self.i_stream), io_2to1_lane=io_2to1_lane.i_stream)
+                for olane_index in range(2):
+                    m.d.comb += self.o_stream.p[olane_index].eq(io_2to1_lane.o_stream.p[0])
+                m.d.comb += self.o_stream.p[1].actual.i_en.eq(0) # Override i_en, to only sample once
+                m.d.comb += self.o_stream.valid.eq(io_2to1_lane.o_stream.valid)
+                m.d.comb += io_2to1_lane.o_stream.ready.eq(self.o_stream.ready)
 
-        timer = Signal.like(self.divisor)
-        with m.If((timer == 0) | (timer == 1)):
-            # Only produce output when the timer has expired. This ensures that no clock pulse
-            # exceeds the frequency set by `divisor`, except the ones that bypass the clocker.
-            m.d.comb += self.o_stream.valid.eq(self.i_stream.valid)
+        return m
+
+
+class IO1LaneTo2Lane(wiring.Component):
+    """
+    This component up-converts a 1-lane stream to a 2-lane stream, using
+    information from the metadata, to determine which lane to put each beat in:
+        tag: the index of the lane the data should be put on
+        last: a flag that is low when a second beat is expected, and high on the final beat
+    An output stream transaction occurs only when the last bit is high for an
+    input-stream transaction.
+    """
+    @staticmethod
+    def o_stream_signature(actual_layout, /, *, meta_layout=0):
+        return stream.Signature(
+            data.ArrayLayout(
+                LaneLayout(actual_layout, meta_layout=meta_layout),
+                2
+            )
+        )
+
+    @staticmethod
+    def i_stream_signature(actual_layout, /, *, meta_layout=0):
+        return stream.Signature(
+            data.ArrayLayout(
+                LaneLayout(actual_layout, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout)),
+                1
+            )
+        )
+
+    def __init__(self, actual_layout, *, meta_layout=0):
+        super().__init__({
+            "i_stream":  In(self.i_stream_signature(actual_layout, meta_layout=meta_layout)),
+            "o_stream": Out(self.o_stream_signature(actual_layout, meta_layout=meta_layout)),
+        })
 
-            with m.FSM():
-                with m.State("Falling"):
-                    with m.If(self.i_stream.p.bypass): # Bypass the clocker entirely.
-                        m.d.comb += self.i_stream.ready.eq(self.o_stream.ready)
-
-                    with m.Else(): # Produce a falling edge at the output.
-                        # Whenever DDR output is used, with `divisor == 0`, we output a low state
-                        # on the first half of the clock cycle, and a high state on the second half.
-                        # This mode allows clocking the peripheral at the `sync` frequency.
-                        # In this case the signal sampled at the rising edge will be output on i[1]
-                        # (if sample_delay was set to zero)
-                        # In all other cases the signal sampled at the rising edge will be output on i[0]
-                        # (if sample_delay was set to zero)
-                        with m.If((self._o_ratio == 2) & (self.divisor == 0)):
-                            m.d.comb += phase.eq(1)
-                            with m.If(self.o_stream.ready):
-                                m.d.comb += self.i_stream.ready.eq(1)
-                        with m.Else():
-                            m.d.comb += phase.eq(0)
-                            with m.If(self.o_stream.ready & self.i_stream.valid):
-                                m.d.sync += timer.eq(self.divisor)
-                                m.next = "Rising"
-
-                with m.State("Rising"):
-                    m.d.comb += phase.eq(1)
-                    with m.If(self.o_stream.ready):
-                        m.d.comb += self.i_stream.ready.eq(1)
-                        m.d.sync += timer.eq(self.divisor)
-                        m.next = "Falling"
+    def elaborate(self, platform):
+        m = Module()
+
+        untagged_istream_lane = Signal.like(self.o_stream.p[0])
+        m.d.comb += untagged_istream_lane.actual.eq(self.i_stream.p[0].actual)
+        m.d.comb += untagged_istream_lane.meta.eq(self.i_stream.p[0].meta.inner_meta)
+
+        phase_0_stored = Signal.like(untagged_istream_lane)
+
+        m.d.comb += self.i_stream.ready.eq(self.o_stream.ready)
+        with m.If(self.i_stream.valid):
+            with m.If(self.i_stream.p[0].meta.last):
+                m.d.comb += self.o_stream.p[self.i_stream.p[0].meta.tag].eq(untagged_istream_lane)
+                with m.If(self.i_stream.p[0].meta.tag != 0):
+                    m.d.comb += self.o_stream.p[0].eq(phase_0_stored)
+                m.d.comb += self.o_stream.valid.eq(1)
+                with m.If(self.i_stream.ready):
+                    m.d.sync += phase_0_stored.eq(0)
+            with m.Else():
+                with m.If(self.i_stream.ready):
+                    m.d.sync += phase_0_stored.eq(untagged_istream_lane)
 
+        return m
+
+
+class IOClockerDeframer(wiring.Component):
+    """
+    In case of ratio=1:
+        This component up-converts (deserializes) 1-lane samples to 2-lane.
+    In case of ratio=2, divisor=0:
+        This component is a simple pass-through, doing nothing
+    In case of ratio=2, divisor!=0:
+        This component throws away lane[1] of the input, and up-converts (deserializes) lane[0] to 2 lanes
+    See IO1LaneTo2Lane subcomponent for more details
+    """
+    @staticmethod
+    def o_stream_signature(ioshape, /, *, _ratio=1, meta_layout=0):
+        # Currently the output always has 2 lanes (`_ratio` is ignored), but this may change in the
+        # future for interfaces like HyperBus.
+        return IOInputStreamSignature(ioshape, lane_count=2, meta_layout=meta_layout)
+
+    @staticmethod
+    def i_stream_signature(ioshape, /, *, ratio=1, meta_layout=0):
+        return IOInputStreamSignature(ioshape, lane_count=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=meta_layout))
+
+    def __init__(self, ioshape, *, i_ratio=1, meta_layout=0, divisor_width=16):
+        assert isinstance(ioshape, dict)
+        assert i_ratio in (1, 2)
+
+        self._ioshape = ioshape
+        self._i_ratio = i_ratio
+        self._meta_layout = meta_layout
+
+        super().__init__({
+            "i_stream":  In(self.i_stream_signature(ioshape,
+                ratio=i_ratio, meta_layout=meta_layout)),
+            "o_stream": Out(self.o_stream_signature(ioshape,
+                meta_layout=meta_layout)),
+            "divisor": In(divisor_width),
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+
+        m.submodules.io_1to2_lane = io_1to2_lane = IO1LaneTo2Lane(IOInputActualLayout(self._ioshape), meta_layout=self._meta_layout)
+
+        with m.If((self.divisor == 0) & (self._i_ratio == 2)):
+            # Just pass through everything
+            for lane_index in range(self._i_ratio):
+                m.d.comb += self.o_stream.p[lane_index].actual.eq(self.i_stream.p[lane_index].actual)
+                m.d.comb += self.o_stream.p[lane_index].meta.eq(self.i_stream.p[lane_index].meta.inner_meta)
+            m.d.comb += self.o_stream.valid.eq(self.i_stream.valid)
+            m.d.comb += self.i_stream.ready.eq(self.o_stream.ready)
         with m.Else():
-            m.d.sync += timer.eq(timer - 1)
+            m.d.comb += io_1to2_lane.i_stream.valid.eq(self.i_stream.valid)
+            m.d.comb += self.i_stream.ready.eq(io_1to2_lane.i_stream.ready)
+            m.d.comb += io_1to2_lane.i_stream.p[0].eq(self.i_stream.p[0])
+            # ^ `wiring.connect` won't work here in case of i_ratio=2: we're explicitly
+            # throwing away the second lane here, because we know IOClocker always sends
+            # sample requests on lane 0 (when divisor != 0). In case of i_ratio=1,
+            # this is equivalent to `wiring.connect`.
+
+            wiring.connect(m, io_1to2_lane=io_1to2_lane.o_stream, io_clocker_deframer=wiring.flipped(self.o_stream))
 
         return m
diff --git a/software/glasgow/gateware/qspi.py b/software/glasgow/gateware/qspi.py
index 6a3bb8241..f470f9c60 100644
--- a/software/glasgow/gateware/qspi.py
+++ b/software/glasgow/gateware/qspi.py
@@ -3,7 +3,7 @@
 from amaranth.lib.wiring import In, Out, connect, flipped
 
 from .ports import PortGroup
-from .iostream import IOStreamer, IOClocker
+from .iostream import IOStreamerTop, IOClocker, IOClockerDeframer, MetaLayoutWithTag
 
 
 __all__ = ["QSPIMode", "QSPIEnframer", "QSPIDeframer", "QSPIController"]
@@ -57,43 +57,54 @@ def elaborate(self, platform):
                     m.d.comb += self.octets.ready.eq(cycle == 0)
             m.d.sync += cycle.eq(Mux(self.octets.ready, 0, cycle + 1))
 
-        # When no chip is selected, keep clock in the idle state. The only supported `mode`
-        # in this case is `QSPIMode.Dummy`, which should be used to deassert CS# at the end of
-        # a transfer.
-        m.d.comb += self.frames.p.bypass.eq(self.octets.p.chip == 0)
-        m.d.comb += self.frames.p.port.sck.o.eq(1)  # (for bypass only)
-        m.d.comb += self.frames.p.port.sck.oe.eq(1) # (for bypass only)
+        for lane_index in range(2):
+            m.d.comb += self.frames.p[lane_index].actual.port.sck.oe.eq(1)
+
+        with m.If(self.octets.p.chip == 0):
+            # When no chip is selected, keep clock in the idle state. The only supported `mode`
+            # in this case is `QSPIMode.Dummy`, which should be used to deassert CS# at the end of
+            # a transfer.
+            m.d.comb += self.frames.p[0].actual.port.sck.o.eq(1)
+            m.d.comb += self.frames.p[1].actual.port.sck.o.eq(1)
+        with m.Else():
+            m.d.comb += self.frames.p[0].actual.port.sck.o.eq(0)
+            m.d.comb += self.frames.p[1].actual.port.sck.o.eq(1)
 
         rev_data = self.octets.p.data[::-1] # MSB first
         with m.Switch(self.octets.p.mode):
             with m.Case(QSPIMode.PutX1, QSPIMode.Swap):
-                m.d.comb += self.frames.p.port.io0.o.eq(rev_data.word_select(cycle, 1))
-                m.d.comb += self.frames.p.port.io0.oe.eq(0b1)
-                m.d.comb += self.frames.p.i_en.eq(self.octets.p.mode == QSPIMode.Swap)
+                for lane_index in range(2):
+                    m.d.comb += self.frames.p[lane_index].actual.port.io0.o.eq(rev_data.word_select(cycle, 1))
+                    m.d.comb += self.frames.p[lane_index].actual.port.io0.oe.eq(0b1)
+                m.d.comb += self.frames.p[1].actual.i_en.eq(self.octets.p.mode == QSPIMode.Swap)
             with m.Case(QSPIMode.GetX1):
-                m.d.comb += self.frames.p.port.io0.oe.eq(0b1)
-                m.d.comb += self.frames.p.i_en.eq(1)
+                for lane_index in range(2):
+                    m.d.comb += self.frames.p[lane_index].actual.port.io0.oe.eq(0b1)
+                m.d.comb += self.frames.p[1].actual.i_en.eq(1)
             with m.Case(QSPIMode.PutX2):
-                m.d.comb += Cat(self.frames.p.port.io1.o,
-                                self.frames.p.port.io0.o).eq(rev_data.word_select(cycle, 2))
-                m.d.comb += Cat(self.frames.p.port.io1.oe,
-                                self.frames.p.port.io0.oe).eq(0b11)
+                for lane_index in range(2):
+                    m.d.comb += Cat(self.frames.p[lane_index].actual.port.io1.o,
+                                    self.frames.p[lane_index].actual.port.io0.o).eq(rev_data.word_select(cycle, 2))
+                    m.d.comb += Cat(self.frames.p[lane_index].actual.port.io1.oe,
+                                    self.frames.p[lane_index].actual.port.io0.oe).eq(0b11)
             with m.Case(QSPIMode.GetX2):
-                m.d.comb += self.frames.p.i_en.eq(1)
+                m.d.comb += self.frames.p[1].actual.i_en.eq(1)
             with m.Case(QSPIMode.PutX4):
-                m.d.comb += Cat(self.frames.p.port.io3.o,
-                                self.frames.p.port.io2.o,
-                                self.frames.p.port.io1.o,
-                                self.frames.p.port.io0.o).eq(rev_data.word_select(cycle, 4))
-                m.d.comb += Cat(self.frames.p.port.io3.oe,
-                                self.frames.p.port.io2.oe,
-                                self.frames.p.port.io1.oe,
-                                self.frames.p.port.io0.oe).eq(0b1111)
+                for lane_index in range(2):
+                    m.d.comb += Cat(self.frames.p[lane_index].actual.port.io3.o,
+                                    self.frames.p[lane_index].actual.port.io2.o,
+                                    self.frames.p[lane_index].actual.port.io1.o,
+                                    self.frames.p[lane_index].actual.port.io0.o).eq(rev_data.word_select(cycle, 4))
+                    m.d.comb += Cat(self.frames.p[lane_index].actual.port.io3.oe,
+                                    self.frames.p[lane_index].actual.port.io2.oe,
+                                    self.frames.p[lane_index].actual.port.io1.oe,
+                                    self.frames.p[lane_index].actual.port.io0.oe).eq(0b1111)
             with m.Case(QSPIMode.GetX4):
-                m.d.comb += self.frames.p.i_en.eq(1)
-        m.d.comb += self.frames.p.port.cs.o.eq((1 << self.octets.p.chip)[1:])
-        m.d.comb += self.frames.p.port.cs.oe.eq(1)
-        m.d.comb += self.frames.p.meta.eq(self.octets.p.mode)
+                m.d.comb += self.frames.p[1].actual.i_en.eq(1)
+        for lane_index in range(2):
+            m.d.comb += self.frames.p[lane_index].actual.port.cs.o.eq((1 << self.octets.p.chip)[1:])
+            m.d.comb += self.frames.p[lane_index].actual.port.cs.oe.eq(1)
+        m.d.comb += self.frames.p[1].meta.eq(self.octets.p.mode)
 
         return m
 
@@ -101,7 +112,7 @@ def elaborate(self, platform):
 class QSPIDeframer(wiring.Component): # meow :3
     def __init__(self):
         super().__init__({
-            "frames": In(IOStreamer.i_stream_signature({
+            "frames": In(IOClockerDeframer.o_stream_signature({
                 "io0": ("io", 1),
                 "io1": ("io", 1),
                 "io2": ("io", 1),
@@ -118,7 +129,7 @@ def elaborate(self, platform):
         cycle = Signal(range(8))
         m.d.comb += self.frames.ready.eq(~self.octets.valid | self.octets.ready)
         with m.If(self.frames.valid):
-            with m.Switch(self.frames.p.meta):
+            with m.Switch(self.frames.p[1].meta):
                 with m.Case(QSPIMode.GetX1, QSPIMode.Swap):
                     m.d.comb += self.octets.valid.eq(cycle == 7)
                 with m.Case(QSPIMode.GetX2):
@@ -129,17 +140,17 @@ def elaborate(self, platform):
                 m.d.sync += cycle.eq(Mux(self.octets.valid, 0, cycle + 1))
 
         data_reg = Signal(8)
-        with m.Switch(self.frames.p.meta):
+        with m.Switch(self.frames.p[1].meta):
             with m.Case(QSPIMode.GetX1, QSPIMode.Swap): # note: samples IO1
-                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io1.i, data_reg))
+                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io1.i, data_reg))
             with m.Case(QSPIMode.GetX2):
-                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io0.i,
-                                                      self.frames.p.port.io1.i, data_reg))
+                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io0.i,
+                                                      self.frames.p[1].actual.port.io1.i, data_reg))
             with m.Case(QSPIMode.GetX4):
-                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p.port.io0.i,
-                                                      self.frames.p.port.io1.i,
-                                                      self.frames.p.port.io2.i,
-                                                      self.frames.p.port.io3.i, data_reg))
+                m.d.comb += self.octets.p.data.eq(Cat(self.frames.p[1].actual.port.io0.i,
+                                                      self.frames.p[1].actual.port.io1.i,
+                                                      self.frames.p[1].actual.port.io2.i,
+                                                      self.frames.p[1].actual.port.io3.i, data_reg))
         with m.If(self.frames.valid & self.frames.ready):
             m.d.sync += data_reg.eq(self.octets.p.data)
 
@@ -147,7 +158,7 @@ def elaborate(self, platform):
 
 
 class QSPIController(wiring.Component):
-    def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_half_clocks=0):
+    def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, divisor_width=16, max_sample_delay_half_clocks=0, min_divisor=0):
         assert len(ports.sck) == 1 and ports.sck.direction in (io.Direction.Output, io.Direction.Bidir)
         assert len(ports.io) == 4 and ports.io.direction == io.Direction.Bidir
         assert len(ports.cs) >= 1 and ports.cs.direction in (io.Direction.Output, io.Direction.Bidir)
@@ -162,7 +173,9 @@ def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_h
         )
         self._ddr = use_ddr_buffers
         self._chip_count = chip_count
-        self._sample_delay_half_clocks = sample_delay_half_clocks
+        self._divisor_width = divisor_width
+        self._max_sample_delay_half_clocks = max_sample_delay_half_clocks
+        self._min_divisor = min_divisor
 
         super().__init__({
             "o_octets": In(stream.Signature(data.StructLayout({
@@ -174,7 +187,8 @@ def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_h
                 "data": 8
             }))),
 
-            "divisor": In(16),
+            "divisor": In(divisor_width),
+            "sample_delay_half_clocks": In(range(max_sample_delay_half_clocks + 1))
         })
 
     def elaborate(self, platform):
@@ -194,36 +208,28 @@ def elaborate(self, platform):
         connect(m, controller=flipped(self.o_octets), enframer=enframer.octets)
 
         m.submodules.io_clocker = io_clocker = IOClocker(ioshape,
-            clock="sck", o_ratio=ratio, meta_layout=QSPIMode)
+            o_ratio=ratio, meta_layout=QSPIMode, divisor_width=self._divisor_width)
         connect(m, enframer=enframer.frames, io_clocker=io_clocker.i_stream)
         m.d.comb += io_clocker.divisor.eq(self.divisor)
 
-        m.submodules.io_streamer = io_streamer = IOStreamer(ioshape, self._ports, init={
+        m.submodules.io_streamer = io_streamer = IOStreamerTop(ioshape, self._ports, init={
             "sck": {"o": 1, "oe": 1}, # Motorola "Mode 3" with clock idling high
             "cs":  {"o": 0, "oe": 1}, # deselected
-        }, ratio=ratio, meta_layout=QSPIMode,
-           sample_delay_half_clocks=self._sample_delay_half_clocks)
+        }, ratio=ratio, meta_layout=MetaLayoutWithTag(tag_layout=range(2), meta_layout=QSPIMode),
+           divisor_width=self._divisor_width,
+           max_sample_delay_half_clocks=self._max_sample_delay_half_clocks,
+           min_divisor=self._min_divisor)
         connect(m, io_clocker=io_clocker.o_stream, io_streamer=io_streamer.o_stream)
+        m.d.comb += io_streamer.divisor.eq(self.divisor)
+        m.d.comb += io_streamer.sample_delay_half_clocks.eq(self.sample_delay_half_clocks)
+
+        m.submodules.io_clocker_deframer = io_clocker_deframer = IOClockerDeframer(ioshape,
+            i_ratio=ratio, meta_layout=QSPIMode)
+        connect(m, io_streamer=io_streamer.i_stream, io_clocker_deframer=io_clocker_deframer.i_stream)
+        m.d.comb += io_clocker_deframer.divisor.eq(self.divisor)
 
         m.submodules.deframer = deframer = QSPIDeframer()
-        m.d.comb += [ # connect() wouldn't work if DDR buffers are used
-            deframer.frames.p.port.io0.i.eq(io_streamer.i_stream.p.port.io0.i[0]),
-            deframer.frames.p.port.io1.i.eq(io_streamer.i_stream.p.port.io1.i[0]),
-            deframer.frames.p.port.io2.i.eq(io_streamer.i_stream.p.port.io2.i[0]),
-            deframer.frames.p.port.io3.i.eq(io_streamer.i_stream.p.port.io3.i[0]),
-            deframer.frames.p.meta.eq(io_streamer.i_stream.p.meta),
-            deframer.frames.valid.eq(io_streamer.i_stream.valid),
-            io_streamer.i_stream.ready.eq(deframer.frames.ready),
-        ]
-
-        if self._ddr:
-            with m.If(self.divisor == 0):
-                m.d.comb += [
-                    deframer.frames.p.port.io0.i.eq(io_streamer.i_stream.p.port.io0.i[1]),
-                    deframer.frames.p.port.io1.i.eq(io_streamer.i_stream.p.port.io1.i[1]),
-                    deframer.frames.p.port.io2.i.eq(io_streamer.i_stream.p.port.io2.i[1]),
-                    deframer.frames.p.port.io3.i.eq(io_streamer.i_stream.p.port.io3.i[1]),
-                ]
+        connect(m, io_clocker_deframer=io_clocker_deframer.o_stream, deframer=deframer.frames)
 
         connect(m, deframer=deframer.octets, controller=flipped(self.i_octets))
 
diff --git a/software/tests/gateware/test_iostream.py b/software/tests/gateware/test_iostream.py
index 2b50d982f..6a8fe9202 100644
--- a/software/tests/gateware/test_iostream.py
+++ b/software/tests/gateware/test_iostream.py
@@ -5,7 +5,7 @@
 from amaranth.lib import io
 
 from glasgow.gateware.ports import PortGroup
-from glasgow.gateware.iostream import IOStreamer
+from glasgow.gateware.iostream import IOStreamerTop
 
 MAX_SKIDBUFFER_SIZE = 4
 
@@ -55,7 +55,7 @@ class IOStreamTimeoutError(Exception):
 class IOStreamTestCase(unittest.TestCase):
     def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_bits, timeout_clocks=None, iready_comb_path=False):
         """
-        This is a latency-agnostic test, that verifies that the IOStreamer samples the inputs at the same time as the output signals change.
+        This is a latency-agnostic test, that verifies that the IOStreamerTop samples the inputs at the same time as the output signals change.
 
         o_valid_bits: is a string of "1"s and "0"s. Each character refers to one (or more) clock cycles. "1" means to send a payload,
             and "0" means to leave o_stream idle for 1 clock cycle. When sending a payload o_stream is waited upon if it's not ready, so
@@ -76,7 +76,7 @@ def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_
               - back-pressure on i_stream may result in back-pressure on o_stream, never allowing the full o_valid_bits string to be completely played back,
                 and resulting in a timeout.
               - the playback of the o_valid_bits string may complete, however it's possible a number of sample requests that are in flight remain stuck
-                in the IOStreamer, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final
+                in the IOStreamerTop, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final
                 samples have been lost.
             To make sure a testcase completes, you will see some testcases have a large number of "1"s in this string past the length of o_valid_bits.
 
@@ -93,7 +93,7 @@ def _subtest_sdr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_
         if timeout_clocks is None:
             timeout_clocks = len(o_valid_bits) + len(i_ready_bits) + 20
 
-        dut = IOStreamer({
+        dut = IOStreamerTop({
             "clk_out": ("o", 1),
             "data_in": ("i", 8),
         }, ports, meta_layout=4)
@@ -118,7 +118,7 @@ async def input_generator_tb(ctx):
         async def save_expected_sample_values_tb(ctx):
             """
             This testbench looks at the clk_out port and when it sees a positive or negative edge it knows that
-            IOStreamer is expected to sample the input signal, so the current state of the data_in port
+            IOStreamerTop is expected to sample the input signal, so the current state of the data_in port
             becomes one of the expected sampled values. This is saved into expected_sample[] to be compared
             later, when the sample actually arrives back on i_stream.
             """
@@ -135,7 +135,8 @@ async def i_stream_consumer_tb(ctx):
                 if i_ready_bit == "1" and (not iready_comb_path or ctx.get(dut.i_stream.valid)):
                     payload = await stream_get_maybe(ctx,dut.i_stream)
                     if payload is not None:
-                        actually_sampled.append(payload.port.data_in.i)
+                        actually_sampled.append(payload[0].actual.port.data_in.i)
+                        assert payload[0].actual.i_valid
                 else:
                     await ctx.tick()
             i_stream_consumer_finished = True
@@ -168,15 +169,17 @@ async def main_testbench(ctx):
                     if i_en_bit:
                         expected_samples_count += 1
                         o_bit ^= 1
-                    await stream_put(ctx, dut.o_stream, {
+                    await stream_put(ctx, dut.o_stream, [{
                         "meta": i,
-                        "i_en": i_en_bit,
-                        "port": {
-                            "clk_out": {
-                                "o": o_bit,
+                        "actual": {
+                            "i_en": i_en_bit,
+                            "port": {
+                                "clk_out": {
+                                    "o": o_bit,
+                                }
                             }
                         }
-                    })
+                    }])
                 else:
                     await ctx.tick()
 
@@ -184,7 +187,7 @@ async def main_testbench(ctx):
                 await ctx.tick()
 
             assert len(actually_sampled) == expected_samples_count # This should be checked as well, because a
-            # possible failure mode is if IOStreamer never generates clock edges. We don't want to end up
+            # possible failure mode is if IOStreamerTop never generates clock edges. We don't want to end up
             # comparing two empty lists against eachother.
             assert actually_sampled == expected_sample, (f"Expected [" +
                     ", ".join(f"0x{s:02x}" for s in expected_sample) +
@@ -209,7 +212,7 @@ def test_sdr_input_sampled_correctly(self):
 
     def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_bits, timeout_clocks=None, iready_comb_path=False):
         """
-        This is a latency-agnostic test, that verifies that the IOStreamer samples the inputs at the same time as the output signals change.
+        This is a latency-agnostic test that verifies that the IOStreamerTop samples the inputs at the same time as the output signals change.
 
         o_valid_bits: is a string of "1"s and "0"s. Each character refers to one (or more) internal clock cycles. "1" means to send a payload,
             and "0" means to leave o_stream idle for 1 clock cycle. When sending a payload o_stream is waited upon if it's not ready, so
@@ -231,7 +234,7 @@ def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_
               - back-pressure on i_stream may result in back-pressure on o_stream, never allowing the full o_valid_bits string to be completely played back,
                 and resulting in a timeout.
               - the playback of the o_valid_bits string may complete, however it's possible a number of sample requests that are in flight remain stuck
-                in the IOStreamer, pending for them to be extracted from the i_stream, and that could result in the testcase declaring that the final
+                in the IOStreamerTop, waiting to be extracted from the i_stream, which could result in the testcase declaring that the final
                 samples have been lost.
             To make sure a testcase completes, you will see some testcases have a large number of "1"s in this string past the length of o_valid_bits.
 
@@ -251,7 +254,7 @@ def _subtest_ddr_input_sampled_correctly(self, o_valid_bits, i_en_bits, i_ready_
 
         CLK_PERIOD = 1e-6
 
-        dut = IOStreamer({
+        dut = IOStreamerTop({
             "clk_out": ("o", 1),
             "data_in": ("i", 8),
         }, ports, ratio=2, meta_layout=4)
@@ -280,7 +283,7 @@ async def input_generator_tb(ctx):
         async def save_expected_sample_values_tb(ctx):
             """
             This testbench looks at the clk_out port and when it sees a positive edge it knows that
-            IOStreamer is expected to sample the input signal, so the current state of the data_in port
+            IOStreamerTop is expected to sample the input signal, so the current state of the data_in port
             becomes one of the expected sampled values. This is saved into expected_sample[] to be compared
             later, when the sample actually arrives back on i_stream.
             """
@@ -298,7 +301,9 @@ async def i_stream_consumer_tb(ctx):
                 if i_ready_bit == "1" and (not iready_comb_path or ctx.get(dut.i_stream.valid)):
                     payload = await stream_get_maybe(ctx,dut.i_stream)
                     if payload is not None:
-                        data = payload.port.data_in.i[0], payload.port.data_in.i[1]
+                        data = payload[0].actual.port.data_in.i, payload[1].actual.port.data_in.i
+                        assert payload[0].actual.i_valid
+                        assert payload[1].actual.i_valid
                         actually_sampled.append(data)
                 else:
                     await ctx.tick()
@@ -329,15 +334,30 @@ async def main_testbench(ctx):
                 if o_valid_bit:
                     if i_en_bit:
                         expected_samples_count += 1
-                    await stream_put(ctx, dut.o_stream, {
+                    await stream_put(ctx, dut.o_stream, [
+                    {
                         "meta": i,
-                        "i_en": i_en_bit,
-                        "port": {
-                            "clk_out": {
-                                "o": (i_en_bit, 0),
+                        "actual": {
+                            "i_en": i_en_bit,
+                            "port": {
+                                "clk_out": {
+                                    "o": i_en_bit,
+                                }
                             }
                         }
-                    })
+                    },
+                    {
+                        "meta": i,
+                        "actual": {
+                            "i_en": i_en_bit,
+                            "port": {
+                                "clk_out": {
+                                    "o": 0,
+                                }
+                            }
+                        }
+                    },
+                    ])
                 else:
                     await ctx.tick()
 
@@ -345,7 +365,7 @@ async def main_testbench(ctx):
                 await ctx.tick()
 
             assert len(actually_sampled) == expected_samples_count # This should be checked as well, because a
-            # possible failure mode is if IOStreamer never generates clock edges. We don't want to end up
+            # possible failure mode is if IOStreamerTop never generates clock edges. We don't want to end up
             # comparing two empty lists against eachother.
 
             assert actually_sampled == expected_sample, (f"Expected [" +
@@ -434,45 +454,45 @@ def test_basic(self):
         ports = PortGroup()
         ports.data = port = io.SimulationPort("io", 1)
 
-        dut = IOStreamer({
+        dut = IOStreamerTop({
             "data": ("io", 1),
         }, ports, meta_layout=4)
 
         async def testbench(ctx):
             await ctx.tick()
 
-            ctx.set(dut.o_stream.p.port.data.o[0], 1)
-            ctx.set(dut.o_stream.p.port.data.oe, 0)
-            ctx.set(dut.o_stream.p.i_en, 1)
-            ctx.set(dut.o_stream.p.meta, 1)
+            ctx.set(dut.o_stream.p[0].actual.port.data.o, 1)
+            ctx.set(dut.o_stream.p[0].actual.port.data.oe, 0)
+            ctx.set(dut.o_stream.p[0].actual.i_en, 1)
+            ctx.set(dut.o_stream.p[0].meta, 1)
             ctx.set(dut.o_stream.valid, 1)
             ctx.set(dut.i_stream.ready, 1)
             await ctx.tick()
             assert ctx.get(port.o[0]) == 1
             assert ctx.get(port.oe) == 0
             assert ctx.get(dut.i_stream.valid) == 1
-            assert ctx.get(dut.i_stream.p.port.data.i[0]) == 0
-            assert ctx.get(dut.i_stream.p.meta) == 1
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 0
+            assert ctx.get(dut.i_stream.p[0].meta) == 1
 
-            ctx.set(dut.o_stream.p.port.data.oe, 1)
-            ctx.set(dut.o_stream.p.meta, 2)
+            ctx.set(dut.o_stream.p[0].actual.port.data.oe, 1)
+            ctx.set(dut.o_stream.p[0].meta, 2)
             await ctx.tick()
             assert ctx.get(port.o[0]) == 1
             assert ctx.get(port.oe) == 1
             assert ctx.get(dut.i_stream.valid) == 1
-            assert ctx.get(dut.i_stream.p.port.data.i[0]) == 0
-            assert ctx.get(dut.i_stream.p.meta) == 2
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 0
+            assert ctx.get(dut.i_stream.p[0].meta) == 2
 
-            ctx.set(dut.o_stream.p.meta, 3)
+            ctx.set(dut.o_stream.p[0].meta, 3)
             await ctx.tick()
             assert ctx.get(port.o[0]) == 1
             assert ctx.get(port.oe) == 1
             assert ctx.get(dut.i_stream.valid) == 1
-            assert ctx.get(dut.i_stream.p.port.data.i[0]) == 1
-            assert ctx.get(dut.i_stream.p.meta) == 3
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i[0]) == 1
+            assert ctx.get(dut.i_stream.p[0].meta) == 3
 
-            ctx.set(dut.o_stream.p.port.data.o[0], 0)
-            ctx.set(dut.o_stream.p.i_en, 0)
+            ctx.set(dut.o_stream.p[0].actual.port.data.o[0], 0)
+            ctx.set(dut.o_stream.p[0].actual.i_en, 0)
             await ctx.tick()
             assert ctx.get(port.o[0]) == 0
             assert ctx.get(port.oe) == 1
@@ -495,7 +515,7 @@ def test_skid(self):
         ports = PortGroup()
         ports.data = port = io.SimulationPort("io", 4)
 
-        dut = IOStreamer({
+        dut = IOStreamerTop({
             "data": ("io", 4),
         }, ports, meta_layout=4)
 
@@ -504,31 +524,31 @@ async def testbench(ctx):
 
             ctx.set(dut.i_stream.ready, 1)
             ctx.set(dut.o_stream.valid, 1)
-            ctx.set(dut.o_stream.p.i_en, 1)
-            ctx.set(dut.o_stream.p.meta, 0b0101)
+            ctx.set(dut.o_stream.p[0].actual.i_en, 1)
+            ctx.set(dut.o_stream.p[0].meta, 0b0101)
             ctx.set(port.i, 0b0101)
 
             await ctx.tick()
 
-            assert ctx.get(dut.i_stream.p.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}"
-            assert ctx.get(dut.i_stream.p.meta) == 0b0101, f"{ctx.get(dut.i_stream.p.meta):#06b}"
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}"
+            assert ctx.get(dut.i_stream.p[0].meta) == 0b0101, f"{ctx.get(dut.i_stream.p[0].meta):#06b}"
 
-            ctx.set(dut.o_stream.p.meta, 0b1111)
+            ctx.set(dut.o_stream.p[0].meta, 0b1111)
             ctx.set(port.i, 0b1111)
 
             ctx.set(dut.i_stream.ready, 0)
 
             await ctx.tick().repeat(10)
             # The skid buffer should protect the input stream from changes on the input signal
-            assert ctx.get(dut.i_stream.p.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}"
-            assert ctx.get(dut.i_stream.p.meta) == 0b0101, f"{ctx.get(dut.i_stream.p.meta):#06b}"
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b0101, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}"
+            assert ctx.get(dut.i_stream.p[0].meta) == 0b0101, f"{ctx.get(dut.i_stream.p[0].meta):#06b}"
 
             ctx.set(dut.i_stream.ready, 1)
 
             await ctx.tick()
 
-            assert ctx.get(dut.i_stream.p.port.data.i) == 0b1111, f"{ctx.get(dut.i_stream.p.port.data.i):#06b}"
-            assert ctx.get(dut.i_stream.p.meta) == 0b1111, f"{ctx.get(dut.i_stream.p.meta):#06b}"
+            assert ctx.get(dut.i_stream.p[0].actual.port.data.i) == 0b1111, f"{ctx.get(dut.i_stream.p[0].actual.port.data.i):#06b}"
+            assert ctx.get(dut.i_stream.p[0].meta) == 0b1111, f"{ctx.get(dut.i_stream.p[0].meta):#06b}"
 
         sim = Simulator(dut)
         sim.add_clock(1e-6)
diff --git a/software/tests/gateware/test_qspi.py b/software/tests/gateware/test_qspi.py
index 65e43b488..fa46faeda 100644
--- a/software/tests/gateware/test_qspi.py
+++ b/software/tests/gateware/test_qspi.py
@@ -181,19 +181,36 @@ async def data_put(*, chip, data, mode):
         async def testbench_out(ctx):
             async def bits_get(*, cs, ox, oe, i_en, mode):
                 for cycle, o in enumerate(ox):
-                    expected = {
-                        "bypass": (cs == 0),
-                        "port": {
-                            "sck": {"o":        1, "oe":         1},
-                            "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1},
-                            "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1},
-                            "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1},
-                            "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1},
-                            "cs":  {"o":       cs, "oe":         1},
-                        },
-                        "i_en": i_en,
-                        "meta": mode
-                    }
+                    expected = [
+                            {
+                                "actual": {
+                                    "port": {
+                                        "sck": {"o":      ~cs, "oe":         1},
+                                        "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1},
+                                        "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1},
+                                        "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1},
+                                        "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1},
+                                        "cs":  {"o":       cs, "oe":         1},
+                                    },
+                                    "i_en": 0,
+                                },
+                                "meta": 0,
+                            },
+                            {
+                                "actual": {
+                                    "port": {
+                                        "sck": {"o":        1, "oe":         1},
+                                        "io0": {"o": (o>>0)&1, "oe": (oe>>0)&1},
+                                        "io1": {"o": (o>>1)&1, "oe": (oe>>1)&1},
+                                        "io2": {"o": (o>>2)&1, "oe": (oe>>2)&1},
+                                        "io3": {"o": (o>>3)&1, "oe": (oe>>3)&1},
+                                        "cs":  {"o":       cs, "oe":         1},
+                                    },
+                                    "i_en": i_en,
+                                },
+                                "meta": mode,
+                            },
+                        ]
                     assert (actual := await stream_get(ctx, dut.frames)) == expected, \
                         f"(cycle {cycle}) {actual} != {expected}"
 
@@ -232,15 +249,33 @@ def test_qspi_deframer(self):
         async def testbench_in(ctx):
             async def bits_put(*, ix, mode):
                 for cycle, i in enumerate(ix):
-                    await stream_put(ctx, dut.frames, {
-                        "port": {
-                            "io0": {"i": (i>>0)&1},
-                            "io1": {"i": (i>>1)&1},
-                            "io2": {"i": (i>>2)&1},
-                            "io3": {"i": (i>>3)&1},
-                        },
-                        "meta": mode
-                    })
+                    await stream_put(ctx, dut.frames,
+                        [
+                            {
+                                "actual": {
+                                    "port": {
+                                        "io0": {"i": 0},
+                                        "io1": {"i": 0},
+                                        "io2": {"i": 0},
+                                        "io3": {"i": 0},
+                                    },
+                                    "i_valid": 0,
+                                },
+                                "meta": mode
+                            },
+                            {
+                                "actual": {
+                                    "port": {
+                                        "io0": {"i": (i>>0)&1},
+                                        "io1": {"i": (i>>1)&1},
+                                        "io2": {"i": (i>>2)&1},
+                                        "io3": {"i": (i>>3)&1},
+                                    },
+                                    "i_valid": 1,
+                                },
+                                "meta": mode
+                            },
+                        ])
 
             await bits_put(ix=[i<<1 for i in [1,0,1,1,1,0,1,0]], mode=QSPIMode.Swap)
 
@@ -290,7 +325,7 @@ def subtest_qspi_controller(self, *, use_ddr_buffers:bool, divisor:int, roundtri
         ports.io  = io.SimulationPort("io", 4)
         ports.cs  = io.SimulationPort("o",  1)
 
-        dut = QSPIController(ports, use_ddr_buffers=use_ddr_buffers, sample_delay_half_clocks=sample_delay_half_clocks)
+        dut = QSPIController(ports, use_ddr_buffers=use_ddr_buffers, max_sample_delay_half_clocks=sample_delay_half_clocks, min_divisor=divisor)
 
         async def testbench_controller(ctx):
             async def ctrl_idle():
@@ -323,6 +358,7 @@ async def ctrl_get(*, mode, count=1):
                 return words
             if divisor is not None:
                 ctx.set(dut.divisor, divisor)
+            ctx.set(dut.sample_delay_half_clocks, sample_delay_half_clocks)
 
             await ctrl_idle()
 
@@ -435,3 +471,41 @@ def test_qspi_controller_needs_sample_delay_sdr_div0_too_little_turnaround(self)
                 pass
             else:
                 assert False, "QSPI controller should have failed with too little turnaround time"
+
+    # The Div10 tests are here to verify the optimized sample_request_delayer:
+    def test_qspi_controller_ddr_div10(self):
+        self.subtest_qspi_controller(divisor=10, use_ddr_buffers=True)  # no sample delay / roundtrip time passed explicitly
+
+    def test_qspi_controller_needs_sample_delay_ddr_div10_max_turnaround(self):
+        # Expects the subtest to pass for every delay when the roundtrip is (0.5 * delay + 0.49 * 2 * divisor) clock periods.
+        divisor, clk_period = 10, self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD
+        margin_s = clk_period * divisor * 2 * 0.49
+        for delay in [7, 8, 9, 10, 20]:
+            self.subtest_qspi_controller(use_ddr_buffers=True, divisor=divisor, sample_delay_half_clocks=delay,
+                                         roundtrip_time_s=clk_period * (0.5 * delay) + margin_s)
+
+    def test_qspi_controller_needs_sample_delay_ddr_div10_too_much_turnaround(self):
+        # Expects subtest_qspi_controller to fail an assertion for every delay when the roundtrip
+        # is (0.5 * delay + 0.51 * 2 * divisor) clock periods.
+        divisor, clk_period = 10, self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD
+        excess_s = clk_period * divisor * 2 * 0.51
+        for delay in [7, 8, 9, 10, 20]:
+            try:
+                self.subtest_qspi_controller(use_ddr_buffers=True, divisor=divisor, sample_delay_half_clocks=delay,
+                                             roundtrip_time_s=clk_period * (0.5 * delay) + excess_s)
+            except AssertionError:
+                continue  # the expected outcome: this delay was rejected
+            assert False, "QSPI controller should have failed with too much turnaround time"
+
+    def test_qspi_controller_needs_sample_delay_ddr_div10_too_little_turnaround(self):
+        # Expects an assertion failure for every delay when the roundtrip is (0.5 * delay - 0.51 * 2 * divisor) clock periods.
+        # NOTE(review): with divisor = 10 that is 0.5 * delay - 10.2, negative for every delay below (given a positive clock period) - confirm intended.
+        divisor, clk_period = 10, self.QSPI_CONTROLLER_SUBTEST_CLK_PERIOD
+        shortfall_s = clk_period * divisor * 2 * 0.51
+        for delay in [7, 8, 9, 10, 20]:
+            try:
+                self.subtest_qspi_controller(use_ddr_buffers=True, divisor=divisor, sample_delay_half_clocks=delay,
+                                             roundtrip_time_s=clk_period * (0.5 * delay) - shortfall_s)
+            except AssertionError:
+                continue  # the expected outcome: this delay was rejected
+            assert False, "QSPI controller should have failed with too little turnaround time"