diff --git a/net_finder/core/base_types.py b/net_finder/core/base_types.py
index db278d5..9c9c959 100644
--- a/net_finder/core/base_types.py
+++ b/net_finder/core/base_types.py
@@ -6,6 +6,10 @@
 from amaranth.utils import ceil_log2
 
 
+def next_power_of_two(n: int):
+    return 1 << ceil_log2(n)
+
+
 def net_size(max_area: int):
     """Returns the width/height of the net."""
 
diff --git a/net_finder/core/core.py b/net_finder/core/core.py
index 2b77cf9..1d6b94f 100644
--- a/net_finder/core/core.py
+++ b/net_finder/core/core.py
@@ -1,68 +1,839 @@
+import enum
+
 from amaranth import *
-from amaranth.lib import data
+from amaranth.lib import data, stream, wiring
+from amaranth.lib.fifo import SyncFIFO
+from amaranth.lib.memory import Memory, ReadPort
+from amaranth.lib.wiring import In, Out
 from amaranth.utils import ceil_log2
 
-from .base_types import instruction_layout
+from .base_types import mapping_layout, next_power_of_two
+from .main_pipeline import (
+    FINDERS_PER_CORE,
+    MainPipeline,
+    Task,
+    instruction_ref_layout,
+    max_decisions_len,
+    max_potential_len,
+    run_stack_entry_layout,
+)
+from .memory import ChunkedMemory
+from .neighbour_lookup import neighbour_lookup_layout
+from .net import shard_depth
+from .skip_checker import undo_lookup_layout
+from .utils import pipe
+
 
-FINDERS_PER_CORE = 3
+def max_run_stack_len(max_area: int):
+    """
+    Returns the maximum number of run instructions there can be at any given time.
+    """
 
+    # The run stack's length can't ever actually reach `max_area`: if every other
+    # square is already filled, the last one to be added will always be a potential
+    # instruction since all its neighbouring squares will already be filled.
+    return max_area - 1
 
-def instruction_ref_layout(max_area: int):
-    """Returns the layout of an instruction reference."""
 
+def prefix_layout(cuboids: int, max_area: int):
     return data.StructLayout(
         {
-            # The index of the instruction's parent in the run stack.
-            "parent": ceil_log2(max_area),
-            # The index of this instruction in its parent's list of valid children.
-            #
-            # If the index is past the end of that list, it represents the last valid
-            # child. Then we always store the last valid child as 11, so that when
-            # backtracking we can immediately see 'oh this is the last one, so we need to
-            # move onto the next instruction'.
-            "child_index": 2,
+            "area": range(max_area + 1),
+            "start_mapping": mapping_layout(cuboids, max_area),
+            "start_mapping_index": (cuboids - 1) * ceil_log2(max_area),
+            # I think this can reach `max_decisions_len`: if you can get to a particular length,
+            # it had to end with 1 at some point, at which point you can split and increment
+            # the base decision.
+            "base_decision": range(max_decisions_len(max_area) + 1),
         }
     )
 
 
-def max_potential_len(max_area: int):
-    """
-    Returns the maximum number of potential instructions there can be at any given
-    time.
-    """
+class FinderType(enum.Enum):
+    """The reason a finder is being emitted."""
 
-    # The upper bound of how many potential instructions there can be is if every
-    # square on the surfaces, except for the ones set by the first instruction, has
-    # 4 potential instructions trying to set it: 1 from each direction.
-    #
-    # While this isn't actually possible, it's a nice clean upper bound.
-    return 4 * (max_area - 1)
+    # The state the finder's in has passed the initial test for likely having a
+    # solution (there's an instruction or potential instruction setting every square
+    # on the surfaces).
+    Solution = 0
 
+    # The finder is a response to `req_split`.
+    Split = 1
 
-def max_decisions_len(max_area: int):
-    """Returns the maximum number of decisions there can be at any given time."""
+    # The finder is a response to `req_pause`.
+    Pause = 2
 
-    # There's always 1 decision for the first instruction, then the upper bound is
-    # that every square has 4 instructions setting it, 3 of which we decided not to
-    # run and the last one we did.
-    return 1 + 4 * (max_area - 1)
 
+def core_in_layout():
+    return data.StructLayout(
+        {
+            # The bit of the finder being received.
+            "data": 1,
+            # Whether `data` is the last bit of this finder.
+            "last": 1,
+        }
+    )
 
-def run_stack_entry_layout(cuboids: int, max_area: int):
-    """Returns the layout of a run stack entry."""
 
+def core_out_layout():
     return data.StructLayout(
         {
-            # The instruction that was run.
-            "instruction": instruction_layout(cuboids, max_area),
-            # A reference to where in the run stack this instruction originally came from.
-            "source": instruction_ref_layout(max_area),
-            # Whether this instruction's child in each direction was valid at the time this
-            # instruction was run.
-            "children": 4,
-            # The number of potential instructions there were at the point when it was run.
-            "potential_len": ceil_log2(max_potential_len(max_area) + 1),
-            # The index of the decision to run this instruction in the list of decisions.
-            "decision_index": ceil_log2(max_decisions_len(max_area)),
+            # The bit of the finder being sent.
+            "data": 1,
+            # Whether `data` is the last bit of this finder.
+            "last": 1,
+            # The reason this finder is being emitted (stays the same for the whole finder).
+            "type": FinderType,
         }
     )
+
+
+class CoreInterface(wiring.Signature):
+    def __init__(self):
+        super().__init__(
+            {
+                "input": In(stream.Signature(core_in_layout())),
+                "output": Out(stream.Signature(core_out_layout())),
+                "req_pause": In(1),
+                "req_split": In(1),
+                "wants_finder": Out(1),
+                "stepping": Out(1),
+                "active": Out(1),
+                "base_decision": Out(1),
+            }
+        )
+
+
+class State(enum.Enum):
+    Clear = 0
+    Receive = 1
+    Run = 2
+    Check = 3
+    Solution = 4
+    Pause = 5
+    Split = 6
+
+
+class Core(wiring.Component):
+    def __init__(self, cuboids: int, max_area: int):
+        nl_layout = neighbour_lookup_layout(max_area)
+        ul_layout = undo_lookup_layout(max_area)
+
+        super().__init__(
+            {
+                "interfaces": Out(CoreInterface()).array(FINDERS_PER_CORE),
+                # The ports this core should use to access the neighbour lookups.
+                "neighbour_lookups": In(
+                    ReadPort.Signature(
+                        addr_width=ceil_log2(nl_layout.depth),
+                        shape=nl_layout.shape,
+                    )
+                ).array(cuboids),
+                # The ports this core should use to access the undo lookups.
+                "undo_lookups": In(
+                    ReadPort.Signature(
+                        addr_width=ceil_log2(ul_layout.depth),
+                        shape=ul_layout.shape,
+                    )
+                ).array(cuboids - 1),
+                # The state that the finder in WB stage is in.
+                #
+                # It doesn't really matter which stage this comes from, though: the point is
+                # just to find out what state most of the core's time is being spent in, and all
+                # the finders will pass through WB stage once per iteration.
+                #
+                # I chose WB stage because its state comes out of a register, so we don't have
+                # to worry about critical paths extending into the core.
+                "state": Out(State),
+            }
+        )
+
+        self._cuboids = cuboids
+        self._max_area = max_area
+
+    def elaborate(self, platform) -> Module:
+        m = Module()
+
+        run_stack = Memory(
+            shape=run_stack_entry_layout(self._cuboids, self._max_area),
+            depth=FINDERS_PER_CORE
+            * next_power_of_two(max_run_stack_len(self._max_area)),
+            init=[],
+        )
+        m.submodules.run_stack = run_stack
+
+        potential = ChunkedMemory(
+            shape=instruction_ref_layout(self._max_area),
+            depth=max_potential_len(self._max_area),
+            chunks=FINDERS_PER_CORE,
+        )
+        m.submodules.potential = potential
+        potential_read, potential_write = potential.sdp_port(read_domain="comb")
+
+        decisions = ChunkedMemory(
+            shape=1, depth=max_decisions_len(self._max_area), chunks=FINDERS_PER_CORE
+        )
+        m.submodules.decisions = decisions
+        decisions_read, decisions_write = decisions.sdp_port()
+
+        potential_surface_ports = []
+        for i in range(self._cuboids):
+            surface = ChunkedMemory(
+                shape=1, depth=self._max_area, chunks=FINDERS_PER_CORE
+            )
+            m.submodules[f"potential_surface_{i}"] = surface
+            potential_surface_ports.append(surface.sdp_port())
+
+        in_fifos = []
+        out_fifos = []
+
+        for i in range(FINDERS_PER_CORE):
+            in_fifo = SyncFIFO(width=2, depth=1)
+            m.submodules[f"in_fifo_{i}"] = in_fifo
+
+            wiring.connect(
+                m, wiring.flipped(self.interfaces[i].input), in_fifo.w_stream
+            )
+            in_fifos.append(in_fifo)
+
+            out_fifo = SyncFIFO(width=4, depth=1)
+            m.submodules[f"out_fifo_{i}"] = out_fifo
+
+            wiring.connect(
+                m, out_fifo.r_stream, wiring.flipped(self.interfaces[i].output)
+            )
+            out_fifos.append(out_fifo)
+
+        wb_state = Signal(State)
+        # The 'normal' instruction that WB stage wanted to run next: so, not the first
+        # instruction or a potential instruction.
+        #
+        # `wb_target.parent` is allowed to be past the end of the run stack: that means
+        # that whenever this ends up getting processed, it'll actually result in a
+        # backtrack instead of this being run.
+        wb_target = Signal(instruction_ref_layout(self._max_area))
+        # Whether WB stage actually ended up processing `wb_target`.
+        wb_target_processed = Signal(1)
+        # Like `wb_target`, except that when `wb_target.parent` is past the end of the
+        # run stack, this is the instruction that was backtracked.
+        wb_inst_ref = Signal(instruction_ref_layout(self._max_area))
+        wb_potential_index = Signal(range(max_potential_len(self._max_area)))
+        wb_decision_index = Signal(range(max_decisions_len(self._max_area)))
+        # The index we're clearing: used both for Clear state and clearing the potential
+        # surfaces in the background.
+        wb_clear_index = Signal(range(shard_depth(self._max_area)))
+        wb_prefix_done = Signal(1)
+
+        # The value WB stage read from `decisions`.
+        wb_read_decision = Signal(1)
+        # If we're in Check state, whether or not we need to wait for the potential
+        # surfaces to finish being cleared before proceeding.
+        wb_clearing = Signal(1)
+        # If we're in Split state, whether we've already finished sending the finder and
+        # are now just searching for a new `base_decision`.
+        wb_finder_done = Signal(1)
+        wb_received = Signal(1)
+        wb_sent = Signal(1)
+        wb_in = Signal(core_in_layout())
+
+        wb_next_prefix = Signal(prefix_layout(self._cuboids, self._max_area))
+        wb_next_prefix_bits_left = Signal(range(wb_next_prefix.shape().size + 1))
+        wb_next_run_stack_len = Signal(range(max_run_stack_len(self._max_area) + 1))
+        wb_next_potential_len = Signal(range(max_potential_len(self._max_area) + 1))
+        wb_next_decisions_len = Signal(range(max_decisions_len(self._max_area) + 1))
+        wb_next_potential_areas = [
+            Signal(range(self._max_area + 1)) for i in range(self._cuboids)
+        ]
+
+        # Where we are in the cycle of finders moving through different pipeline stages.
+        #
+        # More concretely, the pipeline stage finder 0 is in.
+        finder_offset = Signal(range(FINDERS_PER_CORE))
+        with m.If(finder_offset == FINDERS_PER_CORE - 1):
+            m.d.sync += finder_offset.eq(0)
+        with m.Else():
+            m.d.sync += finder_offset.eq(finder_offset + 1)
+
+        # IF
+        #
+        # I considered making this pipeline stage run at the same time as WB stage, to
+        # reduce the number of finders per core and hence resources used; but doing it
+        # that way would require implementing manual forwarding of potential
+        # instructions being written in WB stage, and while that wouldn't be
+        # particularly hard, merging the two pipeline stages together would be premature
+        # optimisation and so we shouldn't do it if it'll make the code worse.
+        #
+        # In addition, a merged WB/IF stage could very well end up being the critical
+        # path of the design, so it's not exactly as though merging them would be a
+        # guaranteed win - we may well have ended up having to split them up later
+        # anyway.
+
+        if_finder = (FINDERS_PER_CORE - finder_offset) % FINDERS_PER_CORE
+
+        if_prev_state = pipe(m, wb_state)
+        if_prev_target = pipe(m, wb_target)
+        if_prev_target_processed = pipe(m, wb_target_processed)
+        if_prev_inst_ref = pipe(m, wb_inst_ref)
+        if_prev_potential_index = pipe(m, wb_potential_index)
+        if_prev_decision_index = pipe(m, wb_decision_index)
+        if_prev_clear_index = pipe(m, wb_clear_index)
+        if_prev_prefix_done = pipe(m, wb_prefix_done)
+        if_prev_read_decision = pipe(m, wb_read_decision)
+        if_prev_clearing = pipe(m, wb_clearing)
+        if_prev_finder_done = pipe(m, wb_finder_done)
+        if_prev_received = pipe(m, wb_received)
+        if_prev_sent = pipe(m, wb_sent)
+        if_prev_in = pipe(m, wb_in)
+
+        if_initial_prefix = pipe(m, wb_next_prefix)
+        if_prefix_bits_left = pipe(
+            m, wb_next_prefix_bits_left, init=wb_next_prefix.shape().size
+        )  # Sized from `wb_next_prefix`: `if_prefix` isn't assigned until later on.
+        if_run_stack_len = pipe(m, wb_next_run_stack_len)
+        if_potential_len = pipe(m, wb_next_potential_len)
+        if_decisions_len = pipe(m, wb_next_decisions_len)
+        if_potential_areas = [
+            pipe(m, wb_next_potential_areas[i]) for i in range(self._cuboids)
+        ]
+
+        if_req_pause = Array(self.interfaces)[if_finder].req_pause
+        if_req_split = Array(self.interfaces)[if_finder].req_split
+
+        if_prefix_done = if_prefix_bits_left == 0
+
+        if_next_target = Signal(instruction_ref_layout(self._max_area))
+        # TODO: this is the same as `if_prev_inst_ref + 1` (if we switch around the field
+        # order). I think this is clearer, but switch to that if it ends up improving
+        # performance.
+        with m.If(if_prev_inst_ref.child_index == 3):
+            m.d.comb += if_next_target.parent.eq(if_prev_inst_ref.parent + 1)
+            m.d.comb += if_next_target.child_index.eq(0)
+        with m.Else():
+            m.d.comb += if_next_target.parent.eq(if_prev_inst_ref.parent)
+            m.d.comb += if_next_target.child_index.eq(if_prev_inst_ref.child_index + 1)
+
+        # If WB stage processed its target, we can move on to the next one, otherwise we
+        # need to keep trying to process `if_prev_target`.
+        if_target = data.View(
+            instruction_ref_layout(self._max_area),
+            Mux(if_prev_target_processed, if_next_target, if_prev_target),
+        )
+
+        if_backtrack = (if_run_stack_len != 0) & (if_target.parent == if_run_stack_len)
+
+        if_potential_index = Mux(
+            if_prev_state == State.Check, if_prev_potential_index + ~if_prev_clearing, 0
+        )
+
+        decision_sent = if_prev_sent & if_prev_prefix_done
+        # This needs to be able to go up to `max_decisions_len` so that we can use
+        # `decision_index == decisions_len` as a check for whether we're done
+        # transmitting.
+        if_decision_index = Signal(range(max_decisions_len(self._max_area) + 1))
+        with m.Switch(if_prev_state):
+            with m.Case(State.Solution, State.Pause):
+                m.d.comb += if_decision_index.eq(if_prev_decision_index + decision_sent)
+            with m.Case(State.Split):
+                m.d.comb += if_decision_index.eq(
+                    if_prev_decision_index + (decision_sent | if_prev_finder_done)
+                )
+            with m.Default():
+                m.d.comb += if_decision_index.eq(0)
+
+        # We only get to Clear state via resetting, which will reset this to 0 anyway:
+        # so the only time we actually need to reset it is when exiting Check state so
+        # that we don't waste time clearing addresses the potential surfaces don't have.
+        if_clear_index = Mux(if_prev_state == State.Check, 0, if_prev_clear_index + 1)
+
+        if_state = Signal(State)
+        if_prefix = Signal.like(if_initial_prefix)
+        m.d.comb += if_prefix.eq(if_initial_prefix)
+
+        with m.Switch(if_prev_state):
+            with m.Case(State.Clear):
+                with m.If(if_prev_clear_index == shard_depth(self._max_area) - 1):
+                    m.d.comb += if_state.eq(State.Receive)
+                with m.Else():
+                    m.d.comb += if_state.eq(State.Clear)
+            with m.Case(State.Receive):
+                with m.If(if_prev_received & if_prev_in.last):
+                    m.d.comb += if_state.eq(State.Run)
+                with m.Else():
+                    m.d.comb += if_state.eq(State.Receive)
+            with m.Case(State.Run):
+                with m.If(if_req_pause):
+                    m.d.comb += if_state.eq(State.Pause)
+                with m.Elif(
+                    if_req_split & (if_initial_prefix.base_decision < if_decisions_len)
+                ):
+                    m.d.comb += if_state.eq(State.Split)
+                    # Set `base_decision` to what the base decision of the finder we're sending will
+                    # be (1 past the end of its decisions), so that it gets sent out along with the
+                    # rest of the prefix.
+                    #
+                    # However, it might not be our new base decision, since it might be a 0: we'll
+                    # fix it up once we find the first 1 past our old base decision.
+                    m.d.comb += if_prefix.base_decision.eq(
+                        if_initial_prefix.base_decision + 1
+                    )
+                with m.Elif(
+                    if_backtrack
+                    & (if_run_stack_len + if_potential_len >= if_initial_prefix.area)
+                ):
+                    # There are enough run + potential instructions that we might have a solution,
+                    # so check for that before we backtrack.
+                    m.d.comb += if_state.eq(State.Check)
+                with m.Else():
+                    # Note that this also covers the case where we backtrack immediately.
+                    m.d.comb += if_state.eq(State.Run)
+            with m.Case(State.Check):
+                all_squares_filled = Cat(
+                    if_run_stack_len + if_potential_areas[i] == if_initial_prefix.area
+                    for i in range(self._cuboids)
+                ).all()
+                with m.If(all_squares_filled):
+                    # All the squares are filled, which means we have a potential solution!
+                    m.d.comb += if_state.eq(State.Solution)
+                # Note: although `wb_potential_index` can only go up to `max_potential_len - 1`,
+                # this can go all the way up to `max_potential_len` thanks to Amaranth inferring
+                # a shape big enough to fit all possible values of `if_prev_potential_index +
+                # ~if_prev_clearing`.
+                with m.Elif(if_potential_index == if_potential_len):
+                    # We've run all the potential instructions and not all the squares are filled,
+                    # so this isn't a solution. Time to backtrack.
+                    m.d.comb += if_state.eq(State.Run)
+                with m.Else():
+                    m.d.comb += if_state.eq(State.Check)
+            with m.Case(State.Solution):
+                with m.If(if_prefix_done & (if_decision_index == if_decisions_len)):
+                    m.d.comb += if_state.eq(State.Run)
+                with m.Else():
+                    m.d.comb += if_state.eq(State.Solution)
+            with m.Case(State.Pause):
+                # We transition out of `State.Pause` via `local_reset`, rather than via a
+                # regular state transition.
+                m.d.comb += if_state.eq(State.Pause)
+            with m.Case(State.Split):
+                # We don't actually stop once the finder is sent like you might expect: since
+                # `base_decision` can't point to a 0, we have to keep going until we find a 1 to
+                # set it to.
+                with m.If(if_prev_finder_done & if_prev_read_decision):
+                    m.d.comb += if_state.eq(State.Run)
+                    m.d.comb += if_prefix.base_decision.eq(if_prev_decision_index)
+                with m.Else():
+                    m.d.comb += if_state.eq(State.Split)
+
+        m.d.comb += potential_read.chunk.eq(if_finder)
+        m.d.comb += potential_read.addr.eq(if_potential_index)
+
+        if_run_stack_index = Signal(range(max_run_stack_len(self._max_area)))
+        with m.If(if_state == State.Check):
+            m.d.comb += if_run_stack_index.eq(potential_read.data.parent)
+        with m.Elif(if_backtrack):
+            m.d.comb += if_run_stack_index.eq(if_run_stack_len - 1)
+        with m.Else():
+            m.d.comb += if_run_stack_index.eq(if_target.parent)
+
+        if_child_index = Mux(
+            if_state == State.Check,
+            potential_read.data.child_index,
+            if_target.child_index,
+        )
+
+        if_in_fifo = Array(in_fifos)[if_finder]
+        if_in_rdy = if_in_fifo.r_rdy
+        if_in = data.View(core_in_layout(), if_in_fifo.r_data)
+
+        if_task = Signal(Task)
+        with m.Switch(if_state):
+            with m.Case(State.Clear):
+                m.d.comb += if_task.eq(Task.Clear)
+            with m.Case(State.Receive):
+                m.d.comb += if_task.eq(
+                    # If we've received a decision of 1, we need to run the next valid instruction
+                    # to fulfil it; otherwise, we want to not run the next valid instruction, but we
+                    # still need to check whether it was valid so that we know whether we can move
+                    # on to the next decision.
+                    Mux(
+                        if_prefix_done & if_in_rdy & if_in.data,
+                        Task.Advance,
+                        # This also serves as a no-op in the case where there isn't another bit
+                        # available yet.
+                        Task.Check,
+                    )
+                )
+            with m.Case(State.Run):
+                m.d.comb += if_task.eq(Mux(if_backtrack, Task.Backtrack, Task.Advance))
+            with m.Case(State.Check):
+                m.d.comb += if_task.eq(Task.Check)
+            with m.Default():
+                # In states that don't need the main pipeline, give it the Check task, since it
+                # doesn't have any side effects.
+                m.d.comb += if_task.eq(Task.Check)
+
+        run_stack_read = run_stack.read_port()
+        m.d.comb += run_stack_read.addr.eq(Cat(if_run_stack_index, if_finder))
+
+        # NL
+
+        nl_finder = (FINDERS_PER_CORE + 1 - finder_offset) % FINDERS_PER_CORE
+
+        nl_initial_state = pipe(m, if_state)
+        nl_initial_prefix_bits_left = pipe(
+            m, if_prefix_bits_left, init=if_prefix.shape().size
+        )
+        nl_initial_decisions_len = pipe(m, if_decisions_len)
+        nl_initial_task = pipe(m, if_task)
+        nl_prefix = pipe(m, if_prefix)
+        nl_decision_index = pipe(m, if_decision_index)
+        nl_entry = run_stack_read.data
+        local_reset = (
+            (nl_initial_state == State.Pause)
+            & (nl_initial_prefix_bits_left == 0)
+            & (nl_decision_index == nl_initial_decisions_len)
+        ) | (
+            (nl_initial_task == Task.Backtrack)
+            & (nl_entry.decision_index < nl_prefix.base_decision)
+        )
+
+        nl_state = Mux(local_reset, State.Clear, nl_initial_state)
+        nl_prefix_bits_left = Mux(
+            local_reset, nl_prefix.shape().size, nl_initial_prefix_bits_left
+        )
+        nl_run_stack_len = Mux(local_reset, 0, pipe(m, if_run_stack_len))
+        nl_potential_len = Mux(local_reset, 0, pipe(m, if_potential_len))
+        nl_decisions_len = Mux(local_reset, 0, nl_initial_decisions_len)
+        nl_potential_areas = [
+            Mux(local_reset, 0, pipe(m, if_potential_areas[i]))
+            for i in range(self._cuboids)
+        ]
+        nl_prefix_done = nl_prefix_bits_left == 0
+        nl_target = pipe(m, if_target)
+        nl_potential_index = pipe(m, if_potential_index)
+        nl_clear_index = Mux(local_reset, 0, pipe(m, if_clear_index))
+        nl_child_index = pipe(m, if_child_index)
+        nl_in_rdy = pipe(m, if_in_rdy)
+        nl_in = pipe(m, if_in)
+        nl_task = Mux(local_reset, Task.Clear, nl_initial_task)
+
+        main_pipeline = MainPipeline(self._cuboids, self._max_area)
+        m.submodules.main_pipeline = main_pipeline
+
+        m.d.comb += main_pipeline.finder.eq(nl_finder)
+        m.d.comb += main_pipeline.start_mapping_index.eq(nl_prefix.start_mapping_index)
+        m.d.comb += main_pipeline.task.eq(nl_task)
+        m.d.comb += main_pipeline.entry.eq(nl_entry)
+        with m.If(nl_run_stack_len == 0):
+            m.d.comb += main_pipeline.entry.instruction.pos.x.eq(0)
+            m.d.comb += main_pipeline.entry.instruction.pos.y.eq(0)
+            m.d.comb += main_pipeline.entry.instruction.mapping.eq(
+                nl_prefix.start_mapping
+            )
+        m.d.comb += main_pipeline.child.eq(
+            (nl_task != Task.Backtrack) & (nl_run_stack_len != 0)
+        )
+        m.d.comb += main_pipeline.child_index.eq(nl_child_index)
+        m.d.comb += main_pipeline.clear_index.eq(nl_clear_index)
+        for i in range(self._cuboids):
+            wiring.connect(
+                m,
+                main_pipeline.neighbour_lookups[i],
+                wiring.flipped(self.neighbour_lookups[i]),
+            )
+        for i in range(self._cuboids - 1):
+            wiring.connect(
+                m,
+                main_pipeline.undo_lookups[i],
+                wiring.flipped(self.undo_lookups[i]),
+            )
+
+        # VC
+
+        vc_finder = (FINDERS_PER_CORE + 2 - finder_offset) % FINDERS_PER_CORE
+
+        vc_state = pipe(m, nl_state)
+        vc_prefix = pipe(m, nl_prefix)
+        vc_prefix_bits_left = pipe(m, nl_prefix_bits_left, init=vc_prefix.shape().size)
+        vc_run_stack_len = pipe(m, nl_run_stack_len)
+        vc_potential_len = pipe(m, nl_potential_len)
+        vc_decisions_len = pipe(m, nl_decisions_len)
+        vc_potential_areas = [
+            pipe(m, nl_potential_areas[i]) for i in range(self._cuboids)
+        ]
+        vc_prefix_done = pipe(m, nl_prefix_done)
+        vc_target = pipe(m, nl_target)
+        vc_potential_index = pipe(m, nl_potential_index)
+        vc_decision_index = pipe(m, nl_decision_index)
+        vc_clear_index = pipe(m, nl_clear_index)
+        vc_in_rdy = pipe(m, nl_in_rdy)
+        vc_in = pipe(m, nl_in)
+        vc_task = pipe(m, nl_task)
+        vc_entry = pipe(m, nl_entry)
+
+        vc_instruction = main_pipeline.instruction
+
+        for i in range(self._cuboids):
+            read_port, _ = potential_surface_ports[i]
+            m.d.comb += read_port.chunk.eq(vc_finder)
+            m.d.comb += read_port.addr.eq(vc_instruction.mapping[i])
+
+        m.d.comb += decisions_read.chunk.eq(vc_finder)
+        m.d.comb += decisions_read.addr.eq(vc_decision_index)
+
+        # WB
+
+        wb_finder = FINDERS_PER_CORE - 1 - finder_offset
+
+        m.d.sync += wb_state.eq(vc_state)
+        wb_prefix = pipe(m, vc_prefix)
+        wb_prefix_bits_left = pipe(m, vc_prefix_bits_left, init=wb_prefix.shape().size)
+        wb_run_stack_len = pipe(m, vc_run_stack_len)
+        wb_potential_len = pipe(m, vc_potential_len)
+        wb_decisions_len = pipe(m, vc_decisions_len)
+        wb_potential_areas = [
+            pipe(m, vc_potential_areas[i]) for i in range(self._cuboids)
+        ]
+        m.d.sync += wb_prefix_done.eq(vc_prefix_done)
+        m.d.sync += wb_target.eq(vc_target)
+        m.d.sync += wb_potential_index.eq(vc_potential_index)
+        m.d.sync += wb_decision_index.eq(vc_decision_index)
+        m.d.sync += wb_clear_index.eq(vc_clear_index)
+        wb_in_rdy = pipe(m, vc_in_rdy)
+        m.d.sync += wb_in.eq(vc_in)
+        wb_task = pipe(m, vc_task)
+        wb_entry = pipe(m, vc_entry)
+        wb_instruction = pipe(m, vc_instruction)
+
+        wb_instruction_valid = main_pipeline.instruction_valid
+        wb_neighbours_valid = main_pipeline.neighbours_valid
+
+        m.d.comb += wb_read_decision.eq(decisions_read.data)
+
+        wb_run = (
+            (wb_task == Task.Advance) & wb_instruction_valid & wb_neighbours_valid.any()
+        )
+        wb_potential = (
+            (wb_task == Task.Advance)
+            & wb_instruction_valid
+            & ~wb_neighbours_valid.any()
+        )
+
+        m.d.comb += wb_received.eq(
+            (wb_state == State.Receive)
+            & wb_in_rdy
+            # If the instruction wasn't valid, whether or not to run it wasn't a decision.
+            & ~(wb_prefix_done & (~wb_instruction_valid | ~wb_neighbours_valid.any()))
+        )
+        for i in range(FINDERS_PER_CORE):
+            m.d.comb += in_fifos[i].r_en.eq((wb_finder == i) & wb_received)
+
+        m.d.comb += wb_finder_done.eq(
+            wb_prefix_done & (wb_decision_index >= wb_prefix.base_decision)
+        )
+
+        wb_out_fifo = Array(out_fifos)[wb_finder]
+        m.d.comb += wb_sent.eq(
+            (
+                (wb_state == State.Solution)
+                | (wb_state == State.Pause)
+                | ((wb_state == State.Split) & ~wb_finder_done)
+            )
+            & wb_out_fifo.w_rdy
+        )
+
+        split_reached = (wb_state == State.Split) & (
+            wb_decision_index == wb_prefix.base_decision - 1
+        )
+        wb_out = Signal(core_out_layout())
+        m.d.comb += wb_out.data.eq(
+            Mux(
+                wb_prefix_done,
+                wb_read_decision & ~split_reached,
+                wb_prefix.as_value()[-1],
+            )
+        )
+        m.d.comb += wb_out.last.eq(
+            Mux(
+                wb_prefix_done,
+                (wb_decision_index == wb_decisions_len - 1) | split_reached,
+                (wb_decisions_len == 0) & (wb_prefix_bits_left == 1),
+            )
+        )
+        with m.Switch(wb_state):
+            with m.Case(State.Solution):
+                m.d.comb += wb_out.type.eq(FinderType.Solution)
+            with m.Case(State.Pause):
+                m.d.comb += wb_out.type.eq(FinderType.Pause)
+            with m.Case(State.Split):
+                m.d.comb += wb_out.type.eq(FinderType.Split)
+            # It doesn't really matter what this is in other states, leave it as 0.
+
+        for i in range(FINDERS_PER_CORE):
+            m.d.comb += out_fifos[i].w_data.eq(wb_out)
+            m.d.comb += out_fifos[i].w_en.eq((wb_finder == i) & wb_sent)
+
+            m.d.comb += self.interfaces[i].stepping.eq(
+                (wb_finder == i) & (wb_run | (wb_task == Task.Backtrack))
+            )
+
+        last_child = wb_target.child_index == sum(wb_entry.children) - 1
+        normalised_target = Signal.like(wb_target)
+        m.d.comb += normalised_target.parent.eq(wb_target.parent)
+        m.d.comb += normalised_target.child_index.eq(
+            Mux(last_child, 3, wb_target.child_index)
+        )
+
+        run_stack_write = run_stack.write_port()
+        m.d.comb += run_stack_write.addr.eq(Cat(wb_run_stack_len, wb_finder))
+        m.d.comb += run_stack_write.data.instruction.eq(wb_instruction)
+        m.d.comb += run_stack_write.data.source.eq(normalised_target)
+        m.d.comb += run_stack_write.data.children.eq(wb_neighbours_valid)
+        m.d.comb += run_stack_write.data.potential_len.eq(wb_potential_len)
+        m.d.comb += run_stack_write.data.decision_index.eq(wb_decisions_len)
+        m.d.comb += run_stack_write.en.eq(wb_run)
+
+        m.d.comb += potential_write.chunk.eq(wb_finder)
+        m.d.comb += potential_write.addr.eq(wb_potential_len)
+        m.d.comb += potential_write.data.eq(wb_target)
+        m.d.comb += potential_write.en.eq(wb_potential)
+
+        m.d.comb += decisions_write.chunk.eq(wb_finder)
+        m.d.comb += decisions_write.addr.eq(
+            Mux(wb_task == Task.Backtrack, wb_entry.decision_index, wb_decisions_len)
+        )
+        m.d.comb += decisions_write.data.eq(
+            Mux(wb_state == State.Receive, wb_in.data, wb_task != Task.Backtrack)
+        )
+        m.d.comb += decisions_write.en.eq(
+            (wb_received & wb_prefix_done) | wb_run | (wb_task == Task.Backtrack)
+        )
+
+        # The `wb_potential_index == 0` is necessary because otherwise Check state would
+        # freeze up as soon as it ran the first potential instruction and the potential
+        # surfaces weren't empty anymore.
+        m.d.comb += wb_clearing.eq(
+            (wb_potential_index == 0)
+            & Cat(wb_potential_areas[i] != 0 for i in range(self._cuboids)).any()
+        )
+        for i in range(self._cuboids):
+            _, write_port = potential_surface_ports[i]
+            m.d.comb += write_port.chunk.eq(wb_finder)
+            m.d.comb += write_port.addr.eq(
+                Mux(
+                    wb_state == State.Check,
+                    wb_instruction.mapping[i].square,
+                    wb_clear_index,
+                )
+            )
+            m.d.comb += write_port.data.eq(wb_state == State.Check)
+            m.d.comb += write_port.en.eq(
+                Mux(
+                    wb_state == State.Check,
+                    wb_instruction_valid,
+                    wb_clear_index < self._max_area,
+                )
+            )
+
+        shift_prefix = (wb_received | wb_sent) & ~wb_prefix_done
+        prefix_in = Mux(wb_state == State.Receive, wb_in.data, wb_prefix.as_value()[-1])
+        m.d.comb += wb_next_prefix.eq(
+            Mux(shift_prefix, Cat(prefix_in, wb_prefix.as_value()[:-1]), wb_prefix)
+        )
+        m.d.comb += wb_next_prefix_bits_left.eq(
+            Mux(
+                (wb_state == State.Receive)
+                | (wb_state == State.Solution)
+                | (wb_state == State.Pause)
+                | (wb_state == State.Split),
+                wb_prefix_bits_left - shift_prefix,
+                wb_prefix.shape().size,
+            )
+        )
+
+        with m.If(
+            (wb_task == Task.Backtrack)
+            & (wb_entry.decision_index == wb_prefix.base_decision)
+        ):
+            m.d.comb += wb_next_prefix.base_decision.eq(wb_prefix.base_decision + 1)
+
+        with m.If(wb_task == Task.Backtrack):
+            m.d.comb += wb_next_run_stack_len.eq(wb_run_stack_len - 1)
+            m.d.comb += wb_next_potential_len.eq(wb_entry.potential_len)
+        with m.Else():
+            m.d.comb += wb_next_run_stack_len.eq(wb_run_stack_len + wb_run)
+            m.d.comb += wb_next_potential_len.eq(wb_potential_len + wb_potential)
+
+        with m.Switch(wb_state):
+            with m.Case(State.Receive):
+                m.d.comb += wb_next_decisions_len.eq(
+                    wb_decisions_len + (wb_received & wb_prefix_done)
+                )
+            with m.Case(State.Run):
+                with m.If(wb_task == Task.Backtrack):
+                    m.d.comb += wb_next_decisions_len.eq(wb_entry.decision_index + 1)
+                with m.Else():
+                    m.d.comb += wb_next_decisions_len.eq(wb_decisions_len + wb_run)
+            with m.Default():
+                m.d.comb += wb_next_decisions_len.eq(wb_decisions_len)
+
+        for i in range(self._cuboids):
+            read_port, write_port = potential_surface_ports[i]
+            with m.If(wb_state == State.Clear):
+                m.d.comb += wb_next_potential_areas[i].eq(wb_potential_areas[i])
+            with m.Elif(write_port.en & (read_port.data == 0) & (write_port.data == 1)):
+                m.d.comb += wb_next_potential_areas[i].eq(wb_potential_areas[i] + 1)
+            with m.Elif(write_port.en & (read_port.data == 1) & (write_port.data == 0)):
+                m.d.comb += wb_next_potential_areas[i].eq(wb_potential_areas[i] - 1)
+            with m.Else():
+                m.d.comb += wb_next_potential_areas[i].eq(wb_potential_areas[i])
+
+        m.d.comb += wb_target_processed.eq(
+            (
+                ((wb_state == State.Receive) & wb_prefix_done & wb_in_rdy)
+                | (wb_state == State.Run)
+            )
+            & (wb_run_stack_len != 0)
+        )
+
+        # Arguably we should consider Check state here too, but this is only used for
+        # computing the next target anyway so it doesn't really matter.
+        #
+        # We use `normalised_target` here so that we can always find the next target by
+        # just adding 1 to `child_index`.
+        m.d.comb += wb_inst_ref.eq(
+            Mux(wb_task == Task.Backtrack, wb_entry.source, normalised_target)
+        )
+
+        for i in range(FINDERS_PER_CORE):
+            state = Signal(State)
+            prefix = Signal.like(if_prefix)
+            prefix_bits_left = Signal.like(if_prefix_bits_left)
+            with m.Switch((finder_offset + i) % FINDERS_PER_CORE):
+                with m.Case(0):
+                    m.d.comb += state.eq(if_state)
+                    m.d.comb += prefix.eq(if_prefix)
+                    m.d.comb += prefix_bits_left.eq(if_prefix_bits_left)
+                with m.Case(1):
+                    m.d.comb += state.eq(nl_state)
+                    m.d.comb += prefix.eq(nl_prefix)
+                    m.d.comb += prefix_bits_left.eq(nl_prefix_bits_left)
+                with m.Case(2):
+                    m.d.comb += state.eq(vc_state)
+                    m.d.comb += prefix.eq(vc_prefix)
+                    m.d.comb += prefix_bits_left.eq(vc_prefix_bits_left)
+                with m.Case(3):
+                    m.d.comb += state.eq(wb_state)
+                    m.d.comb += prefix.eq(wb_prefix)
+                    m.d.comb += prefix_bits_left.eq(wb_prefix_bits_left)
+
+            m.d.comb += self.interfaces[i].wants_finder.eq(
+                (state == State.Receive) & (prefix_bits_left == prefix.shape().size)
+            )
+            m.d.comb += self.interfaces[i].active.eq(
+                (state != State.Clear) & (state != State.Receive)
+            )
+            m.d.comb += self.interfaces[i].base_decision.eq(prefix.base_decision)
+
+        m.d.comb += self.state.eq(wb_state)
+
+        return m
diff --git a/net_finder/core/main_pipeline.py b/net_finder/core/main_pipeline.py
index 3acb196..7a30bef 100644
--- a/net_finder/core/main_pipeline.py
+++ b/net_finder/core/main_pipeline.py
@@ -1,7 +1,7 @@
 import enum
 
 from amaranth import *
-from amaranth.lib import wiring
+from amaranth.lib import data, wiring
 from amaranth.lib.memory import ReadPort
 from amaranth.lib.wiring import In, Out
 from amaranth.utils import ceil_log2
@@ -10,12 +10,92 @@
 from net_finder.core.net import Net, shard_depth
 
 from .base_types import instruction_layout, net_size
-from .core import FINDERS_PER_CORE, run_stack_entry_layout
 from .memory import ChunkedMemory
 from .neighbour_lookup import neighbour_lookup_layout
 from .skip_checker import SkipChecker, undo_lookup_layout
 from .utils import pipe
 
+FINDERS_PER_CORE = 4
+
+
+def instruction_ref_layout(max_area: int):
+    """Builds the struct layout describing a reference to an instruction."""
+
+    # Wide enough to hold any index below `max_area`.
+    parent_width = ceil_log2(max_area)
+    fields = {
+        # Run stack index of the instruction's parent.
+        "parent": parent_width,
+        # Position of this instruction within its parent's list of valid children.
+        #
+        # An index past the end of that list denotes the last valid child; the last
+        # valid child is always stored as 0b11, so that backtracking can tell at once
+        # that it has to move on to the next instruction.
+        "child_index": 2,
+    }
+    return data.StructLayout(fields)
+
+
+def max_potential_len(max_area: int):
+    """
+    Returns an upper bound on the number of potential instructions that can exist
+    at any one time.
+    """
+
+    # Worst case: every square on the surfaces, apart from the ones set by the
+    # first instruction, has 4 potential instructions trying to set it (1 from
+    # each direction).
+    #
+    # That can't actually happen, but it makes for a nice clean upper bound.
+    #
+    # TODO: this can probably be tightened to 4 + 2 * (max_run_stack_len - 1):
+    # - The first instruction can produce at most 4 potential instructions.
+    # - Each instruction after that:
+    #   - Lowers the maximum by 1 (we'd pessimistically counted it as a potential
+    #     instruction, but it clearly isn't one now that it has been run).
+    #   - Raises the maximum by 3.
+    #   - So in total, it raises the maximum by 2.
+    other_squares = max_area - 1
+    return 4 * other_squares
+
+
+def max_decisions_len(max_area: int):
+    """Returns an upper bound on how many decisions can exist at any one time."""
+
+    # The first instruction always accounts for 1 decision. After that, the worst
+    # case is that every other square has 4 instructions setting it: 3 that we
+    # decided not to run, followed by 1 that we did.
+    #
+    # TODO: find a smaller upper bound:
+    #
+    # A list of decisions of maximal length should contain max_run_stack_len 1s.
+    # The first 1 produces at most 4 instructions and each of the rest produces at
+    # most 3, so the maximum number of decisions is 4 + 3 * (max_run_stack_len - 1).
+    first_instruction = 1
+    per_square = 4
+    remaining_squares = max_area - 1
+    return first_instruction + per_square * remaining_squares
+
+
+def run_stack_entry_layout(cuboids: int, max_area: int):
+    """Builds the struct layout of a single entry in the run stack."""
+
+    potential_len_width = ceil_log2(max_potential_len(max_area) + 1)
+    decision_index_width = ceil_log2(max_decisions_len(max_area))
+    fields = {
+        # The instruction that got run.
+        "instruction": instruction_layout(cuboids, max_area),
+        # Where in the run stack that instruction originally came from.
+        "source": instruction_ref_layout(max_area),
+        # One bit per direction: was the child in that direction valid when run?
+        "children": 4,
+        # How many potential instructions existed when the instruction was run.
+        "potential_len": potential_len_width,
+        # Where the decision to run the instruction sits in the list of decisions.
+        "decision_index": decision_index_width,
+    }
+    return data.StructLayout(fields)
+
 
 def child_index_to_direction(children: int, child_index: int) -> int | None:
     """
@@ -99,6 +179,15 @@ def __init__(self, cuboids: int, max_area: int):
                 "task": In(Task),
                 # The run stack entry we're operating on.
                 "entry": In(run_stack_entry_layout(cuboids, max_area)),
+                # Whether the instruction to advance/check is a child of `entry`, not
+                # `entry.instruction` itself.
+                #
+                # This should always be 0 when backtracking.
+                #
+                # This is almost always 1 when advancing/checking: the only exception is when
+                # running the first instruction, since it gets handed to us from outside and
+                # doesn't have a parent.
+                "child": In(1),
                 # The index of the child of `self.entry` we're operating on (if we're advancing or
                 # checking).
                 "child_index": In(2),
@@ -118,10 +207,14 @@ def __init__(self, cuboids: int, max_area: int):
                         shape=ul_layout.shape,
                     )
                 ).array(cuboids - 1),
+                # Signals coming from VC stage.
+                #
                 # The instruction the pipeline ended up operating on - so, the neighbour of
                 # `entry` when advancing/checking, and `entry.instruction` itself when
                 # backtracking.
                 "instruction": Out(instruction_layout(cuboids, max_area)),
+                # Signals coming from WB stage.
+                #
                 # Whether or not `instruction` was valid.
                 "instruction_valid": Out(1),
                 # Whether or not the neighbours of `instruction` in each direction were valid.
@@ -157,6 +250,7 @@ def elaborate(self, platform) -> Module:
         nl_start_mapping_index = self.start_mapping_index
         nl_task = self.task
         nl_entry = self.entry
+        nl_child = self.child
         nl_child_index = self.child_index
         nl_clear_index = self.clear_index
 
@@ -186,7 +280,7 @@ def elaborate(self, platform) -> Module:
                 wiring.flipped(self.neighbour_lookups[i]),
             )
         m.d.comb += neighbour_lookup.input.eq(nl_entry.instruction)
-        m.d.comb += neighbour_lookup.t_mode.eq(nl_task != Task.Backtrack)
+        m.d.comb += neighbour_lookup.t_mode.eq(nl_child)
         m.d.comb += neighbour_lookup.direction.eq(nl_child_direction)
 
         # Valid check (VC) stage
@@ -235,12 +329,12 @@ def elaborate(self, platform) -> Module:
         m.d.comb += skip_checker.fixed_family.eq(vc_fixed_family)
         m.d.comb += skip_checker.transform.eq(vc_transform)
 
+        m.d.comb += self.instruction.eq(vc_middle)
+
         # Write back (WB) stage
         #
         # This is the stage where we write back any changes that were made to the net
         # and surfaces.
-        #
-        # This occurs at the same time as the outer pipeline's IF stage.
 
         wb_finder = pipe(m, vc_finder)
         wb_task = pipe(m, vc_task)
diff --git a/net_finder/core/memory.py b/net_finder/core/memory.py
index 57f21eb..9fcadcb 100644
--- a/net_finder/core/memory.py
+++ b/net_finder/core/memory.py
@@ -1,3 +1,5 @@
+from itertools import chain
+
 from amaranth import *
 from amaranth.hdl import ShapeLike, ValueLike
 from amaranth.lib import wiring
@@ -56,12 +58,13 @@ def __init__(self, *, shape: ShapeLike, depth: int, chunks: int):
         self._depth = depth
         self._chunks = chunks
 
-        self._read_ports: list[PureInterface] = []
-        self._sdp_ports: list[tuple[PureInterface, PureInterface]] = []
+        self._read_ports: list[tuple[PureInterface, str]] = []
+        self._write_ports: list[PureInterface] = []
+        self._sdp_ports: list[tuple[tuple[PureInterface, str], PureInterface]] = []
 
         super().__init__({})
 
-    def read_port(self) -> PureInterface:
+    def read_port(self, domain="sync") -> PureInterface:
         # Return a disconnected interface, which we then add to an array and hook up
         # during `elaborate`.
         port = ChunkedReadPortSignature(
@@ -70,11 +73,24 @@ def read_port(self) -> PureInterface:
             shape=self._shape,
         ).create()
 
-        self._read_ports.append(port)
+        self._read_ports.append((port, domain))
 
         return port
 
-    def sdp_port(self) -> tuple[PureInterface, PureInterface]:
+    def write_port(self) -> PureInterface:
+        """Creates a write-only port; it gets wired to every chunk in `elaborate`."""
+        signature = ChunkedWritePortSignature(
+            chunk_width=ceil_log2(self._chunks),
+            addr_width=ceil_log2(self._depth),
+            shape=self._shape,
+        )
+
+        # The interface starts out disconnected: we only record it here.
+        interface = signature.create()
+        self._write_ports.append(interface)
+        return interface
+
+    def sdp_port(self, read_domain="sync") -> tuple[PureInterface, PureInterface]:
         # Return disconnected interfaces, which we then add to an array and hook up
         # during `elaborate`.
         read_port = ChunkedReadPortSignature(
@@ -88,7 +104,7 @@ def sdp_port(self) -> tuple[PureInterface, PureInterface]:
             shape=self._shape,
         ).create()
 
-        self._sdp_ports.append((read_port, write_port))
+        self._sdp_ports.append(((read_port, read_domain), write_port))
 
         return read_port, write_port
 
@@ -106,7 +122,7 @@ def elaborate(self, platform) -> Module:
 
             # Give the chunk a port corresponding to each of our outer ports, and hook up
             # their inputs.
-            for port_index, (read_port, write_port) in enumerate(self._sdp_ports):
+            for port_index, ((read_port, _), write_port) in enumerate(self._sdp_ports):
                 inner_read_port = chunk.read_port()
                 inner_write_port = chunk.write_port()
 
@@ -123,18 +139,25 @@ def elaborate(self, platform) -> Module:
 
                 inner_sdp_read_ports[port_index].append(inner_read_port)
 
-            for port_index, port in enumerate(self._read_ports):
-                inner_port = chunk.read_port()
+            for port_index, (port, domain) in enumerate(self._read_ports):
+                inner_port = chunk.read_port(domain=domain)
                 m.d.comb += inner_port.addr.eq(port.addr)
                 inner_read_ports[port_index].append(inner_port)
 
-        # Connect up the SDP read ports' outputs.
-        for (port, _), inner_ports in zip(self._sdp_ports, inner_sdp_read_ports):
-            m.d.comb += port.data.eq(Array(inner_ports)[port.chunk].data)
-
-        # Connect up the regular read ports' outputs.
-        for port, inner_ports in zip(self._read_ports, inner_read_ports):
-            m.d.comb += port.data.eq(Array(inner_ports)[port.chunk].data)
+            for port_index, port in enumerate(self._write_ports):
+                inner_port = chunk.write_port()
+                m.d.comb += inner_port.addr.eq(port.addr)
+                m.d.comb += inner_port.data.eq(port.data)
+                m.d.comb += inner_port.en.eq(port.en & (port.chunk == chunk_index))
+
+        # Connect up the read ports' outputs.
+        for (port, domain), inner_ports in zip(
+            chain(self._read_ports, (r for r, _ in self._sdp_ports)),
+            chain(inner_read_ports, inner_sdp_read_ports),
+        ):
+            chunk = Signal.like(port.chunk)
+            m.d[domain] += chunk.eq(port.chunk)
+            m.d.comb += port.data.eq(Array(inner_ports)[chunk].data)
 
         return m
 
diff --git a/net_finder/core/net.py b/net_finder/core/net.py
index c3d10c4..9486450 100644
--- a/net_finder/core/net.py
+++ b/net_finder/core/net.py
@@ -4,7 +4,7 @@
 from amaranth.lib.wiring import In, Out
 from amaranth.utils import ceil_log2
 
-from .base_types import PosLayout, PosView, net_size
+from .base_types import PosLayout, PosView, net_size, next_power_of_two
 from .memory import ChunkedMemory
 from .utils import pipe
 
@@ -13,7 +13,7 @@ def shard_depth(max_area: int):
     net_size_ = net_size(max_area)
     # We need to round one of the dimensions up to the next power of two in order
     # for concatenating the x and y coordinates to work properly.
-    return (net_size_ << ceil_log2(net_size_)) // 4
+    return net_size_ * next_power_of_two(net_size_) // 4
 
 
 def neighbour_shards(m: Module, pos: PosView):
diff --git a/net_finder/core/utils.py b/net_finder/core/utils.py
index 50c1a21..0c58ec8 100644
--- a/net_finder/core/utils.py
+++ b/net_finder/core/utils.py
@@ -2,10 +2,10 @@
 from amaranth.hdl import ValueLike
 
 
-def pipe(m: Module, input: ValueLike) -> Signal:
+def pipe(m: Module, input: ValueLike, **kwargs) -> Signal:
     # src_loc_at tells Signal how far up in the call chain to look for what to name
     # the signal: so, setting it to 1 means we want it to use the name of the
     # variable the caller's assigning our result to.
-    output = Signal.like(input, src_loc_at=1)
+    output = Signal.like(input, src_loc_at=1, **kwargs)
     m.d.sync += output.eq(input)
     return output
diff --git a/net_finder/soc/core.py b/net_finder/soc/core.py
index 5e966ed..45ad1ab 100644
--- a/net_finder/soc/core.py
+++ b/net_finder/soc/core.py
@@ -305,6 +305,7 @@ def __init__(self, cuboids: list[Cuboid], n: int):
         # Whether or not each core is splittable (is active and has a `base_decision` of
         # `splittable_base`).
         splittable = Cat(
+            # TODO: base_decision is garbage while sending, so this decision-making might be a bit off.
             cores_active[i] & (core.base_decision == splittable_base)
             for i, core in enumerate(cores)
         )
diff --git a/shell.nix b/shell.nix
index e2dbdb9..679c431 100644
--- a/shell.nix
+++ b/shell.nix
@@ -29,23 +29,17 @@ let
 in
 pkgs.mkShell {
   venvDir = ".venv";
-  packages =
-    [
-      pkgs.python311.pkgs.venvShellHook
+  packages = [
+    pkgs.python311.pkgs.venvShellHook
 
-      openocd
-      pkgs.yosys
+    openocd
+    pkgs.yosys
 
-      # Needed by Verilator simulations
-      pkgs.json_c
-      pkgs.libevent
-      pkgs.zlib
-    ]
-    ++ pkgs.lib.optionals pkgs.stdenv.isDarwin [
-      # Needed by Rust code
-      pkgs.libiconv
-      pkgs.darwin.apple_sdk.frameworks.CoreFoundation
-    ];
+    # Needed by Verilator simulations
+    pkgs.json_c
+    pkgs.libevent
+    pkgs.zlib
+  ];
 
   postVenvCreation = ''
     ${pkgs.uv}/bin/uv pip install -r requirements.txt
diff --git a/src/bin/dump_neighbours.rs b/src/bin/dump_neighbours.rs
index e95d3d1..6b5fbef 100644
--- a/src/bin/dump_neighbours.rs
+++ b/src/bin/dump_neighbours.rs
@@ -1,4 +1,5 @@
-//! Dumps the information required by `test_neighbour_lookup.py` as JSON to stdout.
+//! Dumps the information required by `test_neighbour_lookup.py` as JSON to
+//! stdout.
 
 use std::io;
 use std::time::Duration;
diff --git a/src/geometry.rs b/src/geometry.rs
index 1b7d525..9803fb4 100644
--- a/src/geometry.rs
+++ b/src/geometry.rs
@@ -470,6 +470,14 @@ impl Net<bool> {
         self.color_with_cache(cuboid, &square_cache)
     }
 
+    // TODO: make a variant of `color` that returns all the colorings, then filters
+    // them down to the ones that are actually different: that is, renumber all the
+    // faces in the order they occur so that mapping to different faces doesn't make
+    // a difference, and then return the ones that are distinct under that
+    // representation. I think that's a good definition of different foldings:
+    // if you fold along the same lines every time, you should get the same result,
+    // and same coloring = same lines.
+
     /// Return a version of this net with its squares 'colored' with which faces
     /// they're on.
     ///
@@ -1695,8 +1703,8 @@ impl Class {
             .unwrap()
     }
 
-    /// Returns the list of all the transformations you can perform to get from the
-    /// root of this class's family to this class.
+    /// Returns the list of all the transformations you can perform to get from
+    /// the root of this class's family to this class.
     pub fn alternate_transforms(
         self,
         cache: &SquareCache,
diff --git a/src/primary/mod.rs b/src/primary/mod.rs
index 14ec266..83aafda 100644
--- a/src/primary/mod.rs
+++ b/src/primary/mod.rs
@@ -229,7 +229,8 @@ impl<const CUBOIDS: usize> FinderCtx<CUBOIDS> {
         }
     }
 
-    /// Given a class on the fixed cuboid, returns whether it's in the fixed family.
+    /// Given a class on the fixed cuboid, returns whether it's in the fixed
+    /// family.
     pub fn fixed_family(&self, cursor: Cursor) -> bool {
         let index = cursor.0 as usize;
         (self.maybe_skipped_lookup[index >> 6] >> (index & 0x3f)) & 1 != 0
@@ -1111,7 +1112,10 @@ impl<const CUBOIDS: usize> FinderCtx<CUBOIDS> {
         // net but disagreeing on what each other should map to (which leads to a cut).
         for &instruction in completed.iter() {
             let to_check = [
-                instruction,
+                instruction, /* TODO: is this needed? I think it was at the time this was
+                              * written, since `net` hadn't been added yet, but `net` should now
+                              * guarantee that you can't have two instructions setting the same
+                              * net position. */
                 instruction.moved_in(self, Left),
                 instruction.moved_in(self, Up),
                 instruction.moved_in(self, Right),
@@ -1600,6 +1604,13 @@ pub fn drive<const CUBOIDS: usize, R: Runtime<CUBOIDS>>(
             continue;
         }
 
+        // TODO: this is starting to get suspicious. Counting flipped versions of nets
+        // as well isn't doubling the number of solutions; the flipped version of the
+        // net will always be another valid solution, so that should mean it's happening
+        // because the flipped version of the net is the same as the original net and
+        // still doesn't get counted separately, but that doesn't seem to be the case.
+        // So then why isn't it double? Are we missing solutions? I think we
+        // established that this was due to skipping? I don't 100% remember...
         count += 1;
         progress.suspend(|| {
             println!(