
MPMC 2-stack queue #112 (Draft)

polytypic wants to merge 3 commits into main from the mpmc-two-stack-queue branch.
Conversation

polytypic (Contributor) commented Dec 17, 2023

This PR implements an MPMC queue built from two stacks. It is a new lock-free queue algorithm / data structure: one stack serves as the tail of the queue and the other as the head. Operations on the tail (pushes) and on the head (pops) work as in a Treiber stack. A simple lock-free algorithm transfers elements from the tail stack to the head stack when a pop is attempted on an empty head and the tail is non-empty.
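
To make the structure concrete, here is a minimal sketch, assuming OCaml 5's Stdlib.Atomic. This is not the code in this PR: it only illustrates the Treiber-style CAS-retry shape of the per-end operations and why reversing the tail stack onto the head stack preserves FIFO order. The lock-free transfer between the two stacks (with counters, a move marker, and Backoff) is the new algorithm the PR introduces and is not reproduced here.

(* Treiber-style stack: push and pop are plain CAS-retry loops on an
   immutable list held in an atomic.  Both ends of the two-stack queue
   are accessed in this manner. *)
module Treiber_stack = struct
  type 'a t = 'a list Atomic.t

  let create () : 'a t = Atomic.make []

  let rec push (s : 'a t) x =
    let before = Atomic.get s in
    if not (Atomic.compare_and_set s before (x :: before)) then push s x

  let rec pop_opt (s : 'a t) =
    match Atomic.get s with
    | [] -> None
    | (x :: rest) as before ->
        if Atomic.compare_and_set s before rest then Some x else pop_opt s
end

(* Why the transfer reverses: elements pushed onto the tail stack sit
   newest-first, so reversing them onto the head stack makes the oldest
   element the next one to be popped, i.e. FIFO order. *)
let () =
  let tail = [ 3; 2; 1 ] (* 1 was pushed first *) in
  assert (List.rev tail = [ 1; 2; 3 ]) (* head pops 1, then 2, then 3 *)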

The interesting feature of this queue is that it seems to outperform an optimized Michael-Scott queue (see #122) on many machines. Here are results from a benchmark run on my M3 Max:

➜  saturn git:(mpmc-two-stack-queue) ✗ dune exec --release -- ./bench/main.exe -budget 1 'Two' | jq '[.results.[].metrics.[] | select(.name | test("over")) | {name, value}]'
[                                      
  {
    "name": "messages over time/one domain",
    "value": 57.04506560182545
  },
  {
    "name": "messages over time/1 nb adder, 1 nb taker",
    "value": 141.66814237648308
  },
  {
    "name": "messages over time/1 nb adder, 2 nb takers",
    "value": 118.94340356581118
  },
  {
    "name": "messages over time/2 nb adders, 1 nb taker",
    "value": 108.01087885571833
  },
  {
    "name": "messages over time/2 nb adders, 2 nb takers",
    "value": 103.93427612115202
  }
]

As one can see, the (median) throughput is substantially higher than that of the optimized Michael-Scott queue (see #122), and that seems to be the case on most of the machines I've tested this on. Interestingly, however, on the "fermat" machine we use for benchmarking, the Michael-Scott queue seems to perform better. See my comment below for a possible explanation.

polytypic force-pushed the mpmc-two-stack-queue branch 9 times, most recently from fe48302 to cf2beeb on December 20, 2023
polytypic force-pushed the mpmc-two-stack-queue branch 3 times, most recently from fa07ee9 to d090cb7 on January 11, 2024
polytypic force-pushed the mpmc-two-stack-queue branch 18 times, most recently from e99cf2a to e96063f on January 17, 2024
polytypic (Contributor, Author) commented:

Out of curiosity, I tried to test the theory of whether one could improve the performance of this queue on the Opteron by writing to the cache line (non-atomically) before reading from it, to avoid the cache line being put into the shared/owned state (assuming that is the problem). I made the following changes to the code:

modified   src_lockfree/two_stack_queue.ml
@@ -12,7 +12,17 @@
    OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
    PERFORMANCE OF THIS SOFTWARE. *)
 
-module Atomic = Transparent_atomic
+module Atomic = struct
+  include Transparent_atomic
+
+  let[@inline] get_for_set (x : _ t) =
+    Array.unsafe_set (Obj.magic x : int array) 6 0; (* NOTE: A padded atomic! *)
+    get x
+
+  let[@inline] fenceless_get_for_set (x : _ t) =
+    Array.unsafe_set (Obj.magic x : int array) 6 0; (* NOTE: A padded atomic! *)
+    fenceless_get x
+end
 
 type 'a t = { head : 'a head Atomic.t; tail : 'a tail Atomic.t }
 
@@ -67,7 +77,7 @@ let rec push t value backoff = function
       if move != Obj.magic () then begin
         let (Snoc move_r) = move in
         begin
-          match Atomic.get t.head with
+          match Atomic.get_for_set t.head with
           | H (Head head_r as head) when head_r.counter < move_r.counter ->
               let after = rev move in
               if
@@ -88,9 +98,10 @@ and push_with t value backoff counter prefix =
   if new_tail != prefix then push t value backoff new_tail
   else if not (Atomic.compare_and_set t.tail prefix (T after)) then
     let backoff = Backoff.once backoff in
-    push t value backoff (Atomic.fenceless_get t.tail)
+    push t value backoff (Atomic.fenceless_get_for_set t.tail)
 
-let push t value = push t value Backoff.default (Atomic.fenceless_get t.tail)
+let push t value =
+  push t value Backoff.default (Atomic.fenceless_get_for_set t.tail)
 
 exception Empty
 
@@ -103,7 +114,7 @@ let rec pop_as : type a r. a t -> _ -> (a, r) poly -> a head -> r =
         match poly with Value -> cons_r.value | Option -> Some cons_r.value
       else
         let backoff = Backoff.once backoff in
-        pop_as t backoff poly (Atomic.fenceless_get t.head)
+        pop_as t backoff poly (Atomic.fenceless_get_for_set t.head)
   | H (Head head_r as head) -> begin
       match Atomic.fenceless_get t.tail with
       | T (Snoc snoc_r as move) ->
@@ -112,14 +123,14 @@ let rec pop_as : type a r. a t -> _ -> (a, r) poly -> a head -> r =
               match poly with
               | Value -> snoc_r.value
               | Option -> Some snoc_r.value
-            else pop_as t backoff poly (Atomic.fenceless_get t.head)
+            else pop_as t backoff poly (Atomic.fenceless_get_for_set t.head)
           else
             let tail = Tail { counter = snoc_r.counter; move } in
             let new_head = Atomic.get t.head in
             if new_head != H head then pop_as t backoff poly new_head
             else if Atomic.compare_and_set t.tail (T move) (T tail) then
               pop_moving_as t backoff poly head move tail
-            else pop_as t backoff poly (Atomic.fenceless_get t.head)
+            else pop_as t backoff poly (Atomic.fenceless_get_for_set t.head)
       | T (Tail tail_r as tail) ->
           let move = tail_r.move in
           if move == Obj.magic () then pop_emptyish_as t backoff poly head
@@ -148,7 +159,7 @@ and pop_moving_as :
         end
         else
           let backoff = Backoff.once backoff in
-          pop_as t backoff poly (Atomic.fenceless_get t.head)
+          pop_as t backoff poly (Atomic.fenceless_get_for_set t.head)
   else pop_emptyish_as t backoff poly head
 
 and pop_emptyish_as : type a r. a t -> _ -> (a, r) poly -> (a, _) tdt -> r =
@@ -158,8 +169,10 @@ and pop_emptyish_as : type a r. a t -> _ -> (a, r) poly -> (a, _) tdt -> r =
     match poly with Value -> raise_notrace Empty | Option -> None
   else pop_as t backoff poly new_head
 
-let pop t = pop_as t Backoff.default Value (Atomic.fenceless_get t.head)
-let pop_opt t = pop_as t Backoff.default Option (Atomic.fenceless_get t.head)
+let pop t = pop_as t Backoff.default Value (Atomic.fenceless_get_for_set t.head)
+
+let pop_opt t =
+  pop_as t Backoff.default Option (Atomic.fenceless_get_for_set t.head)

The results are not entirely conclusive:

[Screenshot of the benchmark dashboard, 2024-02-09 16:19]

The 1st result from the right is from after dropping the experiment, and the 6th result from the right is the first result with some of the extra writes. Using eyeball statistics, it would seem there is potentially some improvement. Of course, performing an actual write to memory is quite different from prefetching a cache line in anticipation of a write.

polytypic force-pushed the mpmc-two-stack-queue branch 2 times, most recently from 2655ec6 to 2578a23 on February 16, 2024
polytypic force-pushed the mpmc-two-stack-queue branch 4 times, most recently from c461de3 to 2a7e58d on February 17, 2024
polytypic force-pushed the mpmc-two-stack-queue branch 4 times, most recently from f7a1c73 to 224fcfe on March 1, 2024
polytypic force-pushed the mpmc-two-stack-queue branch 3 times, most recently from cf16d30 to fa499cd on April 2, 2024
edwintorok commented:

> The fermat machine has an Opteron (Piledriver) CPU from 2013, which probably isn't the best target machine for optimizing multicore algorithms.

I agree, that CPU is not supported by its vendor anymore. See https://www.amd.com/en/support/download/drivers.html and https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/log/amd-ucode/README?showmsg=1, and notice the lack of microcode updates since 2018: https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/log/amd-ucode/microcode_amd_fam15h.bin.asc
That doesn't mean such machines are not useful (I still have some tests that I run on Opterons), but they shouldn't be the main optimization target.

I'd suggest using at least a Zen 1 CPU for AMD, and Skylake for Intel.
Here are some results from an AMD Ryzen 9 7950X using OCaml 5.2.0-rc+fp on Fedora 39/40 (using -diff base.json, where base.json was obtained on the latest master):

Saturn_lockfree Queue:                
  time per message/one domain:
    110.33 ns = 1.00 x 110.30 ns
  messages over time/one domain:
    9.06 M/s = 1.00 x 9.07 M/s
  time per message/1 nb adder, 1 nb taker:
    321.40 ns = 1.00 x 322.87 ns
  messages over time/1 nb adder, 1 nb taker:
    6.22 M/s = 1.00 x 6.19 M/s
  time per message/1 nb adder, 2 nb takers:
    364.39 ns = 1.14 x 319.16 ns
  messages over time/1 nb adder, 2 nb takers:
    8.23 M/s = 0.88 x 9.40 M/s
  time per message/2 nb adders, 1 nb taker:
    241.21 ns = 0.78 x 308.96 ns
  messages over time/2 nb adders, 1 nb taker:
    12.44 M/s = 1.28 x 9.71 M/s
  time per message/2 nb adders, 2 nb takers:
    372.38 ns = 1.01 x 368.26 ns
  messages over time/2 nb adders, 2 nb takers:
    10.74 M/s = 0.99 x 10.86 M/s

The short comparison here doesn't show confidence intervals, but rerunning it shows similar variation on master:

Saturn_lockfree Queue:                
  time per message/one domain:
    110.83 ns = 1.00 x 110.30 ns
  messages over time/one domain:
    9.02 M/s = 1.00 x 9.07 M/s
  time per message/1 nb adder, 1 nb taker:
    317.72 ns = 0.98 x 322.87 ns
  messages over time/1 nb adder, 1 nb taker:
    6.29 M/s = 1.02 x 6.19 M/s
  time per message/1 nb adder, 2 nb takers:
    346.13 ns = 1.08 x 319.16 ns
  messages over time/1 nb adder, 2 nb takers:
    8.67 M/s = 0.92 x 9.40 M/s
  time per message/2 nb adders, 1 nb taker:
    261.80 ns = 0.85 x 308.96 ns
  messages over time/2 nb adders, 1 nb taker:
    11.46 M/s = 1.18 x 9.71 M/s
  time per message/2 nb adders, 2 nb takers:
    340.20 ns = 0.92 x 368.26 ns
  messages over time/2 nb adders, 2 nb takers:
    11.76 M/s = 1.08 x 10.86 M/s

polytypic (Contributor, Author) commented May 10, 2024

Note that Saturn_lockfree Queue refers to the Michael-Scott based queue. The code in this branch adds a benchmark for Saturn_lockfree Two_stack_queue. If you run dune exec --release -- ./bench/main.exe -budget=1 -brief Queue Two, you can see results for both queues in this branch. The M-S queue in this branch is the same as in main and suffers from a few known performance issues. There is another PR, #122, that optimizes the M-S queue.

> doesn't show confidence intervals

Yes, the "statistics" in multicore-bench are very rudimentary. The idea has been to move that to the benchmarking service/frontend and have the benchmarking DB just store the raw results. That way it should be possible to view and analyze the data in multiple ways. I'm not sure when we might get around to doing that, however.

polytypic force-pushed the mpmc-two-stack-queue branch 4 times, most recently from 35fe5bc to 79880f8 on August 12, 2024