From: Johan Ur Riise
Subject: Mutex, Compare and swap
Date: 
Message-ID: <87prgyapvw.fsf@morr.riise-data.net>
;; Here is a fun experiment. It demonstrates the effect of two strategies
;; for handling concurrent update in sbcl, mutexes and compare-and-swap,
;; plus the effect of mindlessly updating.

;; All experiments run a number of threads, each thread wants to 
;; increment a variable a number of times, then decrement the
;; variable the same number of times. The variable is shared between 
;; the threads. Since the threads doesnt do anything else, the 
;; contention is high.

;; First the functions and macros that make up my test-harness.

;; Just to save some keystrokes:
(defun thread-name ()
  (sb-thread:thread-name sb-thread:*current-thread*))

;; Traditional locking for print to avoid cut off lines etc

(defparameter *say-mutex* (sb-thread:make-mutex))

;; stdout is a closed over variable in the thread closures. It contains 
;; *standard-output* from the repl. 
(defmacro say (&rest forms)
  "Needed to print from threads in slime"
  `(sb-thread:with-mutex (*say-mutex*) 
      (funcall 'format stdout ,@forms)
      (fresh-line stdout)
      (finish-output stdout)))

;; Saving more keystrokes
(defmacro elapsed ()
  `(- (get-internal-real-time) start))

;; Creates a thread function (closure) with the supplied body,
;; starts a number of threads and waits for them to complete.

(defmacro with-threads ((thread-count)
                        &body body)
  (let ((threadno (gensym)))
    `(mapcar #'sb-thread:join-thread
             (loop for ,threadno below ,thread-count collect 
                  (sb-thread:make-thread 
                   (lambda ()
                     ,@body)
                   :name (format nil "Thread-~d" ,threadno))))))

;; Binding some variables
(defmacro with-mumble (&body body)
  `(let ((start (get-internal-real-time))
         (stdout *standard-output*))
     ,@body))

;; The data we are going to work on
(defparameter *data* 0)

;; Reports the data
(defmacro conclude (text data)
  `(say "~a rest-value: ~d (~a), operations: ~d time used: ~d ms" 
        ,text
        ,data
        (if (= 0 ,data) "OK" "WRONG")
        (* 2 iteration-count thread-count)
        (elapsed)))

;;The first test increments and decrements a lexical variable.
;; With only 100 iterations, it seems to work mostly ok
;;CL-USER> (t0 100 4)
;;Thread-0 done
;;Thread-1 done
;;Thread-2 done
;;Thread-3 done
;;Unprotected, lexical, rest-value: 0 (OK), operations: 800 time used: 2 ms

;; but with about 3000 iterations it fails sometimes, and with 10000 iterations
;; it fails each time

;;CL-USER> (t0 10000 4)
;;Thread-2 done
;;Thread-1 done
;;Thread-0 done
;;Thread-3 done
;;Unprotected, lexical, rest-value: -3334 (WRONG), operations: 80000 time used: 5 ms
;;Here is the test function
(defun t0 (iteration-count thread-count)
  (with-mumble
    (let ((data 0))
      (with-threads (thread-count)
        (loop for x below iteration-count do (incf data))
        (loop for x below iteration-count do (decf data))
        (say ";;~a done" (thread-name)))
      (conclude ";;Unprotected, lexical," data))))

;; The next test works on a special variable, the result is 
;; just like previous example
;; CL-USER> (t1 10000 6)
;;Thread-0 done
;;Thread-1 done
;;Thread-2 done
;;Thread-4 done
;;Thread-3 done
;;Thread-5 done
;;Unprotected, special, rest-value: 3880 (WRONG), operations: 120000 time used: 9 ms
;;Here is the test function
(defun t1 (iteration-count thread-count)
  (with-mumble
    (setf *data* 0)
    (with-threads (thread-count)
      (loop for x below iteration-count do (incf *data*))
      (loop for x below iteration-count do (decf *data*))
      (say ";;~a done" (thread-name)))
    (conclude ";;Unprotected, special," *data*)))

;; Another one, incrementing with discrete instructions, don't know if these
;; are optimized to be like incf, anyway the result is the same
(defun t1% (iteration-count thread-count)
  (with-mumble
    (setf *data* 0)
    (with-threads (thread-count)
      (loop for x below iteration-count do (let ((new (1+ *data*))) (setf *data* new)))
      (loop for x below iteration-count do (let ((new (1- *data*))) (setf *data* new)))
      (say ";;~a done" (thread-name)))
    (conclude ";;Unprotected, special," *data*)))

;;We have to protect the data from concurrent update from multiple threads, this
;;test uses traditional locks
;;Here we get the correct result each time.
;;CL-USER> (t2 10000 4)
;;Thread-0 done
;;Thread-2 done
;;Thread-1 done
;;Thread-3 done
;;Mutex, rest-value: 0 (OK), operations: 80000 time used: 91 ms

;;CL-USER> (t2 100000 12)
;;Thread-9 done
;;Thread-0 done
;;Thread-5 done
;;Thread-3 done
;;Thread-11 done
;;Thread-6 done
;;Thread-8 done
;;Thread-7 done
;;Thread-1 done
;;Thread-10 done
;;Thread-4 done
;;Thread-2 done
;;Mutex, rest-value: 0 (OK), operations: 2400000 time used: 5209 ms

;;the test-function with locking:
(defun t2 (iteration-count thread-count)
  (with-mumble
    (let ((mutex (sb-thread:make-mutex)))
      (setf *data* 0)
      (with-threads (thread-count)
        (loop for x below iteration-count do (sb-thread:with-mutex (mutex) (incf *data*)))
        (loop for x below iteration-count do (sb-thread:with-mutex (mutex) (decf *data*)))
        (say ";;~a done" (thread-name)))
      (conclude ";;Mutex," *data*))))


;; Now to the mysterious compare-and-swap. This function takes a place, a new value
;; and an old value. It tries to set the place to the new value in an atomic operation,
;; but only if the existing value of place is as specified in old. If the return value
;; is something different from what we specified as old value, the swap is not 
;; performed. The reason for this would be that another thread has changed the value
;; in the meantime.

;; The operation does not use locks, but since we might not succeed, we have to retry
;; it until we do. This macro applies function to place using compare-and-swap,
;; retrying until it succeeds. Each time it does not succeed, it increments a 
;; counter that has to be bound in advance. 

(defmacro protect (place function) 
  `(let (old new)
     (loop
        do
          (setf old ,place) 
          (setf new (funcall ,function old))
        until (eql old (compare-and-swap ,place old new))
        do (incf unsuccessful-cas))))

;; Here is the test, doing the same increment and decrement of the common 
;; variable.

(defun t3 (iteration-count thread-count)
  (with-mumble
    (setf *data* 0)
    (with-threads (thread-count)
      (let ((unsuccessful-cas 0))
        (loop for x below iteration-count do 
             (protect (symbol-value '*data*) #'1+))
        (loop for x below iteration-count do 
             (protect (symbol-value '*data*) #'1-))
        (say ";;~a unsuccessful-cas ~d" (thread-name) unsuccessful-cas)))
    (conclude ";;CAS," *data*)))

;;The first run has only 1000 inc's and dec's, and all cas operations 
;;succeeds
;;CL-USER> (t3 1000 4)
;;Thread-0 unsuccessful-cas 0
;;Thread-1 unsuccessful-cas 0
;;Thread-2 unsuccessful-cas 0
;;Thread-3 unsuccessful-cas 0
;;CAS, rest-value: 0 (OK), operations: 8000 time used: 3 ms

;; Next we increase the count to 10000. Note the high
;; number of retries:
;;CL-USER> (t3 10000 4)
;;Thread-0 unsuccessful-cas 5952
;;Thread-1 unsuccessful-cas 5937
;;Thread-2 unsuccessful-cas 15762
;;Thread-3 unsuccessful-cas 17472
;;CAS, rest-value: 0 (OK), operations: 80000 time used: 17 ms

;; Let's compare with mutexes, these are the same number of
;; iterations and threads as the largest mutex-example:
;;CL-USER> (t3 100000 12)
;;Thread-0 unsuccessful-cas 220289
;;Thread-3 unsuccessful-cas 213210
;;Thread-1 unsuccessful-cas 297819
;;Thread-6 unsuccessful-cas 243704
;;Thread-7 unsuccessful-cas 267340
;;Thread-2 unsuccessful-cas 312256
;;Thread-5 unsuccessful-cas 236505
;;Thread-11 unsuccessful-cas 201204
;;Thread-10 unsuccessful-cas 234383
;;Thread-4 unsuccessful-cas 342688
;;Thread-8 unsuccessful-cas 320228
;;Thread-9 unsuccessful-cas 251330
;;CAS, rest-value: 0 (OK), operations: 2400000 time used: 952 ms

;;As you see, the number of retries is about the same as iterations
;; in total, still the time used is quite lower than the mutex
;;example, 952 ms in stead of 5209 with mutex.


;; insurance against printing the big array
(setf *print-length* 40)

;; Here I use a big array of conses to operate on. The idea is 
;; that there should be less unsuccessful cas's when the
;; updating is spread around. I can not use just an array of 
;; values, because CAS can not operate on arrays. See 
;; (describe 'compare-and-swap)

;; There are still a lot of unsuccessful cas, I don't understand
;; that.

;; In these tests, the CAS function runs about 8 times faster
;; than the mutex function

(defun t4 (iteration-count thread-count)
  (let ((array (make-array 1000000)))
    (loop for index below (length array) do (setf (aref array index) (cons 0 nil)))
    (with-mumble
      (let ()
        (with-threads (thread-count)
          (let ((unsuccessful-cas 0)
                (*random-state* (make-random-state t)))
            (loop for x below iteration-count do 
                 (let ((cons (aref array (random (length array)))))
                   (protect (car cons) #'1+)))
            (loop for x below iteration-count do 
                 (let ((cons (aref array (random (length array)))))
                   (protect (car cons) #'1-)))
            (say ";;~a unsuccessful-cas ~d" (thread-name) unsuccessful-cas)))
        (let ((rest (loop for x across array sum (car x))))
          (say ";;Array CAS rest-value: ~d (~a), operations: ~d time used: ~d ms"
               rest
               (if (= 0 rest) "OK" "WRONG")
               (* 2 iteration-count thread-count)
               (elapsed)
               ))))))
;;CL-USER> (t4 100000 12)
;;Thread-10 unsuccessful-cas 3638
;;Thread-4 unsuccessful-cas 5831
;;Thread-5 unsuccessful-cas 13015
;;Thread-0 unsuccessful-cas 6305
;;Thread-11 unsuccessful-cas 5650
;;Thread-1 unsuccessful-cas 15040
;;Thread-9 unsuccessful-cas 3079
;;Thread-2 unsuccessful-cas 20865
;;Thread-3 unsuccessful-cas 14628
;;Thread-6 unsuccessful-cas 15771
;;Thread-8 unsuccessful-cas 28106
;;Thread-7 unsuccessful-cas 14053
;;Array CAS rest-value: 0 (OK), operations: 2400000 time used: 866 ms

(defun t5 (iteration-count thread-count)
  (let ((array (make-array 1000000)))
    (loop for index below (length array) do (setf (aref array index) (cons 0 nil)))  
    (with-mumble
      (let ((mutex (sb-thread:make-mutex)))
        (with-threads (thread-count)
          (let ((*random-state* (make-random-state t)))
            (loop for x below iteration-count do 
                 (let ((cons (aref array (random (length array)))))
                   (sb-thread:with-mutex (mutex) (incf (car cons)))))
            (loop for x below iteration-count do 
                 (let ((cons (aref array (random (length array)))))
                   (sb-thread:with-mutex (mutex) (decf (car cons)))))
            (say ";;~a finished" (thread-name))))
        (let ((rest (loop for x across array sum (car x))))
          (say ";;Array mutex rest-value: ~d (~a), operations: ~d time used: ~d ms"
               rest
               (if (= 0 rest) "OK" "WRONG")
               (* 2 iteration-count thread-count)
               (elapsed)
               ))))))

;;CL-USER> (t5 100000 12)
;;Thread-2 finished
;;Thread-6 finished
;;Thread-8 finished
;;Thread-1 finished
;;Thread-7 finished
;;Thread-0 finished
;;Thread-3 finished
;;Thread-4 finished
;;Thread-10 finished
;;Thread-5 finished
;;Thread-9 finished
;;Thread-11 finished
;;Array mutex rest-value: 0 (OK), operations: 2400000 time used: 6853 ms

;; The hardware used in the test are a dual processor Xeon
;; with hyperthreads from a few years back.

;;CL-USER> (time (t4 100000 12))
;;Thread-1 unsuccessful-cas 30462
;;Thread-0 unsuccessful-cas 29751
;;Thread-2 unsuccessful-cas 16
;;Thread-7 unsuccessful-cas 2
;;Thread-3 unsuccessful-cas 23
;;Thread-4 unsuccessful-cas 3
;;Thread-6 unsuccessful-cas 20613
;;Thread-8 unsuccessful-cas 0
;;Thread-9 unsuccessful-cas 1
;;Thread-5 unsuccessful-cas 20965
;;Thread-10 unsuccessful-cas 2
;;Thread-11 unsuccessful-cas 0
;;Array CAS rest-value: 0 (OK), operations: 2400000 time used: 607 ms
;; Evaluation took:
;;   0.631 seconds of real time
;;   1.964122 seconds of total run time (1.948121 user, 0.016001 system)
;;   311.25% CPU
;;   1,511,523,688 processor cycles
;;   12,171,832 bytes consed

;;cat /proc/cpuinfo lists 4 cpus

;;model name      : Intel(R) Xeon(TM) CPU 2.40GHz
;;cpu MHz         : 2396.048
;;cache size      : 512 KB