;; Here is a fun experiment. It demonstrates the effect of two strategies
;; for handling concurrent update in sbcl, mutexes and compare-and-swap,
;; plus the effect of mindlessly updating.
;; All experiments run a number of threads, each thread wants to
;; increment a variable a number of times, then decrement the
;; variable the same number of times. The variable is shared between
;; the threads. Since the threads doesnt do anything else, the
;; contention is high.
;; First the functions and macros that make up my test-harness.
;; Just to save some keystrokes:
(defun thread-name ()
(sb-thread:thread-name sb-thread:*current-thread*))
;; Traditional locking for print to avoid cut off lines etc
(defparameter *say-mutex* (sb-thread:make-mutex))
;; stdout is a closed over variable in the thread closures. It contains
;; *standard-output* from the repl.
(defmacro say (&rest forms)
"Needed to print from threads in slime"
`(sb-thread:with-mutex (*say-mutex*)
(funcall 'format stdout ,@forms)
(fresh-line stdout)
(finish-output stdout)))
;; Saving more keystrokes
(defmacro elapsed ()
`(- (get-internal-real-time) start))
;; Creates a thread function (closure) with the supplied body,
;; starts a number of threads and waits for them to complete.
(defmacro with-threads ((thread-count)
&body body)
(let ((threadno (gensym)))
`(mapcar #'sb-thread:join-thread
(loop for ,threadno below ,thread-count collect
(sb-thread:make-thread
(lambda ()
,@body)
:name (format nil "Thread-~d" ,threadno))))))
;; Binding some variables
(defmacro with-mumble (&body body)
`(let ((start (get-internal-real-time))
(stdout *standard-output*))
,@body))
;; The data we are going to work on
(defparameter *data* 0)
;; Reports the data
(defmacro conclude (text data)
`(say "~a rest-value: ~d (~a), operations: ~d time used: ~d ms"
,text
,data
(if (= 0 ,data) "OK" "WRONG")
(* 2 iteration-count thread-count)
(elapsed)))
;;The first test increments and decrements a lexical variable.
;; With only 100 iterations, it seems to work mostly ok
;;CL-USER> (t0 100 4)
;;Thread-0 done
;;Thread-1 done
;;Thread-2 done
;;Thread-3 done
;;Unprotected, lexical, rest-value: 0 (OK), operations: 800 time used: 2 ms
;; but with about 3000 iterations it fails sometimes, and with 10000 iterations
;; it fails each time
;;CL-USER> (t0 10000 4)
;;Thread-2 done
;;Thread-1 done
;;Thread-0 done
;;Thread-3 done
;;Unprotected, lexical, rest-value: -3334 (WRONG), operations: 80000 time used: 5 ms
;;Here is the test function
(defun t0 (iteration-count thread-count)
(with-mumble
(let ((data 0))
(with-threads (thread-count)
(loop for x below iteration-count do (incf data))
(loop for x below iteration-count do (decf data))
(say ";;~a done" (thread-name)))
(conclude ";;Unprotected, lexical," data))))
;; The next test works on a special variable, the result is
;; just like previous example
;; CL-USER> (t1 10000 6)
;;Thread-0 done
;;Thread-1 done
;;Thread-2 done
;;Thread-4 done
;;Thread-3 done
;;Thread-5 done
;;Unprotected, special, rest-value: 3880 (WRONG), operations: 120000 time used: 9 ms
;;Here is the test function
(defun t1 (iteration-count thread-count)
(with-mumble
(setf *data* 0)
(with-threads (thread-count)
(loop for x below iteration-count do (incf *data*))
(loop for x below iteration-count do (decf *data*))
(say ";;~a done" (thread-name)))
(conclude ";;Unprotected, special," *data*)))
;; Another one, incrementing with discrete instructions, don't know if these
;; are optimized to be like incf, anyway the result is the same
(defun t1% (iteration-count thread-count)
(with-mumble
(setf *data* 0)
(with-threads (thread-count)
(loop for x below iteration-count do (let ((new (1+ *data*))) (setf *data* new)))
(loop for x below iteration-count do (let ((new (1- *data*))) (setf *data* new)))
(say ";;~a done" (thread-name)))
(conclude ";;Unprotected, special," *data*)))
;;We have to protect the data from concurrent update from multiple threads, this
;;test uses traditional locks
;;Here we get the correct result each time.
;;CL-USER> (t2 10000 4)
;;Thread-0 done
;;Thread-2 done
;;Thread-1 done
;;Thread-3 done
;;Mutex, rest-value: 0 (OK), operations: 80000 time used: 91 ms
;;CL-USER> (t2 100000 12)
;;Thread-9 done
;;Thread-0 done
;;Thread-5 done
;;Thread-3 done
;;Thread-11 done
;;Thread-6 done
;;Thread-8 done
;;Thread-7 done
;;Thread-1 done
;;Thread-10 done
;;Thread-4 done
;;Thread-2 done
;;Mutex, rest-value: 0 (OK), operations: 2400000 time used: 5209 ms
;;the test-function with locking:
(defun t2 (iteration-count thread-count)
(with-mumble
(let ((mutex (sb-thread:make-mutex)))
(setf *data* 0)
(with-threads (thread-count)
(loop for x below iteration-count do (sb-thread:with-mutex (mutex) (incf *data*)))
(loop for x below iteration-count do (sb-thread:with-mutex (mutex) (decf *data*)))
(say ";;~a done" (thread-name)))
(conclude ";;Mutex," *data*))))
;; Now to the mysterious compare-and-swap. This function takes a place, a new value
;; and an old value. It tries to set the place to the new value in an atomic operation,
;; but only if the existing value of place is as specified in old. If the return value
;; is something different from what we specified as old value, the swap is not
;; performed. The reason for this would be that another thread has changed the value
;; in the meantime.
;; The operation does not use locks, but since we might not succeed, we have to retry
;; it until we do. This macro applies function to place using compare-and-swap,
;; retrying until it succeeds. Each time it does not succeed, it increments a
;; counter that has to be bound in advance.
(defmacro protect (place function)
`(let (old new)
(loop
do
(setf old ,place)
(setf new (funcall ,function old))
until (eql old (compare-and-swap ,place old new))
do (incf unsuccessful-cas))))
;; Here is the test, doing the same increment and decrement of the common
;; variable.
(defun t3 (iteration-count thread-count)
(with-mumble
(setf *data* 0)
(with-threads (thread-count)
(let ((unsuccessful-cas 0))
(loop for x below iteration-count do
(protect (symbol-value '*data*) #'1+))
(loop for x below iteration-count do
(protect (symbol-value '*data*) #'1-))
(say ";;~a unsuccessful-cas ~d" (thread-name) unsuccessful-cas)))
(conclude ";;CAS," *data*)))
;;The first run has only 1000 inc's and dec's, and all cas operations
;;succeeds
;;CL-USER> (t3 1000 4)
;;Thread-0 unsuccessful-cas 0
;;Thread-1 unsuccessful-cas 0
;;Thread-2 unsuccessful-cas 0
;;Thread-3 unsuccessful-cas 0
;;CAS, rest-value: 0 (OK), operations: 8000 time used: 3 ms
;; Next we increase the count to 10000. Note the high
;; number of retries:
;;CL-USER> (t3 10000 4)
;;Thread-0 unsuccessful-cas 5952
;;Thread-1 unsuccessful-cas 5937
;;Thread-2 unsuccessful-cas 15762
;;Thread-3 unsuccessful-cas 17472
;;CAS, rest-value: 0 (OK), operations: 80000 time used: 17 ms
;; Let's compare with mutexes, these are the same number of
;; iterations and threads as the largest mutex-example:
;;CL-USER> (t3 100000 12)
;;Thread-0 unsuccessful-cas 220289
;;Thread-3 unsuccessful-cas 213210
;;Thread-1 unsuccessful-cas 297819
;;Thread-6 unsuccessful-cas 243704
;;Thread-7 unsuccessful-cas 267340
;;Thread-2 unsuccessful-cas 312256
;;Thread-5 unsuccessful-cas 236505
;;Thread-11 unsuccessful-cas 201204
;;Thread-10 unsuccessful-cas 234383
;;Thread-4 unsuccessful-cas 342688
;;Thread-8 unsuccessful-cas 320228
;;Thread-9 unsuccessful-cas 251330
;;CAS, rest-value: 0 (OK), operations: 2400000 time used: 952 ms
;;As you see, the number of retries is about the same as iterations
;; in total, still the time used is quite lower than the mutex
;;example, 952 ms in stead of 5209 with mutex.
;; insurance against printing the big array
(setf *print-length* 40)
;; Here I use a big array of conses to operate on. The idea is
;; that there should be less unsuccessful cas's when the
;; updating is spread around. I can not use just an array of
;; values, because CAS can not operate on arrays. See
;; (describe 'compare-and-swap)
;; There are still a lot of unsuccessful cas, I don't understand
;; that.
;; In these tests, the CAS function runs about 8 times faster
;; than the mutex function
(defun t4 (iteration-count thread-count)
(let ((array (make-array 1000000)))
(loop for index below (length array) do (setf (aref array index) (cons 0 nil)))
(with-mumble
(let ()
(with-threads (thread-count)
(let ((unsuccessful-cas 0)
(*random-state* (make-random-state t)))
(loop for x below iteration-count do
(let ((cons (aref array (random (length array)))))
(protect (car cons) #'1+)))
(loop for x below iteration-count do
(let ((cons (aref array (random (length array)))))
(protect (car cons) #'1-)))
(say ";;~a unsuccessful-cas ~d" (thread-name) unsuccessful-cas)))
(let ((rest (loop for x across array sum (car x))))
(say ";;Array CAS rest-value: ~d (~a), operations: ~d time used: ~d ms"
rest
(if (= 0 rest) "OK" "WRONG")
(* 2 iteration-count thread-count)
(elapsed)
))))))
;;CL-USER> (t4 100000 12)
;;Thread-10 unsuccessful-cas 3638
;;Thread-4 unsuccessful-cas 5831
;;Thread-5 unsuccessful-cas 13015
;;Thread-0 unsuccessful-cas 6305
;;Thread-11 unsuccessful-cas 5650
;;Thread-1 unsuccessful-cas 15040
;;Thread-9 unsuccessful-cas 3079
;;Thread-2 unsuccessful-cas 20865
;;Thread-3 unsuccessful-cas 14628
;;Thread-6 unsuccessful-cas 15771
;;Thread-8 unsuccessful-cas 28106
;;Thread-7 unsuccessful-cas 14053
;;Array CAS rest-value: 0 (OK), operations: 2400000 time used: 866 ms
(defun t5 (iteration-count thread-count)
(let ((array (make-array 1000000)))
(loop for index below (length array) do (setf (aref array index) (cons 0 nil)))
(with-mumble
(let ((mutex (sb-thread:make-mutex)))
(with-threads (thread-count)
(let ((*random-state* (make-random-state t)))
(loop for x below iteration-count do
(let ((cons (aref array (random (length array)))))
(sb-thread:with-mutex (mutex) (incf (car cons)))))
(loop for x below iteration-count do
(let ((cons (aref array (random (length array)))))
(sb-thread:with-mutex (mutex) (decf (car cons)))))
(say ";;~a finished" (thread-name))))
(let ((rest (loop for x across array sum (car x))))
(say ";;Array mutex rest-value: ~d (~a), operations: ~d time used: ~d ms"
rest
(if (= 0 rest) "OK" "WRONG")
(* 2 iteration-count thread-count)
(elapsed)
))))))
;;CL-USER> (t5 100000 12)
;;Thread-2 finished
;;Thread-6 finished
;;Thread-8 finished
;;Thread-1 finished
;;Thread-7 finished
;;Thread-0 finished
;;Thread-3 finished
;;Thread-4 finished
;;Thread-10 finished
;;Thread-5 finished
;;Thread-9 finished
;;Thread-11 finished
;;Array mutex rest-value: 0 (OK), operations: 2400000 time used: 6853 ms
;; The hardware used in the test are a dual processor Xeon
;; with hyperthreads from a few years back.
;;CL-USER> (time (t4 100000 12))
;;Thread-1 unsuccessful-cas 30462
;;Thread-0 unsuccessful-cas 29751
;;Thread-2 unsuccessful-cas 16
;;Thread-7 unsuccessful-cas 2
;;Thread-3 unsuccessful-cas 23
;;Thread-4 unsuccessful-cas 3
;;Thread-6 unsuccessful-cas 20613
;;Thread-8 unsuccessful-cas 0
;;Thread-9 unsuccessful-cas 1
;;Thread-5 unsuccessful-cas 20965
;;Thread-10 unsuccessful-cas 2
;;Thread-11 unsuccessful-cas 0
;;Array CAS rest-value: 0 (OK), operations: 2400000 time used: 607 ms
;; Evaluation took:
;; 0.631 seconds of real time
;; 1.964122 seconds of total run time (1.948121 user, 0.016001 system)
;; 311.25% CPU
;; 1,511,523,688 processor cycles
;; 12,171,832 bytes consed
;;cat /proc/cpuinfo lists 4 cpus
;;model name : Intel(R) Xeon(TM) CPU 2.40GHz
;;cpu MHz : 2396.048
;;cache size : 512 KB