Split tasks in load balance #83

Merged
merged 8 commits into master from send-loop-task-first on Dec 30, 2019

Conversation

@mratsim mratsim (Owner) commented Dec 30, 2019

So the fact that loadBalance didn't split tasks was the main reason behind #35, whoops. This fixes #35 and addresses the main reason behind aprell/tasking-2.0#3.
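
For context, here is a minimal sketch of the pattern this fix targets, modeled on the reduce examples further down (the proc name and workload are hypothetical; parallelFor, reduce, loadBalance and init/exit are the Weave API already used in this PR). The long-running fold body calls loadBalance(Weave), which with this change can also split the worker's remaining iteration range and hand part of it to a thief, instead of only forwarding steal requests:

import weave

proc sumSquares(n: int): int64 =
  # Hypothetical workload; the point is the loadBalance(Weave) call inside `fold`,
  # which now also splits the pending iterations when steal requests are waiting.
  var waitableSum: Flowvar[int64]
  parallelFor i in 0 ..< n:
    reduce(waitableSum):
      prologue:
        var localSum = 0'i64
      fold:
        localSum += int64(i) * int64(i)
        loadBalance(Weave)  # answer steal requests, possibly splitting this loop
      merge(remoteSum):
        localSum += sync(remoteSum)
      return localSum
  result = sync(waitableSum)

init(Weave)
echo "Sum of squares 0 ..< 1_000_000: ", sumSquares(1_000_000)
exit(Weave)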

Histogram pathological bench

After the fix we are consistently faster than OpenMP on the histogram bench:

$  ./build/weave_histo_split 
--------------------------------------------------------------------------
Scheduler:                                    Sequential
Benchmark:                                    Histogram 2D 
Threads:                                      1
Matrix:                                       25000 x 25000
Histogram bins:                               1000
--------------------------------------------------------------------------
Time(ms):                                     1459.663
Max RSS (KB):                                 2443432
Runtime RSS (KB):                             0
# of page faults:                             0
Max (from histogram):                         0.00800323486328125
--------------------------------------------------------------------------
Scheduler:                                    Weave - Parallel Reduce (eager flowvars)
Benchmark:                                    Histogram 2D 
Threads:                                      36
Matrix:                                       25000 x 25000
Histogram bins:                               1000
--------------------------------------------------------------------------
Time(ms):                                     98.73099999999999
Max RSS (KB):                                 2449192
Runtime RSS (KB):                             5320
# of page faults:                             1828
Max (from histogram):                         0.00800323486328125
--------------------------------------------------------------------------
Scheduler:                                    Weave - Parallel For Staged (eager flowvars)
Benchmark:                                    Histogram 2D 
Threads:                                      36
Matrix:                                       25000 x 25000
Histogram bins:                               1000
--------------------------------------------------------------------------
Time(ms):                                     83.36499999999999
Max RSS (KB):                                 2450200
Runtime RSS (KB):                             1008
# of page faults:                             925
Max (from histogram):                         0.00800323486328125
$  ./build/histo_omp
time serial: 1.955220
max  serial: 0.008003
time openmp: 0.112712
max  openmp: 0.008003
diff 0

So we can remove it from the pathological cases and complete @HadrienG2's challenge in HadrienG2/parallel-histograms-bench#2 (comment).
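
For reference, a hedged sketch (not the benchmark's actual code) of the staged-reduction pattern behind the "Parallel For Staged" numbers above: each worker accumulates into a private set of bins during the loop stage and merges them into the shared histogram under a lock in the epilogue. The proc name, binning rule and the assumed [0, 1) value range are illustrative; parallelForStaged, loadBalance and sync are the same Weave API used below.

import std/locks
import weave

proc histogramStaged(data: ptr UncheckedArray[float32]; len, bins: int;
                     hist: ptr UncheckedArray[int]; histLock: ptr Lock) =
  # Each task keeps thread-local bins and merges them once at the end,
  # so the shared lock is taken once per task, not once per element.
  parallelForStaged i in 0 ..< len:
    captures: {data, bins, hist, histLock}
    prologue:
      var localHist = newSeq[int](bins)
    loop:
      # Illustrative binning rule, assuming values in [0, 1).
      let b = min(bins - 1, int(data[i] * float32(bins)))
      localHist[b] += 1
      loadBalance(Weave)  # let idle workers split and steal the remaining range
    epilogue:
      histLock[].acquire()
      for k in 0 ..< bins:
        hist[k] += localHist[k]
      histLock[].release()
  sync(Weave)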

Log-Sum-Exp pathological bench

LogSumExp is now also faster than the sequential baseline:

$  ./build/weave_lse_split
Note that a text vocabulary is often in the 50000-15000 words

Sanity check, logSumExp(1..<10) should be 9.4585514 (numpy logsumexp): 9.458551406860352


--------------------------------------------------------------------------
Scheduler:                                    Sequential
Benchmark:                                    Log-Sum-Exp (Machine Learning) 
Threads:                                      1
datasetSize:                                  20000
batchSize:                                    256
# of full batches:                            78
# of image labels:                            1000
Text vocabulary size:                         10000
--------------------------------------------------------------------------
Dataset:                                      256x1000
Time(ms):                                     79.05
Max RSS (KB):                                 80760
Runtime RSS (KB):                             0
# of page faults:                             0
Logsumexp:                                    1013.554382324219
--------------------------------------------------------------------------
Dataset:                                      256x10000
Time(ms):                                     784.914
Max RSS (KB):                                 861992
Runtime RSS (KB):                             0
# of page faults:                             0
Logsumexp:                                    1193.10498046875
--------------------------------------------------------------------------
Scheduler:                                    Weave Reduce (eager flowvars)
Benchmark:                                    Log-Sum-Exp (Machine Learning) 
Threads:                                      36
datasetSize:                                  20000
batchSize:                                    256
# of full batches:                            78
# of image labels:                            1000
Text vocabulary size:                         10000
--------------------------------------------------------------------------
Dataset:                                      256x1000
Time(ms):                                     23.368
Max RSS (KB):                                 954580
Runtime RSS (KB):                             13588
# of page faults:                             3541
Logsumexp:                                    1012.461975097656
--------------------------------------------------------------------------
Dataset:                                      256x10000
Time(ms):                                     211.193
Max RSS (KB):                                 1743676
Runtime RSS (KB):                             7656
# of page faults:                             1950
Logsumexp:                                    1193.15625
--------------------------------------------------------------------------
Scheduler:                                    Weave (Collapsed) (eager flowvars)
Benchmark:                                    Log-Sum-Exp (Machine Learning) 
Threads:                                      36
datasetSize:                                  20000
batchSize:                                    256
# of full batches:                            78
# of image labels:                            1000
Text vocabulary size:                         10000
--------------------------------------------------------------------------
Dataset:                                      256x1000
Time(ms):                                     40.658
Max RSS (KB):                                 1876008
Runtime RSS (KB):                             54188
# of page faults:                             13646
Logsumexp:                                    1009.968688964844
--------------------------------------------------------------------------
Dataset:                                      256x10000
Time(ms):                                     143.206
Max RSS (KB):                                 2685268
Runtime RSS (KB):                             28108
# of page faults:                             16843
Logsumexp:                                    1190.671264648438
--------------------------------------------------------------------------
Scheduler:                                    Weave (Staged) (eager flowvars)
Benchmark:                                    Log-Sum-Exp (Machine Learning) 
Threads:                                      36
datasetSize:                                  20000
batchSize:                                    256
# of full batches:                            78
# of image labels:                            1000
Text vocabulary size:                         10000
--------------------------------------------------------------------------
Dataset:                                      256x1000
Time(ms):                                     32.684
Max RSS (KB):                                 2763816
Runtime RSS (KB):                             784
# of page faults:                             240
Logsumexp:                                    1013.553344726562
--------------------------------------------------------------------------
Dataset:                                      256x10000
Time(ms):                                     320.218
Max RSS (KB):                                 3546832
Runtime RSS (KB):                             1576
# of page faults:                             462
Logsumexp:                                    1193.156127929688

proc logsumexpWeaveReduce[T: SomeFloat](M: Matrix[T]): T =
  let alpha = M.maxWeaveReduce()
  var lse: Flowvar[T]
  parallelFor i in 0 ..< M.nrows:
    captures: {alpha, M}
    reduce(lse):
      prologue:
        var localLSE = 0.T
      fold:
        for j in 0 ..< M.ncols:
          localLSE += exp(M[i, j] - alpha)
          loadBalance(Weave)
      merge(remoteLSE):
        localLSE += sync(remoteLSE)
      return localLSE
  result = alpha + ln(sync(lse))

proc logsumexpWeaveCollapsed[T: SomeFloat](M: Matrix[T]): T =
  let alpha = M.maxWeaveCollapsed()
  var lse: Flowvar[T]
  parallelFor ij in 0 ..< M.nrows * M.ncols:
    captures: {alpha, M}
    reduce(lse):
      prologue:
        var localLSE = 0.T
      fold:
        localLSE += exp(M.buffer[ij] - alpha)
      merge(remoteLSE):
        localLSE += sync(remoteLSE)
      return localLSE
  result = alpha + ln(sync(lse))

proc logsumexpWeaveStaged[T: SomeFloat](M: Matrix[T]): T =
  let alpha = M.maxWeaveStaged()
  var lse = T(0)
  let lseAddr = lse.addr
  # Atomic increment of a float is usually done with a Compare-And-Swap loop.
  # Due to lazy splitting, load distribution is unbalanced between threads, so in
  # general they shouldn't finish at the same time and lock contention should be low.
  var lock: Lock
  lock.initLock()
  let lockAddr = lock.addr
  parallelForStaged i in 0 ..< M.nrows:
    captures: {lseAddr, lockAddr, alpha, M}
    prologue:
      var localLSE = 0.T
    loop:
      for j in 0 ..< M.ncols:
        localLSE += exp(M[i, j] - alpha)
        loadBalance(Weave)
    epilogue:
      lockAddr[].acquire()
      lseAddr[] += localLSE
      lockAddr[].release()
  sync(Weave)
  result = alpha + ln(lse)
  lock.deinitLock()

GEneralized Matrix Multiplication (GEMM BLAS)

It doesn't help reach Laser or MKL speed on GEMM (perf unchanged), though it makes targeting a random victim instead of the last victim more competitive (reaching over 2.3 TFlops).
Combining that with WV_MaxConcurrentStealPerWorker=2 in LastVictim mode triggers abysmal performance of 0.5 TFlops, so it might make sense to switch the default back from LastVictim (introduced in #74) to Random. This would also regain the ~10% perf lost on tasks with a duration under roughly 20 µs.

Matrix transposition

Matrix transposition uses nested parallel for loops, so the nesting already gave the scheduler opportunities for full load balancing; we are on par with OpenMP collapse:
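
For reference, a minimal sketch of the nested-parallel-loop transposition pattern referred to here, assuming row-major buffers passed as raw pointers (the proc name is hypothetical; the nested parallelFor with captures mirrors the Weave code above):

import weave

proc transposeNested(M, N: int; src, dst: ptr UncheckedArray[float32]) =
  # Transposes a row-major MxN buffer `src` into an NxM buffer `dst`.
  # Both loop levels are parallel, so load balancing can split work at either level.
  parallelFor j in 0 ..< N:
    captures: {M, N, src, dst}
    parallelFor i in 0 ..< M:
      captures: {j, M, N, src, dst}
      dst[j*M + i] = src[i*N + j]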

Inverting the transpose order may favor one transposition heavily for non-tiled strategies
--------------------------------------------------------------------------
Scheduler:                                    OpenMP
Benchmark:                                    Transpose - TiledCollapsed
Threads:                                      1
# of rounds:                                  1000
# of operations:                              1600000
# of bytes:                                   6400000
Arithmetic Intensity:                         0.25
--------------------------------------------------------------------------
Transposition:                                400x4000 --> 4000x400
Time(ms):                                     65.831
Max RSS (KB):                                 88264
Runtime RSS (KB):                             0
# of page faults:                             1807
Perf (GMEMOPs/s ~ GigaMemory Operations/s)    24.305
--------------------------------------------------------------------------
Transposition:                                4000x400 --> 400x4000
Time(ms):                                     43.52
Max RSS (KB):                                 88264
Runtime RSS (KB):                             0
# of page faults:                             0
Perf (GMEMOPs/s ~ GigaMemory Operations/s)    36.765
Inverting the transpose order may favor one transposition heavily for non-tiled strategies
--------------------------------------------------------------------------
Scheduler:                                    Weave  (eager flowvars)
Benchmark:                                    Transpose - TiledNested
Threads:                                      36
# of rounds:                                  1000
# of operations:                              1600000
# of bytes:                                   6400000
Arithmetic Intensity:                         0.25
--------------------------------------------------------------------------
Transposition:                                400x4000 --> 4000x400
Time(ms):                                     59.042
Max RSS (KB):                                 109516
Runtime RSS (KB):                             0
# of page faults:                             2724
Perf (GMEMOPs/s ~ GigaMemory Operations/s)    27.099
--------------------------------------------------------------------------
Transposition:                                4000x400 --> 400x4000
Time(ms):                                     46.186
Max RSS (KB):                                 109516
Runtime RSS (KB):                             0
# of page faults:                             96
Perf (GMEMOPs/s ~ GigaMemory Operations/s)    34.643

Meaning

I think this means that Weave no longer has a parallelism domain where it is weak; it is actually as fast/efficient as the top industry contenders in each category:

  • Overhead: Cilk
  • Load Balancing: Cilk
  • Conditional Parallelism: Cilk
  • Task Parallelism: Cilk, TBB
  • Data Parallelism: OpenMP
  • Nested Data Parallelism: No contest (?)

@mratsim mratsim changed the title from "Send loop task first" to "Split tasks in load balance" on Dec 30, 2019
@mratsim mratsim (Owner, Author) commented Dec 30, 2019

After going back to 2 parallelized loops and random victim targeting, Weave can now reach 2.53 TFlops!

Backend:                        Weave (Pure Nim)
Type:                           float32
A matrix shape:                 (M: 1920, N: 1920)
B matrix shape:                 (M: 1920, N: 1920)
Output shape:                   (M: 1920, N: 1920)
Required number of operations: 14155.776 millions
Required bytes:                   29.491 MB
Arithmetic intensity:            480.000 FLOP/byte
Theoretical peak single-core:    224.000 GFLOP/s
Theoretical peak multi:         4032.000 GFLOP/s

Weave implementation
Collected 300 samples in 2021 ms
Average time: 5.590 ms
Stddev  time: 1.074 ms
Min     time: 5.000 ms
Max     time: 21.000 ms
Perf:         2532.339 GFLOP/s

@mratsim mratsim merged commit 059d452 into master Dec 30, 2019
@mratsim mratsim deleted the send-loop-task-first branch January 1, 2020 23:56
Successfully merging this pull request may close these issues: Parallel reduction: performance or termination detection issue.