diff --git a/axon/gosl.go b/axon/gosl.go index d154060a8..8d9def8c2 100644 --- a/axon/gosl.go +++ b/axon/gosl.go @@ -61,24 +61,24 @@ func GPUInit() { { sy := gpu.NewComputeSystem(gp, "Default") GPUSystem = sy - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/BetweenGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PoolGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleInc.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/LayerGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/CyclePost.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSubMeanPath.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PoolGi.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/BetweenGi.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/CyclePost.wgsl", sy) vars := sy.Vars() { sgp := vars.AddGroup(gpu.Storage) @@ -146,274 +146,172 @@ func GPURelease() { ComputeGPU.Release() } -// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// Can call multiple Run* kernels in a row, which are then all launched -// in the same command submission on the GPU, which is by far the most efficient. -// MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a -// single run-and-sync case. -func RunDWtSubMeanPath(n int) { - if UseGPU { - RunDWtSubMeanPathGPU(n) - } else { - RunDWtSubMeanPathCPU(n) - } -} - -// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. -func RunDWtSubMeanPathGPU(n int) { - sy := GPUSystem - pl := sy.ComputePipelines["DWtSubMeanPath"] - ce, _ := sy.BeginComputePass() - pl.Dispatch1D(ce, n, 64) -} - -// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. 
-func RunDWtSubMeanPathCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtSubMeanPath(uint32(i)) - } -} - -// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// This version then calls RunDone with the given variables to sync -// after the Run, for a single-shot Run-and-Done call. If multiple kernels -// can be run in sequence, it is much more efficient to do multiple Run* -// calls followed by a RunDone call. -func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { - if UseGPU { - RunDWtSubMeanPathGPU(n) - RunDone(syncVars...) - } else { - RunDWtSubMeanPathCPU(n) - } -} -// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// Can call multiple Run* kernels in a row, which are then all launched -// in the same command submission on the GPU, which is by far the most efficient. -// MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a -// single run-and-sync case. -func RunGatherSpikes(n int) { - if UseGPU { - RunGatherSpikesGPU(n) - } else { - RunGatherSpikesCPU(n) - } -} - -// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. See [RunGatherSpikes] for more info. -func RunGatherSpikesGPU(n int) { - sy := GPUSystem - pl := sy.ComputePipelines["GatherSpikes"] - ce, _ := sy.BeginComputePass() - pl.Dispatch1D(ce, n, 64) -} - -// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. -func RunGatherSpikesCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - GatherSpikes(uint32(i)) - } -} - -// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// This version then calls RunDone with the given variables to sync -// after the Run, for a single-shot Run-and-Done call. If multiple kernels -// can be run in sequence, it is much more efficient to do multiple Run* -// calls followed by a RunDone call. -func RunOneGatherSpikes(n int, syncVars ...GPUVars) { - if UseGPU { - RunGatherSpikesGPU(n) - RunDone(syncVars...) - } else { - RunGatherSpikesCPU(n) - } -} -// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunPoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a +// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a // single run-and-sync case. -func RunCycleNeuron(n int) { +func RunPoolGi(n int) { if UseGPU { - RunCycleNeuronGPU(n) + RunPoolGiGPU(n) } else { - RunCycleNeuronCPU(n) + RunPoolGiCPU(n) } } -// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. -func RunCycleNeuronGPU(n int) { +// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. 
+func RunPoolGiGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CycleNeuron"] + pl := sy.ComputePipelines["PoolGi"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. -func RunCycleNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CycleNeuron(uint32(i)) - } +// RunPoolGiCPU runs the PoolGi kernel on the CPU. +func RunPoolGiCPU(n int) { + gpu.VectorizeFunc(0, n, PoolGi) } -// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunOnePoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCycleNeuron(n int, syncVars ...GPUVars) { +func RunOnePoolGi(n int, syncVars ...GPUVars) { if UseGPU { - RunCycleNeuronGPU(n) + RunPoolGiGPU(n) RunDone(syncVars...) } else { - RunCycleNeuronCPU(n) + RunPoolGiCPU(n) } } -// RunSendSpike runs the SendSpike kernel with given number of elements, +// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a // single run-and-sync case. -func RunSendSpike(n int) { +func RunMinusPhaseNeuron(n int) { if UseGPU { - RunSendSpikeGPU(n) + RunMinusPhaseNeuronGPU(n) } else { - RunSendSpikeCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. -func RunSendSpikeGPU(n int) { +// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. +func RunMinusPhaseNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["SendSpike"] + pl := sy.ComputePipelines["MinusPhaseNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunSendSpikeCPU runs the SendSpike kernel on the CPU. -func RunSendSpikeCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - SendSpike(uint32(i)) - } +// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. +func RunMinusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhaseNeuron) } -// RunOneSendSpike runs the SendSpike kernel with given number of elements, +// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneSendSpike(n int, syncVars ...GPUVars) { +func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunSendSpikeGPU(n) + RunMinusPhaseNeuronGPU(n) RunDone(syncVars...) 
} else { - RunSendSpikeCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a // single run-and-sync case. -func RunPlusPhasePool(n int) { +func RunPlusPhaseStartNeuron(n int) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunPlusPhaseStartNeuronGPU(n) } else { - RunPlusPhasePoolCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. -func RunPlusPhasePoolGPU(n int) { +// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. +func RunPlusPhaseStartNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhasePool"] + pl := sy.ComputePipelines["PlusPhaseStartNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. -func RunPlusPhasePoolCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhasePool(uint32(i)) - } +// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. +func RunPlusPhaseStartNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseStartNeuron) } -// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunPlusPhaseStartNeuronGPU(n) RunDone(syncVars...) } else { - RunPlusPhasePoolCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a +// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a // single run-and-sync case. -func RunDWtFromDiSyn(n int) { +func RunDWtSubMeanPath(n int) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } -// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. 
-func RunDWtFromDiSynGPU(n int) { +// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. +func RunDWtSubMeanPathGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtFromDiSyn"] + pl := sy.ComputePipelines["DWtSubMeanPath"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. -func RunDWtFromDiSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtFromDiSyn(uint32(i)) - } +// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. +func RunDWtSubMeanPathCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSubMeanPath) } -// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { +func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) RunDone(syncVars...) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } // RunWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, @@ -441,10 +339,7 @@ func RunWtFromDWtSynGPU(n int) { // RunWtFromDWtSynCPU runs the WtFromDWtSyn kernel on the CPU. func RunWtFromDWtSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - WtFromDWtSyn(uint32(i)) - } + gpu.VectorizeFunc(0, n, WtFromDWtSyn) } // RunOneWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, @@ -486,10 +381,7 @@ func RunBetweenGiGPU(n int) { // RunBetweenGiCPU runs the BetweenGi kernel on the CPU. func RunBetweenGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - BetweenGi(uint32(i)) - } + gpu.VectorizeFunc(0, n, BetweenGi) } // RunOneBetweenGi runs the BetweenGi kernel with given number of elements, @@ -506,229 +398,214 @@ func RunOneBetweenGi(n int, syncVars ...GPUVars) { RunBetweenGiCPU(n) } } -// RunPoolGi runs the PoolGi kernel with given number of elements, +// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a +// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a // single run-and-sync case. -func RunPoolGi(n int) { +func RunCycleNeuron(n int) { if UseGPU { - RunPoolGiGPU(n) + RunCycleNeuronGPU(n) } else { - RunPoolGiCPU(n) + RunCycleNeuronCPU(n) } } -// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. -func RunPoolGiGPU(n int) { +// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. +func RunCycleNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PoolGi"] + pl := sy.ComputePipelines["CycleNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPoolGiCPU runs the PoolGi kernel on the CPU. 
-func RunPoolGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PoolGi(uint32(i)) - } +// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. +func RunCycleNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, CycleNeuron) } -// RunOnePoolGi runs the PoolGi kernel with given number of elements, +// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePoolGi(n int, syncVars ...GPUVars) { +func RunOneCycleNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunPoolGiGPU(n) + RunCycleNeuronGPU(n) RunDone(syncVars...) } else { - RunPoolGiCPU(n) + RunCycleNeuronCPU(n) } } -// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a // single run-and-sync case. -func RunApplyExtsNeuron(n int) { +func RunCyclePost(n int) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunCyclePostGPU(n) } else { - RunApplyExtsNeuronCPU(n) + RunCyclePostCPU(n) } } -// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. See [RunApplyExtsNeuron] for more info. -func RunApplyExtsNeuronGPU(n int) { +// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. +func RunCyclePostGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["ApplyExtsNeuron"] + pl := sy.ComputePipelines["CyclePost"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. -func RunApplyExtsNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - ApplyExtsNeuron(uint32(i)) - } +// RunCyclePostCPU runs the CyclePost kernel on the CPU. +func RunCyclePostCPU(n int) { + gpu.VectorizeFunc(0, n, CyclePost) } -// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunOneCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { +func RunOneCyclePost(n int, syncVars ...GPUVars) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunCyclePostGPU(n) RunDone(syncVars...) } else { - RunApplyExtsNeuronCPU(n) + RunCyclePostCPU(n) } } -// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. 
// Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseStartNeuron(n int) { +func RunMinusPhasePool(n int) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunMinusPhasePoolGPU(n) } else { - RunPlusPhaseStartNeuronCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. -func RunPlusPhaseStartNeuronGPU(n int) { +// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. +func RunMinusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseStartNeuron"] + pl := sy.ComputePipelines["MinusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. -func RunPlusPhaseStartNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhaseStartNeuron(uint32(i)) - } +// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. +func RunMinusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhasePool) } -// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { +func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunMinusPhasePoolGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseStartNeuronCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseNeuron(n int) { +func RunGatherSpikes(n int) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunGatherSpikesGPU(n) } else { - RunPlusPhaseNeuronCPU(n) + RunGatherSpikesCPU(n) } } -// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. -func RunPlusPhaseNeuronGPU(n int) { +// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. See [RunGatherSpikes] for more info. 
+func RunGatherSpikesGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseNeuron"] + pl := sy.ComputePipelines["GatherSpikes"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. -func RunPlusPhaseNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhaseNeuron(uint32(i)) - } +// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. +func RunGatherSpikesCPU(n int) { + gpu.VectorizeFunc(0, n, GatherSpikes) } -// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOneGatherSpikes(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunGatherSpikesGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseNeuronCPU(n) + RunGatherSpikesCPU(n) } } -// RunDWtSyn runs the DWtSyn kernel with given number of elements, +// RunSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a +// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a // single run-and-sync case. -func RunDWtSyn(n int) { +func RunSendSpike(n int) { if UseGPU { - RunDWtSynGPU(n) + RunSendSpikeGPU(n) } else { - RunDWtSynCPU(n) + RunSendSpikeCPU(n) } } -// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. -func RunDWtSynGPU(n int) { +// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. +func RunSendSpikeGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtSyn"] + pl := sy.ComputePipelines["SendSpike"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtSynCPU runs the DWtSyn kernel on the CPU. -func RunDWtSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtSyn(uint32(i)) - } +// RunSendSpikeCPU runs the SendSpike kernel on the CPU. +func RunSendSpikeCPU(n int) { + gpu.VectorizeFunc(0, n, SendSpike) } -// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, +// RunOneSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtSyn(n int, syncVars ...GPUVars) { +func RunOneSendSpike(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtSynGPU(n) + RunSendSpikeGPU(n) RunDone(syncVars...) 
} else { - RunDWtSynCPU(n) + RunSendSpikeCPU(n) } } // RunCycleInc runs the CycleInc kernel with given number of elements, @@ -756,10 +633,7 @@ func RunCycleIncGPU(n int) { // RunCycleIncCPU runs the CycleInc kernel on the CPU. func RunCycleIncCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CycleInc(uint32(i)) - } + gpu.VectorizeFunc(0, n, CycleInc) } // RunOneCycleInc runs the CycleInc kernel with given number of elements, @@ -776,49 +650,130 @@ func RunOneCycleInc(n int, syncVars ...GPUVars) { RunCycleIncCPU(n) } } -// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a // single run-and-sync case. -func RunMinusPhasePool(n int) { +func RunPlusPhasePool(n int) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunPlusPhasePoolGPU(n) } else { - RunMinusPhasePoolCPU(n) + RunPlusPhasePoolCPU(n) } } -// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. -func RunMinusPhasePoolGPU(n int) { +// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. +func RunPlusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhasePool"] + pl := sy.ComputePipelines["PlusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. -func RunMinusPhasePoolCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - MinusPhasePool(uint32(i)) +// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. +func RunPlusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhasePool) +} + +// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { + if UseGPU { + RunPlusPhasePoolGPU(n) + RunDone(syncVars...) + } else { + RunPlusPhasePoolCPU(n) + } +} +// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a +// single run-and-sync case. +func RunPlusPhaseNeuron(n int) { + if UseGPU { + RunPlusPhaseNeuronGPU(n) + } else { + RunPlusPhaseNeuronCPU(n) } } -// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. 
+func RunPlusPhaseNeuronGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["PlusPhaseNeuron"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. +func RunPlusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseNeuron) +} + +// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunPlusPhaseNeuronGPU(n) RunDone(syncVars...) } else { - RunMinusPhasePoolCPU(n) + RunPlusPhaseNeuronCPU(n) + } +} +// RunDWtSyn runs the DWtSyn kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a +// single run-and-sync case. +func RunDWtSyn(n int) { + if UseGPU { + RunDWtSynGPU(n) + } else { + RunDWtSynCPU(n) + } +} + +// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. +func RunDWtSynGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["DWtSyn"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunDWtSynCPU runs the DWtSyn kernel on the CPU. +func RunDWtSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSyn) +} + +// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOneDWtSyn(n int, syncVars ...GPUVars) { + if UseGPU { + RunDWtSynGPU(n) + RunDone(syncVars...) + } else { + RunDWtSynCPU(n) } } // RunLayerGi runs the LayerGi kernel with given number of elements, @@ -846,10 +801,7 @@ func RunLayerGiGPU(n int) { // RunLayerGiCPU runs the LayerGi kernel on the CPU. func RunLayerGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - LayerGi(uint32(i)) - } + gpu.VectorizeFunc(0, n, LayerGi) } // RunOneLayerGi runs the LayerGi kernel with given number of elements, @@ -866,94 +818,88 @@ func RunOneLayerGi(n int, syncVars ...GPUVars) { RunLayerGiCPU(n) } } -// RunCyclePost runs the CyclePost kernel with given number of elements, +// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. 
-// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a +// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a // single run-and-sync case. -func RunCyclePost(n int) { +func RunApplyExtsNeuron(n int) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. -func RunCyclePostGPU(n int) { +// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. See [RunApplyExtsNeuron] for more info. +func RunApplyExtsNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CyclePost"] + pl := sy.ComputePipelines["ApplyExtsNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCyclePostCPU runs the CyclePost kernel on the CPU. -func RunCyclePostCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CyclePost(uint32(i)) - } +// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. +func RunApplyExtsNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, ApplyExtsNeuron) } -// RunOneCyclePost runs the CyclePost kernel with given number of elements, +// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCyclePost(n int, syncVars ...GPUVars) { +func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) RunDone(syncVars...) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a // single run-and-sync case. -func RunMinusPhaseNeuron(n int) { +func RunDWtFromDiSyn(n int) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunDWtFromDiSynGPU(n) } else { - RunMinusPhaseNeuronCPU(n) + RunDWtFromDiSynCPU(n) } } -// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. -func RunMinusPhaseNeuronGPU(n int) { +// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. +func RunDWtFromDiSynGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhaseNeuron"] + pl := sy.ComputePipelines["DWtFromDiSyn"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. -func RunMinusPhaseNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - MinusPhaseNeuron(uint32(i)) - } +// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. 
+func RunDWtFromDiSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtFromDiSyn) } -// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunDWtFromDiSynGPU(n) RunDone(syncVars...) } else { - RunMinusPhaseNeuronCPU(n) + RunDWtFromDiSynCPU(n) } } // RunDone must be called after Run* calls to start compute kernels. diff --git a/axon/shaders/LayerGi.wgsl b/axon/shaders/LayerGi.wgsl index 4f7470073..83b3f2ca6 100644 --- a/axon/shaders/LayerGi.wgsl +++ b/axon/shaders/LayerGi.wgsl @@ -1234,16 +1234,19 @@ fn PoolNNeurons(pi: u32) -> i32 { } fn PoolAvgMaxCalcVar(vr: AvgMaxVars, pi: u32,di: u32) { var floatFromInt = f32(1.0) / f32(u32(1)<<20); - var vim = AvgMaxIntVarIndex(vr, Max); - var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; + var vis = AvgMaxIntVarIndex(vr, Avg); + var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))]; if (sum < 0) { sum = i32(u32(1) << 20); } Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Avg)),u32(pi),u32(di))] = f32(sum) * floatFromInt; - var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(AvgMaxIntVarIndex(vr, Max)),u32(pi),u32(di))]; + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))] = 0; + var vim = AvgMaxIntVarIndex(vr, Max); + var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; if (mx < 0) { mx = i32(u32(1) << 20); } + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))] = 0; Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Max)),u32(pi),u32(di))] = f32(mx) * floatFromInt; } fn PoolAvgMaxCalc(pi: u32,di: u32) { diff --git a/axon/shaders/PoolGi.wgsl b/axon/shaders/PoolGi.wgsl index b5fd7f8fd..58849710b 100644 --- a/axon/shaders/PoolGi.wgsl +++ b/axon/shaders/PoolGi.wgsl @@ -1240,16 +1240,19 @@ fn PoolNNeurons(pi: u32) -> i32 { } fn PoolAvgMaxCalcVar(vr: AvgMaxVars, pi: u32,di: u32) { var floatFromInt = f32(1.0) / f32(u32(1)<<20); - var vim = AvgMaxIntVarIndex(vr, Max); - var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; + var vis = AvgMaxIntVarIndex(vr, Avg); + var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))]; if (sum < 0) { sum = i32(u32(1) << 20); } Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Avg)),u32(pi),u32(di))] = f32(sum) * floatFromInt; - var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(AvgMaxIntVarIndex(vr, Max)),u32(pi),u32(di))]; + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))] = 0; + var vim = AvgMaxIntVarIndex(vr, Max); + var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; if (mx < 0) { mx = i32(u32(1) << 20); } + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], 
PoolsInt[2], u32(vim),u32(pi),u32(di))] = 0; Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Max)),u32(pi),u32(di))] = f32(mx) * floatFromInt; } fn PoolAvgMaxCalc(pi: u32,di: u32) { diff --git a/axon/simstats.go b/axon/simstats.go new file mode 100644 index 000000000..e109e6847 --- /dev/null +++ b/axon/simstats.go @@ -0,0 +1,123 @@ +// Code generated by "goal build"; DO NOT EDIT. +//line simstats.goal:1 +// Copyright (c) 2024, The Emergent Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package axon + +import ( + "time" + + "cogentcore.org/core/base/timer" + "cogentcore.org/core/enums" + "cogentcore.org/core/plot" + "cogentcore.org/core/tensor/datafs" + "cogentcore.org/core/tensor/stats/stats" +) + +// StatPerTrialMSec returns a Stats function that reports the number of milliseconds +// per trial, for the given times and training mode enum values. +// The times should start at the Trial and go up from there: data will +// be recorded from the second time level. The statName is the name of another +// stat that is used to get the number of trials. +func StatPerTrialMSec(statDir *datafs.Data, statName string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + var epcTimer timer.Time + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() <= times[0].Int64() { + return + } + name := "PerTrialMSec" + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + continue + case 1: + epcTimer.Stop() + subd := modeDir.RecycleDir(times[0].String()) + trls := subd.Value(statName) // must be a stat + epcTimer.N = trls.Len() + pertrl := float64(epcTimer.Avg()) / float64(time.Millisecond) + tsr.AppendRowFloat(pertrl) + epcTimer.ResetStart() + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } +} + +// StatDiagnostics returns a Stats function that computes key +// statistics. 
+func StatDiagnostics(statDir *datafs.Data, net *Network, layerNames []string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + statNames := []string{"ActMAvg", "ActMMax", "MaxGeM"} + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() < times[0].Int64() { + return + } + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + ndata := net.Context().NData + for _, lnm := range layerNames { + for si, statName := range statNames { + ly := net.LayerByName(lnm) + lpi := ly.Params.PoolIndex(0) + name := lnm + "_" + statName + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + for di := range ndata { + var stat float32 + switch si { + case 0: + stat = PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + case 1: + stat = PoolAvgMax(AMAct, AMMinus, Max, lpi, di) + case 2: + stat = PoolAvgMax(AMGeInt, AMMinus, Max, lpi, di) + } + tsr.AppendRowFloat(float64(stat)) + } + // todo: last 5 here + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } + } + } +} diff --git a/axon/simstats.goal b/axon/simstats.goal new file mode 100644 index 000000000..75648666d --- /dev/null +++ b/axon/simstats.goal @@ -0,0 +1,121 @@ +// Copyright (c) 2024, The Emergent Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package axon + +import ( + "time" + + "cogentcore.org/core/base/timer" + "cogentcore.org/core/enums" + "cogentcore.org/core/plot" + "cogentcore.org/core/tensor/datafs" + "cogentcore.org/core/tensor/stats/stats" +) + +// StatPerTrialMSec returns a Stats function that reports the number of milliseconds +// per trial, for the given times and training mode enum values. +// The times should start at the Trial and go up from there: data will +// be recorded from the second time level. The statName is the name of another +// stat that is used to get the number of trials. 
+func StatPerTrialMSec(statDir *datafs.Data, statName string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + var epcTimer timer.Time + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() <= times[0].Int64() { + return + } + name := "PerTrialMSec" + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + continue + case 1: + epcTimer.Stop() + subd := modeDir.RecycleDir(times[0].String()) + trls := subd.Value(statName) // must be a stat + epcTimer.N = trls.Len() + pertrl := float64(epcTimer.Avg()) / float64(time.Millisecond) + tsr.AppendRowFloat(pertrl) + epcTimer.ResetStart() + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } +} + +// StatDiagnostics returns a Stats function that computes key +// statistics. +func StatDiagnostics(statDir *datafs.Data, net *Network, layerNames []string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + statNames := []string{"ActMAvg", "ActMMax", "MaxGeM"} + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() < times[0].Int64() { + return + } + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + ndata := net.Context().NData + for _, lnm := range layerNames { + for si, statName := range statNames { + ly := net.LayerByName(lnm) + lpi := ly.Params.PoolIndex(0) + name := lnm + "_" + statName + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + for di := range ndata { + var stat float32 + switch si { + case 0: + stat = PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + case 1: + stat = PoolAvgMax(AMAct, AMMinus, Max, lpi, di) + case 2: + stat = PoolAvgMax(AMGeInt, AMMinus, Max, lpi, di) + } + tsr.AppendRowFloat(float64(stat)) + } + // todo: last 5 here + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } + } + } +} diff --git a/examples/ra25/ra25.go b/examples/ra25/ra25.go index 289f94b45..979f35a9c 100644 --- a/examples/ra25/ra25.go +++ b/examples/ra25/ra25.go @@ -596,14 +596,25 @@ func (ss *Sim) RunStats(lmode Modes, ltime Times, lphase StatsPhase) { } } +func (ss *Sim) StatsData(lmode Modes, ltime Times) *datafs.Data { + modeDir := ss.Stats.RecycleDir(lmode.String()) + return modeDir.RecycleDir(ltime.String()) +} + func (ss *Sim) InitStats() { for mode, st := range ss.Loops.Stacks { + cmd := mode.(Modes) for _, tm := range st.Order { ctm := tm.(Times) if ctm == Cycle { continue } ss.RunStats(mode.(Modes), ctm, Start) + if ss.GUI.Tabs != nil { + if cmd == Train && ctm == Epoch { + ss.GUI.Tabs.PlotDataFS(ss.StatsData(cmd, ctm)) + } + } } } } @@ -611,8 
+622,10 @@ func (ss *Sim) InitStats() { // ConfigStats handles configures functions to do all stats computation // in the datafs system. func (ss *Sim) ConfigStats() { + net := ss.Net ss.Stats, _ = ss.Root.Mkdir("Stats") ss.Current, _ = ss.Stats.Mkdir("Current") + // todo: move this to simstats: for md, st := range ss.Loops.Stacks { cmd := md.(Modes) for _, tm := range st.Order { @@ -625,7 +638,7 @@ func (ss *Sim) ConfigStats() { return } name := tm.String() // name of stat = time - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[int](timeDir, name) @@ -655,12 +668,14 @@ func (ss *Sim) ConfigStats() { }) } } + // todo: loop over stat names as in diagnostics + // and include NZero, and stopping just grabs that from current. ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { name := "UnitErr" modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[float64](timeDir, name) - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) if lphase == Start { tsr.SetNumRows(0) if ps := plot.GetStylersFrom(tsr); ps == nil { @@ -763,6 +778,15 @@ func (ss *Sim) ConfigStats() { tsr.AppendRow(stat) } }) + perTrlFunc := axon.StatPerTrialMSec(ss.Stats, "Err", Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + perTrlFunc(lmode, ltime, lphase == Start) + }) + lays := net.LayersByType(axon.SuperLayer, axon.CTLayer, axon.TargetLayer) + diagsFunc := axon.StatDiagnostics(ss.Stats, net, lays, Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + diagsFunc(lmode, ltime, lphase == Start) + }) } // StatCounters returns counters string to show at bottom of netview. @@ -790,25 +814,6 @@ func (ss *Sim) StatCounters(md, tm enums.Enum) string { // ss.ViewUpdate.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Di", "TrialName", "Cycle", "UnitErr", "TrlErr", "PhaseDiff"}) } -// TrialStats computes the trial-level statistics. -// Aggregation is done directly from log data. -func (ss *Sim) TrialStats(di int) { - // out := ss.Net.LayerByName("Output") - // - // ss.Stats.SetFloat("PhaseDiff", float64(out.Values[di].PhaseDiff.Cor)) - // ss.Stats.SetFloat("UnitErr", out.PctUnitErr(&ss.Context)[di]) - // - // if ss.Stats.Float("UnitErr") > 0 { - // ss.Stats.SetFloat("TrlErr", 1) - // } else { - // - // ss.Stats.SetFloat("TrlErr", 0) - // } -} - -////////////////////////////////////////////////////////////////////////////// -// Logging - func (ss *Sim) ConfigLogs() { // ss.Stats.SetString("RunName", ss.Params.RunName(0)) // used for naming logs, stats, etc // @@ -844,40 +849,6 @@ func (ss *Sim) ConfigLogs() { // ss.Logs.SetMeta(Train, Run, "LegendCol", "RunName") } -// Log is the main logging function, handles special things for different scopes -func (ss *Sim) Log(mode Modes, time Times) { - // ctx := ss.Net.Context() - // - // if mode != Analyze { - // ctx.Mode = mode // Also set specifically in a Loop callback. 
- // } - // - // dt := ss.Logs.Table(mode, time) - // - // if dt == nil { - // return - // } - // - // row := dt.Rows - // - // switch { - // case time == Cycle: - // - // return - // - // case time == Trial: - // - // for di := 0; di < int(ctx.NData); di++ { - // ss.TrialStats(di) - // ss.StatCounters(di) - // ss.Logs.LogRowDi(mode, time, row, di) - // } - // return // don't do reg below - // } - // - // ss.Logs.LogRow(mode, time, row) // also logs to file, etc -} - //////////////////////////////////////////////////////////////////////////////////////////// // Gui @@ -902,6 +873,7 @@ func (ss *Sim) ConfigGUI() { nv.SceneXYZ().Camera.LookAt(math32.Vec3(0, 0, 0), math32.Vec3(0, 1, 0)) ss.GUI.UpdateFiles() + ss.InitStats() // ss.GUI.AddPlots(title, &ss.Logs) ss.GUI.FinalizeGUI(false) // if ss.Config.Run.GPU { diff --git a/examples/ra25/ra25.goal b/examples/ra25/ra25.goal index c65bb11e8..2c154a233 100644 --- a/examples/ra25/ra25.goal +++ b/examples/ra25/ra25.goal @@ -594,14 +594,25 @@ func (ss *Sim) RunStats(lmode Modes, ltime Times, lphase StatsPhase) { } } +func (ss *Sim) StatsData(lmode Modes, ltime Times) *datafs.Data { + modeDir := ss.Stats.RecycleDir(lmode.String()) + return modeDir.RecycleDir(ltime.String()) +} + func (ss *Sim) InitStats() { for mode, st := range ss.Loops.Stacks { + cmd := mode.(Modes) for _, tm := range st.Order { ctm := tm.(Times) if ctm == Cycle { continue } ss.RunStats(mode.(Modes), ctm, Start) + if ss.GUI.Tabs != nil { + if cmd == Train && ctm == Epoch { + ss.GUI.Tabs.PlotDataFS(ss.StatsData(cmd, ctm)) + } + } } } } @@ -609,8 +620,10 @@ func (ss *Sim) InitStats() { // ConfigStats handles configures functions to do all stats computation // in the datafs system. func (ss *Sim) ConfigStats() { + net := ss.Net ss.Stats, _ = ss.Root.Mkdir("Stats") ss.Current, _ = ss.Stats.Mkdir("Current") + // todo: move this to simstats: for md, st := range ss.Loops.Stacks { cmd := md.(Modes) for _, tm := range st.Order { @@ -623,7 +636,7 @@ func (ss *Sim) ConfigStats() { return } name := tm.String() // name of stat = time - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[int](timeDir, name) @@ -653,12 +666,14 @@ func (ss *Sim) ConfigStats() { }) } } + // todo: loop over stat names as in diagnostics + // and include NZero, and stopping just grabs that from current. ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { name := "UnitErr" modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[float64](timeDir, name) - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) if lphase == Start { tsr.SetNumRows(0) if ps := plot.GetStylersFrom(tsr); ps == nil { @@ -761,6 +776,15 @@ func (ss *Sim) ConfigStats() { tsr.AppendRow(stat) } }) + perTrlFunc := axon.StatPerTrialMSec(ss.Stats, "Err", Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + perTrlFunc(lmode, ltime, lphase == Start) + }) + lays := net.LayersByType(axon.SuperLayer, axon.CTLayer, axon.TargetLayer) + diagsFunc := axon.StatDiagnostics(ss.Stats, net, lays, Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + diagsFunc(lmode, ltime, lphase == Start) + }) } // StatCounters returns counters string to show at bottom of netview. 
@@ -786,24 +810,6 @@ func (ss *Sim) StatCounters(md, tm enums.Enum) string { // ss.ViewUpdate.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Di", "TrialName", "Cycle", "UnitErr", "TrlErr", "PhaseDiff"}) } -// TrialStats computes the trial-level statistics. -// Aggregation is done directly from log data. -func (ss *Sim) TrialStats(di int) { - // out := ss.Net.LayerByName("Output") - // - // ss.Stats.SetFloat("PhaseDiff", float64(out.Values[di].PhaseDiff.Cor)) - // ss.Stats.SetFloat("UnitErr", out.PctUnitErr(&ss.Context)[di]) - // - // if ss.Stats.Float("UnitErr") > 0 { - // ss.Stats.SetFloat("TrlErr", 1) - // } else { - // ss.Stats.SetFloat("TrlErr", 0) - // } -} - -////////////////////////////////////////////////////////////////////////////// -// Logging - func (ss *Sim) ConfigLogs() { // ss.Stats.SetString("RunName", ss.Params.RunName(0)) // used for naming logs, stats, etc // @@ -839,33 +845,6 @@ func (ss *Sim) ConfigLogs() { // ss.Logs.SetMeta(Train, Run, "LegendCol", "RunName") } -// Log is the main logging function, handles special things for different scopes -func (ss *Sim) Log(mode Modes, time Times) { - // ctx := ss.Net.Context() - // if mode != Analyze { - // ctx.Mode = mode // Also set specifically in a Loop callback. - // } - // dt := ss.Logs.Table(mode, time) - // if dt == nil { - // return - // } - // row := dt.Rows - // - // switch { - // case time == Cycle: - // return - // case time == Trial: - // for di := 0; di < int(ctx.NData); di++ { - // ss.TrialStats(di) - // ss.StatCounters(di) - // ss.Logs.LogRowDi(mode, time, row, di) - // } - // return // don't do reg below - // } - // - // ss.Logs.LogRow(mode, time, row) // also logs to file, etc -} - //////////////////////////////////////////////////////////////////////////////////////////// // Gui @@ -890,6 +869,7 @@ func (ss *Sim) ConfigGUI() { nv.SceneXYZ().Camera.LookAt(math32.Vec3(0, 0, 0), math32.Vec3(0, 1, 0)) ss.GUI.UpdateFiles() + ss.InitStats() // ss.GUI.AddPlots(title, &ss.Logs) ss.GUI.FinalizeGUI(false) // if ss.Config.Run.GPU {
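A minimal usage sketch of the Run*/RunDone contract documented in the gosl.go kernel wrappers above: multiple Run* calls are recorded and then launched together in a single GPU command submission when RunDone is called, while on the CPU each Run* executes immediately. Everything here is illustrative only: the import path is assumed to be github.com/emer/axon/v2/axon, the element counts and kernel ordering are placeholders rather than axon's actual cycle code, and the optional GPUVars arguments to RunDone (naming values to sync back to the CPU) are omitted.

// Sketch only: placeholder counts and ordering, not axon's real cycle.
package main

import "github.com/emer/axon/v2/axon"

// runCycleSketch batches several kernels into one command submission.
func runCycleSketch(nNeurons, nLayers, nPools int) {
	// With axon.UseGPU set, these calls only record dispatches;
	// nothing starts on the GPU until RunDone below.
	axon.RunGatherSpikes(nNeurons)
	axon.RunLayerGi(nLayers)
	axon.RunPoolGi(nPools)
	axon.RunCycleNeuron(nNeurons)
	axon.RunSendSpike(nNeurons)
	// RunDone submits the batch; pass GPUVars values here for any
	// variables that must be synced back from the GPU.
	axon.RunDone()
}

func main() {
	runCycleSketch(100, 4, 8) // placeholder sizes
}

For a single kernel followed by an immediate sync, the generated RunOne* variants (for example RunOneWtFromDWtSyn) do the Run and the RunDone in one call; as the doc comments note, batching several Run* calls before one RunDone is much more efficient when a sequence of kernels can be run together.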