diff --git a/axon/act-layer.go b/axon/act-layer.go index 58aca1ebc..2a2474b68 100644 --- a/axon/act-layer.go +++ b/axon/act-layer.go @@ -291,8 +291,8 @@ func (ly *LayerParams) GInteg(ctx *Context, pi, ni, di uint32) { // conductance values prior to doing the standard updates in GFromRawSyn // drvAct is for Pulvinar layers, activation of driving neuron func (ly *LayerParams) SpecialPreGs(ctx *Context, pi, ni, di uint32, drvGe float32, nonDrivePct float32) float32 { - saveVal := float32(0) // sometimes we need to use a value computed here, for the post Gs step - pi0 := pi - 1 // 0-n pool index + saveVal := float32(0) // sometimes we need to use a value computed here, for the post Gs step + pi0 := pi - ly.PoolSt - 1 // 0-n pool index pnn := uint32(PoolNNeurons(pi)) pni := NeuronIxs.Value(int(NrnNeurIndex), int(ni)) - uint32(PoolsInt.Value(int(PoolNeurSt), int(pi), int(di))) nrnCtxtGe := Neurons.Value(int(CtxtGe), int(ni), int(di)) @@ -568,7 +568,7 @@ func (ly *LayerParams) SendSpike(ctx *Context, ni, di uint32) { func (ly *LayerParams) PostSpikeSpecial(ctx *Context, lpi, pi, ni, di uint32) { Neurons.Set(Neurons.Value(int(CaSpkP), int(ni), int(di)), int(Burst), int(ni), int(di)) li := ly.Index - pi0 := pi - 1 // 0-n pool index + pi0 := pi - ly.PoolSt - 1 // 0-n pool index pnn := uint32(PoolNNeurons(pi)) pni := NeuronIxs.Value(int(NrnNeurIndex), int(ni)) - uint32(PoolsInt.Value(int(PoolNeurSt), int(pi), int(di))) hasRew := GlobalScalars.Value(int(GvHasRew), int(di)) > 0 diff --git a/axon/act-layer.goal b/axon/act-layer.goal index 36176f6b5..02fce8f0e 100644 --- a/axon/act-layer.goal +++ b/axon/act-layer.goal @@ -290,7 +290,7 @@ func (ly *LayerParams) GInteg(ctx *Context, pi, ni, di uint32) { // drvAct is for Pulvinar layers, activation of driving neuron func (ly *LayerParams) SpecialPreGs(ctx *Context, pi, ni, di uint32, drvGe float32, nonDrivePct float32) float32 { saveVal := float32(0) // sometimes we need to use a value computed here, for the post Gs step - pi0 := pi - 1 // 0-n pool index + pi0 := pi - ly.PoolSt - 1 // 0-n pool index pnn := uint32(PoolNNeurons(pi)) pni := NeuronIxs[NrnNeurIndex, ni] - uint32(PoolsInt[PoolNeurSt, pi, di]) nrnCtxtGe := Neurons[CtxtGe, ni, di] @@ -462,7 +462,7 @@ func (ly *LayerParams) GFromRawSyn(ctx *Context, ni, di uint32) { extraSyn = md * nrnGModSyn * ly.Acts.Dend.ModGain default: if ly.Acts.Dend.HasMod.IsTrue() { - md := ly.Acts.Dend.ModBase + ly.Acts.Dend.ModGain*nrnGModSyn + md := ly.Acts.Dend.ModBase + ly.Acts.Dend.ModGain * nrnGModSyn if md > 1 { md = 1 } @@ -566,7 +566,7 @@ func (ly *LayerParams) SendSpike(ctx *Context, ni, di uint32) { func (ly *LayerParams) PostSpikeSpecial(ctx *Context, lpi, pi, ni, di uint32) { Neurons[Burst, ni, di] = Neurons[CaSpkP, ni, di] li := ly.Index - pi0 := pi - 1 // 0-n pool index + pi0 := pi - ly.PoolSt - 1 // 0-n pool index pnn := uint32(PoolNNeurons(pi)) pni := NeuronIxs[NrnNeurIndex, ni] - uint32(PoolsInt[PoolNeurSt, pi, di]) hasRew := GlobalScalars[GvHasRew, di] > 0 diff --git a/axon/act-net.go b/axon/act-net.go index 3b3b8fb3e..325b17f19 100644 --- a/axon/act-net.go +++ b/axon/act-net.go @@ -6,24 +6,36 @@ package axon -// Cycle runs one cycle of activation updating using threading methods. -func (nt *Network) Cycle() { - // todo: chunks of 10 cycles +// todo: don't even need layer-level ultimately. + +// Cycle runs n cycles of activation updating. +// If getNeurons is true, then neuron state is synced back +// from the GPU (for cycle-level display etc). Otherwise only +// layer-level state is synced. 
+func (nt *Network) Cycle(ncyc int, getNeurons bool) { nix := nt.NetIxs() ctx := nt.Context() nd := int(nix.NNeurons * ctx.NData) ld := int(nix.NLayers * ctx.NData) pd := int(nix.NPools * ctx.NData) - RunGatherSpikes(nd) - RunLayerGi(ld) - RunBetweenGi(ld) - RunPoolGi(pd) - RunCycleNeuron(nd) - RunSendSpike(nd) - RunCyclePost(ld) - RunCycleInc(1) - RunDoneLayers() + ToGPUCtxGlobal() + for range ncyc { + RunGatherSpikes(nd) + RunLayerGi(ld) + RunBetweenGi(ld) + RunPoolGi(pd) + RunCycleNeuron(nd) + RunSendSpike(nd) + RunCyclePost(ld) + RunCycleInc(1) + } + + if getNeurons { + RunDoneLayersNeurons() + } else { + RunDoneLayers() + } // todo: fix this: // var ldt, vta *Layer @@ -70,6 +82,7 @@ func (nt *Network) ApplyExts() { if !UseGPU { return } + ToGPU(ExtsVar) nix := nt.NetIxs() ctx := nt.Context() nd := int(nix.NNeurons * ctx.NData) @@ -85,6 +98,7 @@ func (nt *Network) MinusPhase() { RunMinusPhasePool(pd) RunMinusPhaseNeuron(nd) nt.MinusPhasePost() + ToGPULayersNeurons() // todo: // nt.GPU.SyncStateToGPU() } @@ -118,6 +132,7 @@ func (nt *Network) PlusPhase() { RunPlusPhasePool(pd) RunPlusPhaseNeuron(nd) nt.PlusPhasePost() + ToGPULayersNeurons() // todo: // nt.GPU.SyncStateToGPU() } diff --git a/axon/act-net.goal b/axon/act-net.goal index c0238d93c..cbe9f4910 100644 --- a/axon/act-net.goal +++ b/axon/act-net.goal @@ -4,24 +4,36 @@ package axon -// Cycle runs one cycle of activation updating using threading methods. -func (nt *Network) Cycle() { - // todo: chunks of 10 cycles +// todo: don't even need layer-level ultimately. + +// Cycle runs n cycles of activation updating. +// If getNeurons is true, then neuron state is synced back +// from the GPU (for cycle-level display etc). Otherwise only +// layer-level state is synced. +func (nt *Network) Cycle(ncyc int, getNeurons bool) { nix := nt.NetIxs() ctx := nt.Context() nd := int(nix.NNeurons * ctx.NData) ld := int(nix.NLayers * ctx.NData) pd := int(nix.NPools * ctx.NData) - RunGatherSpikes(nd) - RunLayerGi(ld) - RunBetweenGi(ld) - RunPoolGi(pd) - RunCycleNeuron(nd) - RunSendSpike(nd) - RunCyclePost(ld) - RunCycleInc(1) - RunDoneLayers() + ToGPUCtxGlobal() + for range ncyc { + RunGatherSpikes(nd) + RunLayerGi(ld) + RunBetweenGi(ld) + RunPoolGi(pd) + RunCycleNeuron(nd) + RunSendSpike(nd) + RunCyclePost(ld) + RunCycleInc(1) + } + + if getNeurons { + RunDoneLayersNeurons() + } else { + RunDoneLayers() + } // todo: fix this: // var ldt, vta *Layer @@ -64,6 +76,7 @@ func (nt *Network) ApplyExts() { if !UseGPU { return } + ToGPU(ExtsVar) nix := nt.NetIxs() ctx := nt.Context() nd := int(nix.NNeurons * ctx.NData) @@ -79,6 +92,7 @@ func (nt *Network) MinusPhase() { RunMinusPhasePool(pd) RunMinusPhaseNeuron(nd) nt.MinusPhasePost() + ToGPULayersNeurons() // todo: // nt.GPU.SyncStateToGPU() } @@ -112,6 +126,7 @@ func (nt *Network) PlusPhase() { RunPlusPhasePool(pd) RunPlusPhaseNeuron(nd) nt.PlusPhasePost() + ToGPULayersNeurons() // todo: // nt.GPU.SyncStateToGPU() } diff --git a/axon/basic_test.go b/axon/basic_test.go index 3fdb3a645..c2647fbbf 100644 --- a/axon/basic_test.go +++ b/axon/basic_test.go @@ -21,6 +21,7 @@ import ( "cogentcore.org/core/tensor" "github.com/emer/emergent/v2/etime" "github.com/emer/emergent/v2/paths" + "github.com/stretchr/testify/assert" "golang.org/x/exp/maps" ) @@ -228,7 +229,7 @@ func TestSpikeProp(t *testing.T) { inCyc := 0 hidCyc := 0 for cyc := range 100 { - net.Cycle() + net.Cycle(1, true) // fmt.Println(cyc, Neurons[Ge, hidLay.NeurStIndex, 0], Neurons[GeRaw, hidLay.NeurStIndex, 0]) if Neurons.Value(int(Spike), 
int(inLay.NeurStIndex), int(0)) > 0 { // fmt.Println("in spike:", cyc) @@ -327,7 +328,7 @@ func TestInitWeights(t *testing.T) { for qtr := range 4 { for range 50 { - testNet.Cycle() + testNet.Cycle(1, true) } if qtr == 2 { testNet.MinusPhase() @@ -343,6 +344,48 @@ func TestInitWeights(t *testing.T) { ReportValDiffs(t, Tol8, valMapA, valMapB, "init1", "init2") } +func TestGPUState(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + + testNetA := newTestNet(1) + + GPUInit() + UseGPU = true + + testNetB := newTestNet(1) + + RunCycleInc(1) + // get everything back + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar, SynapsesVar, SynapseTracesVar, PathGBufVar, PathGSynsVar) + // note: the following requires turning off read-only in vars.go + // RunDone(LayersVar, PathsVar, NetworkIxsVar, NeuronIxsVar, SynapseIxsVar, PathSendConVar, RecvPathIxsVar, PathRecvConVar, RecvSynIxsVar, CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar, SynapsesVar, SynapseTracesVar, PathGBufVar, PathGSynsVar) + // assert.Equal(t, testNetA.LayParams, testNetB.LayParams) + // assert.Equal(t, testNetA.LayParams, Layers) + // assert.Equal(t, testNetA.PathParams, testNetB.PathParams) + // assert.Equal(t, testNetA.NetworkIxs, testNetB.NetworkIxs) + // assert.Equal(t, testNetA.NeuronIxs.Values, testNetB.NeuronIxs.Values) + // assert.Equal(t, testNetA.SynapseIxs.Values, testNetB.SynapseIxs.Values) + // assert.Equal(t, testNetA.PathSendCon.Values, testNetB.PathSendCon.Values) + // assert.Equal(t, testNetA.RecvPathIxs.Values, testNetB.RecvPathIxs.Values) + // assert.Equal(t, testNetA.PathRecvCon.Values, testNetB.PathRecvCon.Values) + // assert.Equal(t, testNetA.RecvSynIxs.Values, testNetB.RecvSynIxs.Values) + assert.NotEqual(t, testNetA.Ctx, testNetB.Ctx) + assert.Equal(t, testNetA.Neurons.Values, testNetB.Neurons.Values) + assert.Equal(t, testNetA.NeuronAvgs.Values, testNetB.NeuronAvgs.Values) + assert.Equal(t, testNetA.LayerStates.Values, testNetB.LayerStates.Values) + assert.Equal(t, testNetA.GlobalScalars.Values, testNetB.GlobalScalars.Values) + assert.Equal(t, testNetA.GlobalVectors.Values, testNetB.GlobalVectors.Values) + assert.Equal(t, testNetA.Exts.Values, testNetB.Exts.Values) + assert.Equal(t, testNetA.Pools.Values, testNetB.Pools.Values) + assert.Equal(t, testNetA.PoolsInt.Values, testNetB.PoolsInt.Values) + assert.Equal(t, testNetA.PathGBuf.Values, testNetB.PathGBuf.Values) + assert.Equal(t, testNetA.PathGSyns.Values, testNetB.PathGSyns.Values) + assert.Equal(t, testNetA.Synapses.Values, testNetB.Synapses.Values) + assert.Equal(t, testNetA.SynapseTraces.Values, testNetB.SynapseTraces.Values) +} + func TestNetAct(t *testing.T) { NetActTest(t, Tol7, false) } @@ -359,6 +402,11 @@ func TestGPUAct(t *testing.T) { // Note: use NetDebugAct for printf debugging of all values -- // "this is only a test" func NetActTest(t *testing.T, tol float32, gpu bool) { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(1) ctx := testNet.Context() testNet.InitExt() @@ -368,14 +416,6 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { hidLay := testNet.LayerByName("Hidden") outLay := testNet.LayerByName("Output") - if gpu { - GPUInit() - UseGPU = true - ToGPUAll() - // testNet.ConfigGPUnoGUI() - // testNet.GPU.CycleByCycle = true // alt modes - } - qtr0HidActs := []float32{0.6944439, 0, 0, 0} qtr0HidGes := []float32{0.35385746, 0, 0, 
0} qtr0HidGis := []float32{0.15478331, 0.15478331, 0.15478331, 0.15478331} @@ -426,10 +466,7 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { for qtr := range 4 { for cyc := range cycPerQtr { _ = cyc - testNet.Cycle() - // if gpu { - // testNet.GPU.SyncNeuronsFromGPU() - // } + testNet.Cycle(1, true) } if qtr == 2 { testNet.MinusPhase() @@ -484,41 +521,6 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { // testNet.GPU.Destroy() } -func TestGPUDiffs(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } - nonGPUValues := NetDebugAct(t, false, false, 1, false) - gpuValues := NetDebugAct(t, false, true, 1, false) - // note: this has bad tolerance due to NMDA -- can see that if you raise tol to Tol5 etc - ReportValDiffs(t, Tol4, nonGPUValues, gpuValues, "CPU", "GPU") -} - -func TestDebugAct(t *testing.T) { - t.Skip("skipped in regular testing") - NetDebugAct(t, true, false, 1, false) -} - -func TestDebugGPUAct(t *testing.T) { - t.Skip("skipped in regular testing") - NetDebugAct(t, true, true, 1, false) -} - -func TestNDataDiffs(t *testing.T) { - nd1Values := NetDebugAct(t, false, false, 1, true) - nd4Values := NetDebugAct(t, false, false, 4, true) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") -} - -func TestGPUNDataDiffs(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } - nd1Values := NetDebugAct(t, false, true, 1, true) - nd4Values := NetDebugAct(t, false, true, 4, true) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") -} - // ReportValDiffs -- reports diffs between a, b values at given tolerance func ReportValDiffs(t *testing.T, tolerance float32, va, vb map[string]float32, aLabel, bLabel string, exclude ...string) { keys := maps.Keys(va) @@ -556,6 +558,11 @@ func ReportValDiffs(t *testing.T, tolerance float32, va, vb map[string]float32, // and also returns a map of all values and variables that can be used for a more // fine-grained diff test, e.g., see the GPU version. func NetDebugAct(t *testing.T, printValues bool, gpu bool, nData int, initWts bool) map[string]float32 { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(nData) ApplyParamSheets(testNet, layerParams["FullDecay"], pathParams["FullDecay"]) @@ -577,20 +584,14 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini var vals []float32 - if gpu { - // testNet.ConfigGPUnoGUI() - // testNet.GPU.RecFunTimes = true - // testNet.GPU.CycleByCycle = true // key for recording results cycle-by-cycle - } - // these control what is printed. 
// the whole thing is run and returned in the valMap - valsPerRow := 8 + valsPerRow := 4 nQtrs := 1 // max 4 - cycPerQtr := 5 // max 50 - nPats := 2 // max 4 - stLayer := 1 // max 2 - edLayer := 2 // max 3 + cycPerQtr := 1 // max 50 + nPats := 1 // max 4 + stLayer := 0 // max 2 + edLayer := 1 // max 3 nNeurs := 1 // max 4 -- number of neuron values to print for pi := 0; pi < 4; pi++ { @@ -611,9 +612,9 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini testNet.ApplyExts() // key now for GPU - for qtr := 0; qtr < 4; qtr++ { - for cyc := 0; cyc < 50; cyc++ { - testNet.Cycle() + for qtr := 0; qtr < nQtrs; qtr++ { + for cyc := 0; cyc < cycPerQtr; cyc++ { + testNet.Cycle(1, true) // get neuron state for ni := 0; ni < 4; ni++ { for li := 0; li < 3; li++ { @@ -668,6 +669,41 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini return valMap } +func TestGPUDiffs(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + nonGPUValues := NetDebugAct(t, false, false, 1, false) + gpuValues := NetDebugAct(t, false, true, 1, false) + // note: this has bad tolerance due to NMDA -- can see that if you raise tol to Tol5 etc + ReportValDiffs(t, Tol4, nonGPUValues, gpuValues, "CPU", "GPU") +} + +func TestDebugAct(t *testing.T) { + // t.Skip("skipped in regular testing") + NetDebugAct(t, true, false, 1, false) +} + +func TestDebugGPUAct(t *testing.T) { + // t.Skip("skipped in regular testing") + NetDebugAct(t, true, true, 1, false) +} + +func TestNDataDiffs(t *testing.T) { + nd1Values := NetDebugAct(t, false, false, 1, true) + nd4Values := NetDebugAct(t, false, false, 4, true) + ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") +} + +func TestGPUNDataDiffs(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + nd1Values := NetDebugAct(t, false, true, 1, true) + nd4Values := NetDebugAct(t, false, true, 4, true) + ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") +} + func TestNetLearn(t *testing.T) { NetTestLearn(t, Tol7, false) } @@ -680,6 +716,11 @@ func TestGPULearn(t *testing.T) { } func NetTestLearn(t *testing.T, tol float32, gpu bool) { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(1) ctx := testNet.Context() @@ -735,12 +776,6 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { testNet.InitWeights() testNet.InitExt() - if gpu { - // testNet.ConfigGPUnoGUI() - // testNet.GPU.RecFunTimes = true // alt forms - // testNet.GPU.CycleByCycle = true // - } - for pi := 0; pi < 4; pi++ { testNet.NewState(etime.Train, false) @@ -752,10 +787,7 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - testNet.Cycle() - if gpu { - // testNet.GPU.SyncNeuronsFromGPU() - } + testNet.Cycle(1, true) hidLay.UnitValues(&hidAct, "Act", 0) hidLay.UnitValues(&hidGes, "Ge", 0) @@ -916,8 +948,7 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { testNet.NewState(etime.Train, false) for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - testNet.Cycle() - // testNet.GPU.SyncNeuronsFromGPU() + testNet.Cycle(1, true) hidLay.UnitValues(&hidAct, "Act", 0) hidLay.UnitValues(&hidGes, "Ge", 0) @@ -1076,7 +1107,7 @@ func RunDebugLearn(t *testing.T, testNet *Network, printValues bool, gpu bool, i for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < 50; cyc++ { - testNet.Cycle() + testNet.Cycle(1, true) } 
if qtr == 2 { testNet.MinusPhase() @@ -1311,7 +1342,7 @@ func TestInhibAct(t *testing.T) { inhibNet.NewState(etime.Train, false) for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - inhibNet.Cycle() + inhibNet.Cycle(1, true) if printCycs { inLay.UnitValues(&inActs, "Act", 0) diff --git a/axon/basic_test.goal b/axon/basic_test.goal index b7277c643..475c08ac8 100644 --- a/axon/basic_test.goal +++ b/axon/basic_test.goal @@ -19,6 +19,7 @@ import ( "cogentcore.org/core/tensor" "github.com/emer/emergent/v2/etime" "github.com/emer/emergent/v2/paths" + "github.com/stretchr/testify/assert" "golang.org/x/exp/maps" ) @@ -116,7 +117,7 @@ func newTestNet(nData int) *Network { testNet.Build() testNet.Defaults() ApplyParamSheets(testNet, layerParams["Base"], pathParams["Base"]) - testNet.InitWeights() // get GScale here + testNet.InitWeights() // get GScale here testNet.NewState(etime.Train, false) return testNet } @@ -228,7 +229,7 @@ func TestSpikeProp(t *testing.T) { inCyc := 0 hidCyc := 0 for cyc := range 100 { - net.Cycle() + net.Cycle(1, true) // fmt.Println(cyc, Neurons[Ge, hidLay.NeurStIndex, 0], Neurons[GeRaw, hidLay.NeurStIndex, 0]) if Neurons[Spike, inLay.NeurStIndex, 0] > 0 { // fmt.Println("in spike:", cyc) @@ -327,7 +328,7 @@ func TestInitWeights(t *testing.T) { for qtr := range 4 { for range 50 { - testNet.Cycle() + testNet.Cycle(1, true) } if qtr == 2 { testNet.MinusPhase() @@ -343,6 +344,48 @@ func TestInitWeights(t *testing.T) { ReportValDiffs(t, Tol8, valMapA, valMapB, "init1", "init2") } +func TestGPUState(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + + testNetA := newTestNet(1) + + GPUInit() + UseGPU = true + + testNetB := newTestNet(1) + + RunCycleInc(1) + // get everything back + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar, SynapsesVar, SynapseTracesVar, PathGBufVar, PathGSynsVar) + // note: the following requires turning off read-only in vars.go + // RunDone(LayersVar, PathsVar, NetworkIxsVar, NeuronIxsVar, SynapseIxsVar, PathSendConVar, RecvPathIxsVar, PathRecvConVar, RecvSynIxsVar, CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar, SynapsesVar, SynapseTracesVar, PathGBufVar, PathGSynsVar) + // assert.Equal(t, testNetA.LayParams, testNetB.LayParams) + // assert.Equal(t, testNetA.LayParams, Layers) + // assert.Equal(t, testNetA.PathParams, testNetB.PathParams) + // assert.Equal(t, testNetA.NetworkIxs, testNetB.NetworkIxs) + // assert.Equal(t, testNetA.NeuronIxs.Values, testNetB.NeuronIxs.Values) + // assert.Equal(t, testNetA.SynapseIxs.Values, testNetB.SynapseIxs.Values) + // assert.Equal(t, testNetA.PathSendCon.Values, testNetB.PathSendCon.Values) + // assert.Equal(t, testNetA.RecvPathIxs.Values, testNetB.RecvPathIxs.Values) + // assert.Equal(t, testNetA.PathRecvCon.Values, testNetB.PathRecvCon.Values) + // assert.Equal(t, testNetA.RecvSynIxs.Values, testNetB.RecvSynIxs.Values) + assert.NotEqual(t, testNetA.Ctx, testNetB.Ctx) + assert.Equal(t, testNetA.Neurons.Values, testNetB.Neurons.Values) + assert.Equal(t, testNetA.NeuronAvgs.Values, testNetB.NeuronAvgs.Values) + assert.Equal(t, testNetA.LayerStates.Values, testNetB.LayerStates.Values) + assert.Equal(t, testNetA.GlobalScalars.Values, testNetB.GlobalScalars.Values) + assert.Equal(t, testNetA.GlobalVectors.Values, testNetB.GlobalVectors.Values) + assert.Equal(t, testNetA.Exts.Values, testNetB.Exts.Values) + 
assert.Equal(t, testNetA.Pools.Values, testNetB.Pools.Values) + assert.Equal(t, testNetA.PoolsInt.Values, testNetB.PoolsInt.Values) + assert.Equal(t, testNetA.PathGBuf.Values, testNetB.PathGBuf.Values) + assert.Equal(t, testNetA.PathGSyns.Values, testNetB.PathGSyns.Values) + assert.Equal(t, testNetA.Synapses.Values, testNetB.Synapses.Values) + assert.Equal(t, testNetA.SynapseTraces.Values, testNetB.SynapseTraces.Values) +} + func TestNetAct(t *testing.T) { NetActTest(t, Tol7, false) } @@ -359,6 +402,11 @@ func TestGPUAct(t *testing.T) { // Note: use NetDebugAct for printf debugging of all values -- // "this is only a test" func NetActTest(t *testing.T, tol float32, gpu bool) { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(1) ctx := testNet.Context() testNet.InitExt() @@ -368,14 +416,6 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { hidLay := testNet.LayerByName("Hidden") outLay := testNet.LayerByName("Output") - if gpu { - GPUInit() - UseGPU = true - ToGPUAll() - // testNet.ConfigGPUnoGUI() - // testNet.GPU.CycleByCycle = true // alt modes - } - qtr0HidActs := []float32{0.6944439, 0, 0, 0} qtr0HidGes := []float32{0.35385746, 0, 0, 0} qtr0HidGis := []float32{0.15478331, 0.15478331, 0.15478331, 0.15478331} @@ -426,10 +466,7 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { for qtr := range 4 { for cyc := range cycPerQtr { _ = cyc - testNet.Cycle() - // if gpu { - // testNet.GPU.SyncNeuronsFromGPU() - // } + testNet.Cycle(1, true) } if qtr == 2 { testNet.MinusPhase() @@ -484,41 +521,6 @@ func NetActTest(t *testing.T, tol float32, gpu bool) { // testNet.GPU.Destroy() } -func TestGPUDiffs(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } - nonGPUValues := NetDebugAct(t, false, false, 1, false) - gpuValues := NetDebugAct(t, false, true, 1, false) - // note: this has bad tolerance due to NMDA -- can see that if you raise tol to Tol5 etc - ReportValDiffs(t, Tol4, nonGPUValues, gpuValues, "CPU", "GPU") -} - -func TestDebugAct(t *testing.T) { - t.Skip("skipped in regular testing") - NetDebugAct(t, true, false, 1, false) -} - -func TestDebugGPUAct(t *testing.T) { - t.Skip("skipped in regular testing") - NetDebugAct(t, true, true, 1, false) -} - -func TestNDataDiffs(t *testing.T) { - nd1Values := NetDebugAct(t, false, false, 1, true) - nd4Values := NetDebugAct(t, false, false, 4, true) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") -} - -func TestGPUNDataDiffs(t *testing.T) { - if os.Getenv("TEST_GPU") != "true" { - t.Skip("Set TEST_GPU env var to run GPU tests") - } - nd1Values := NetDebugAct(t, false, true, 1, true) - nd4Values := NetDebugAct(t, false, true, 4, true) - ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") -} - // ReportValDiffs -- reports diffs between a, b values at given tolerance func ReportValDiffs(t *testing.T, tolerance float32, va, vb map[string]float32, aLabel, bLabel string, exclude ...string) { keys := maps.Keys(va) @@ -556,6 +558,11 @@ func ReportValDiffs(t *testing.T, tolerance float32, va, vb map[string]float32, // and also returns a map of all values and variables that can be used for a more // fine-grained diff test, e.g., see the GPU version. 
func NetDebugAct(t *testing.T, printValues bool, gpu bool, nData int, initWts bool) map[string]float32 { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(nData) ApplyParamSheets(testNet, layerParams["FullDecay"], pathParams["FullDecay"]) @@ -577,20 +584,14 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini var vals []float32 - if gpu { - // testNet.ConfigGPUnoGUI() - // testNet.GPU.RecFunTimes = true - // testNet.GPU.CycleByCycle = true // key for recording results cycle-by-cycle - } - // these control what is printed. // the whole thing is run and returned in the valMap - valsPerRow := 8 + valsPerRow := 4 nQtrs := 1 // max 4 - cycPerQtr := 5 // max 50 - nPats := 2 // max 4 - stLayer := 1 // max 2 - edLayer := 2 // max 3 + cycPerQtr := 1 // max 50 + nPats := 1 // max 4 + stLayer := 0 // max 2 + edLayer := 1 // max 3 nNeurs := 1 // max 4 -- number of neuron values to print for pi := 0; pi < 4; pi++ { @@ -611,9 +612,9 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini testNet.ApplyExts() // key now for GPU - for qtr := 0; qtr < 4; qtr++ { - for cyc := 0; cyc < 50; cyc++ { - testNet.Cycle() + for qtr := 0; qtr < nQtrs; qtr++ { + for cyc := 0; cyc < cycPerQtr; cyc++ { + testNet.Cycle(1, true) // get neuron state for ni := 0; ni < 4; ni++ { for li := 0; li < 3; li++ { @@ -668,6 +669,41 @@ func RunDebugAct(t *testing.T, testNet *Network, printValues bool, gpu bool, ini return valMap } +func TestGPUDiffs(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + nonGPUValues := NetDebugAct(t, false, false, 1, false) + gpuValues := NetDebugAct(t, false, true, 1, false) + // note: this has bad tolerance due to NMDA -- can see that if you raise tol to Tol5 etc + ReportValDiffs(t, Tol4, nonGPUValues, gpuValues, "CPU", "GPU") +} + +func TestDebugAct(t *testing.T) { + // t.Skip("skipped in regular testing") + NetDebugAct(t, true, false, 1, false) +} + +func TestDebugGPUAct(t *testing.T) { + // t.Skip("skipped in regular testing") + NetDebugAct(t, true, true, 1, false) +} + +func TestNDataDiffs(t *testing.T) { + nd1Values := NetDebugAct(t, false, false, 1, true) + nd4Values := NetDebugAct(t, false, false, 4, true) + ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") +} + +func TestGPUNDataDiffs(t *testing.T) { + if os.Getenv("TEST_GPU") != "true" { + t.Skip("Set TEST_GPU env var to run GPU tests") + } + nd1Values := NetDebugAct(t, false, true, 1, true) + nd4Values := NetDebugAct(t, false, true, 4, true) + ReportValDiffs(t, Tol8, nd1Values, nd4Values, "nData = 1", "nData = 4") +} + func TestNetLearn(t *testing.T) { NetTestLearn(t, Tol7, false) } @@ -680,6 +716,11 @@ func TestGPULearn(t *testing.T) { } func NetTestLearn(t *testing.T, tol float32, gpu bool) { + if gpu { + GPUInit() + UseGPU = true + } + testNet := newTestNet(1) ctx := testNet.Context() @@ -735,12 +776,6 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { testNet.InitWeights() testNet.InitExt() - if gpu { - // testNet.ConfigGPUnoGUI() - // testNet.GPU.RecFunTimes = true // alt forms - // testNet.GPU.CycleByCycle = true // - } - for pi := 0; pi < 4; pi++ { testNet.NewState(etime.Train, false) @@ -752,10 +787,7 @@ func NetTestLearn(t *testing.T, tol float32, gpu bool) { for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - testNet.Cycle() - if gpu { - // testNet.GPU.SyncNeuronsFromGPU() - } + testNet.Cycle(1, true) hidLay.UnitValues(&hidAct, "Act", 0) 
hidLay.UnitValues(&hidGes, "Ge", 0) @@ -916,8 +948,7 @@ func NetTestRLRate(t *testing.T, tol float32, gpu bool) { testNet.NewState(etime.Train, false) for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - testNet.Cycle() - // testNet.GPU.SyncNeuronsFromGPU() + testNet.Cycle(1, true) hidLay.UnitValues(&hidAct, "Act", 0) hidLay.UnitValues(&hidGes, "Ge", 0) @@ -1076,7 +1107,7 @@ func RunDebugLearn(t *testing.T, testNet *Network, printValues bool, gpu bool, i for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < 50; cyc++ { - testNet.Cycle() + testNet.Cycle(1, true) } if qtr == 2 { testNet.MinusPhase() @@ -1310,7 +1341,7 @@ func TestInhibAct(t *testing.T) { inhibNet.NewState(etime.Train, false) for qtr := 0; qtr < 4; qtr++ { for cyc := 0; cyc < cycPerQtr; cyc++ { - inhibNet.Cycle() + inhibNet.Cycle(1, true) if printCycs { inLay.UnitValues(&inActs, "Act", 0) diff --git a/axon/gosl.go b/axon/gosl.go index 5781c5362..73d319673 100644 --- a/axon/gosl.go +++ b/axon/gosl.go @@ -61,24 +61,24 @@ func GPUInit() { { sy := gpu.NewComputeSystem(gp, "Default") GPUSystem = sy - gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSubMeanPath.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSyn.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/LayerGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/CyclePost.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSyn.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleInc.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/BetweenGi.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleInc.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/LayerGi.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PoolGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSubMeanPath.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) vars := sy.Vars() { sgp := vars.AddGroup(gpu.Storage) @@ -146,760 +146,760 @@ func GPURelease() { ComputeGPU.Release() } -// RunDWtSyn runs the DWtSyn 
kernel with given number of elements, +// RunBetweenGi runs the BetweenGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a +// Alternatively, a single-shot RunOneBetweenGi call does Run and Done for a // single run-and-sync case. -func RunDWtSyn(n int) { +func RunBetweenGi(n int) { if UseGPU { - RunDWtSynGPU(n) + RunBetweenGiGPU(n) } else { - RunDWtSynCPU(n) + RunBetweenGiCPU(n) } } -// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. -func RunDWtSynGPU(n int) { +// RunBetweenGiGPU runs the BetweenGi kernel on the GPU. See [RunBetweenGi] for more info. +func RunBetweenGiGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtSyn"] + pl := sy.ComputePipelines["BetweenGi"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtSynCPU runs the DWtSyn kernel on the CPU. -func RunDWtSynCPU(n int) { - gpu.VectorizeFunc(0, n, DWtSyn) +// RunBetweenGiCPU runs the BetweenGi kernel on the CPU. +func RunBetweenGiCPU(n int) { + gpu.VectorizeFunc(0, n, BetweenGi) } -// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, +// RunOneBetweenGi runs the BetweenGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtSyn(n int, syncVars ...GPUVars) { +func RunOneBetweenGi(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtSynGPU(n) + RunBetweenGiGPU(n) RunDone(syncVars...) } else { - RunDWtSynCPU(n) + RunBetweenGiCPU(n) } } -// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, +// RunSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a +// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a // single run-and-sync case. -func RunDWtSubMeanPath(n int) { +func RunSendSpike(n int) { if UseGPU { - RunDWtSubMeanPathGPU(n) + RunSendSpikeGPU(n) } else { - RunDWtSubMeanPathCPU(n) + RunSendSpikeCPU(n) } } -// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. -func RunDWtSubMeanPathGPU(n int) { +// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. +func RunSendSpikeGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtSubMeanPath"] + pl := sy.ComputePipelines["SendSpike"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. -func RunDWtSubMeanPathCPU(n int) { - gpu.VectorizeFunc(0, n, DWtSubMeanPath) +// RunSendSpikeCPU runs the SendSpike kernel on the CPU. 
+func RunSendSpikeCPU(n int) { + gpu.VectorizeFunc(0, n, SendSpike) } -// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, +// RunOneSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { +func RunOneSendSpike(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtSubMeanPathGPU(n) + RunSendSpikeGPU(n) RunDone(syncVars...) } else { - RunDWtSubMeanPathCPU(n) + RunSendSpikeCPU(n) } } -// RunWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, +// RunCycleInc runs the CycleInc kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneWtFromDWtSyn call does Run and Done for a +// Alternatively, a single-shot RunOneCycleInc call does Run and Done for a // single run-and-sync case. -func RunWtFromDWtSyn(n int) { +func RunCycleInc(n int) { if UseGPU { - RunWtFromDWtSynGPU(n) + RunCycleIncGPU(n) } else { - RunWtFromDWtSynCPU(n) + RunCycleIncCPU(n) } } -// RunWtFromDWtSynGPU runs the WtFromDWtSyn kernel on the GPU. See [RunWtFromDWtSyn] for more info. -func RunWtFromDWtSynGPU(n int) { +// RunCycleIncGPU runs the CycleInc kernel on the GPU. See [RunCycleInc] for more info. +func RunCycleIncGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["WtFromDWtSyn"] + pl := sy.ComputePipelines["CycleInc"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunWtFromDWtSynCPU runs the WtFromDWtSyn kernel on the CPU. -func RunWtFromDWtSynCPU(n int) { - gpu.VectorizeFunc(0, n, WtFromDWtSyn) +// RunCycleIncCPU runs the CycleInc kernel on the CPU. +func RunCycleIncCPU(n int) { + gpu.VectorizeFunc(0, n, CycleInc) } -// RunOneWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, +// RunOneCycleInc runs the CycleInc kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneWtFromDWtSyn(n int, syncVars ...GPUVars) { +func RunOneCycleInc(n int, syncVars ...GPUVars) { if UseGPU { - RunWtFromDWtSynGPU(n) + RunCycleIncGPU(n) RunDone(syncVars...) } else { - RunWtFromDWtSynCPU(n) + RunCycleIncCPU(n) } } -// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, +// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. 
-// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a +// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a // single run-and-sync case. -func RunGatherSpikes(n int) { +func RunDWtFromDiSyn(n int) { if UseGPU { - RunGatherSpikesGPU(n) + RunDWtFromDiSynGPU(n) } else { - RunGatherSpikesCPU(n) + RunDWtFromDiSynCPU(n) } } -// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. See [RunGatherSpikes] for more info. -func RunGatherSpikesGPU(n int) { +// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. +func RunDWtFromDiSynGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["GatherSpikes"] + pl := sy.ComputePipelines["DWtFromDiSyn"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. -func RunGatherSpikesCPU(n int) { - gpu.VectorizeFunc(0, n, GatherSpikes) +// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. +func RunDWtFromDiSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtFromDiSyn) } -// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, +// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneGatherSpikes(n int, syncVars ...GPUVars) { +func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { if UseGPU { - RunGatherSpikesGPU(n) + RunDWtFromDiSynGPU(n) RunDone(syncVars...) } else { - RunGatherSpikesCPU(n) + RunDWtFromDiSynCPU(n) } } -// RunLayerGi runs the LayerGi kernel with given number of elements, +// RunPoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneLayerGi call does Run and Done for a +// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a // single run-and-sync case. -func RunLayerGi(n int) { +func RunPoolGi(n int) { if UseGPU { - RunLayerGiGPU(n) + RunPoolGiGPU(n) } else { - RunLayerGiCPU(n) + RunPoolGiCPU(n) } } -// RunLayerGiGPU runs the LayerGi kernel on the GPU. See [RunLayerGi] for more info. -func RunLayerGiGPU(n int) { +// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. +func RunPoolGiGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["LayerGi"] + pl := sy.ComputePipelines["PoolGi"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunLayerGiCPU runs the LayerGi kernel on the CPU. -func RunLayerGiCPU(n int) { - gpu.VectorizeFunc(0, n, LayerGi) +// RunPoolGiCPU runs the PoolGi kernel on the CPU. +func RunPoolGiCPU(n int) { + gpu.VectorizeFunc(0, n, PoolGi) } -// RunOneLayerGi runs the LayerGi kernel with given number of elements, +// RunOnePoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. 
If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneLayerGi(n int, syncVars ...GPUVars) { +func RunOnePoolGi(n int, syncVars ...GPUVars) { if UseGPU { - RunLayerGiGPU(n) + RunPoolGiGPU(n) RunDone(syncVars...) } else { - RunLayerGiCPU(n) + RunPoolGiCPU(n) } } -// RunSendSpike runs the SendSpike kernel with given number of elements, +// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a +// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a // single run-and-sync case. -func RunSendSpike(n int) { +func RunCycleNeuron(n int) { if UseGPU { - RunSendSpikeGPU(n) + RunCycleNeuronGPU(n) } else { - RunSendSpikeCPU(n) + RunCycleNeuronCPU(n) } } -// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. -func RunSendSpikeGPU(n int) { +// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. +func RunCycleNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["SendSpike"] + pl := sy.ComputePipelines["CycleNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunSendSpikeCPU runs the SendSpike kernel on the CPU. -func RunSendSpikeCPU(n int) { - gpu.VectorizeFunc(0, n, SendSpike) +// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. +func RunCycleNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, CycleNeuron) } -// RunOneSendSpike runs the SendSpike kernel with given number of elements, +// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneSendSpike(n int, syncVars ...GPUVars) { +func RunOneCycleNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunSendSpikeGPU(n) + RunCycleNeuronGPU(n) RunDone(syncVars...) } else { - RunSendSpikeCPU(n) + RunCycleNeuronCPU(n) } } -// RunCyclePost runs the CyclePost kernel with given number of elements, +// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a +// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a // single run-and-sync case. -func RunCyclePost(n int) { +func RunApplyExtsNeuron(n int) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. -func RunCyclePostGPU(n int) { +// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. 
See [RunApplyExtsNeuron] for more info. +func RunApplyExtsNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CyclePost"] + pl := sy.ComputePipelines["ApplyExtsNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCyclePostCPU runs the CyclePost kernel on the CPU. -func RunCyclePostCPU(n int) { - gpu.VectorizeFunc(0, n, CyclePost) +// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. +func RunApplyExtsNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, ApplyExtsNeuron) } -// RunOneCyclePost runs the CyclePost kernel with given number of elements, +// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCyclePost(n int, syncVars ...GPUVars) { +func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) RunDone(syncVars...) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a // single run-and-sync case. -func RunMinusPhaseNeuron(n int) { +func RunPlusPhaseStartNeuron(n int) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunPlusPhaseStartNeuronGPU(n) } else { - RunMinusPhaseNeuronCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. -func RunMinusPhaseNeuronGPU(n int) { +// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. +func RunPlusPhaseStartNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhaseNeuron"] + pl := sy.ComputePipelines["PlusPhaseStartNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. -func RunMinusPhaseNeuronCPU(n int) { - gpu.VectorizeFunc(0, n, MinusPhaseNeuron) -} +// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. +func RunPlusPhaseStartNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseStartNeuron) +} -// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. 
-func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunPlusPhaseStartNeuronGPU(n) RunDone(syncVars...) } else { - RunMinusPhaseNeuronCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseStartNeuron(n int) { +func RunPlusPhasePool(n int) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunPlusPhasePoolGPU(n) } else { - RunPlusPhaseStartNeuronCPU(n) + RunPlusPhasePoolCPU(n) } } -// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. -func RunPlusPhaseStartNeuronGPU(n int) { +// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. +func RunPlusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseStartNeuron"] + pl := sy.ComputePipelines["PlusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. -func RunPlusPhaseStartNeuronCPU(n int) { - gpu.VectorizeFunc(0, n, PlusPhaseStartNeuron) +// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. +func RunPlusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhasePool) } -// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { +func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunPlusPhasePoolGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseStartNeuronCPU(n) + RunPlusPhasePoolCPU(n) } } -// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a // single run-and-sync case. 
-func RunApplyExtsNeuron(n int) { +func RunPlusPhaseNeuron(n int) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunPlusPhaseNeuronGPU(n) } else { - RunApplyExtsNeuronCPU(n) + RunPlusPhaseNeuronCPU(n) } } -// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. See [RunApplyExtsNeuron] for more info. -func RunApplyExtsNeuronGPU(n int) { +// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. +func RunPlusPhaseNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["ApplyExtsNeuron"] + pl := sy.ComputePipelines["PlusPhaseNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. -func RunApplyExtsNeuronCPU(n int) { - gpu.VectorizeFunc(0, n, ApplyExtsNeuron) +// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. +func RunPlusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseNeuron) } -// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunPlusPhaseNeuronGPU(n) RunDone(syncVars...) } else { - RunApplyExtsNeuronCPU(n) + RunPlusPhaseNeuronCPU(n) } } -// RunBetweenGi runs the BetweenGi kernel with given number of elements, +// RunLayerGi runs the LayerGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneBetweenGi call does Run and Done for a +// Alternatively, a single-shot RunOneLayerGi call does Run and Done for a // single run-and-sync case. -func RunBetweenGi(n int) { +func RunLayerGi(n int) { if UseGPU { - RunBetweenGiGPU(n) + RunLayerGiGPU(n) } else { - RunBetweenGiCPU(n) + RunLayerGiCPU(n) } } -// RunBetweenGiGPU runs the BetweenGi kernel on the GPU. See [RunBetweenGi] for more info. -func RunBetweenGiGPU(n int) { +// RunLayerGiGPU runs the LayerGi kernel on the GPU. See [RunLayerGi] for more info. +func RunLayerGiGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["BetweenGi"] + pl := sy.ComputePipelines["LayerGi"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunBetweenGiCPU runs the BetweenGi kernel on the CPU. -func RunBetweenGiCPU(n int) { - gpu.VectorizeFunc(0, n, BetweenGi) +// RunLayerGiCPU runs the LayerGi kernel on the CPU. +func RunLayerGiCPU(n int) { + gpu.VectorizeFunc(0, n, LayerGi) } -// RunOneBetweenGi runs the BetweenGi kernel with given number of elements, +// RunOneLayerGi runs the LayerGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. 
If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneBetweenGi(n int, syncVars ...GPUVars) { +func RunOneLayerGi(n int, syncVars ...GPUVars) { if UseGPU { - RunBetweenGiGPU(n) + RunLayerGiGPU(n) RunDone(syncVars...) } else { - RunBetweenGiCPU(n) + RunLayerGiCPU(n) } } -// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneWtFromDWtSyn call does Run and Done for a // single run-and-sync case. -func RunCycleNeuron(n int) { +func RunWtFromDWtSyn(n int) { if UseGPU { - RunCycleNeuronGPU(n) + RunWtFromDWtSynGPU(n) } else { - RunCycleNeuronCPU(n) + RunWtFromDWtSynCPU(n) } } -// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. -func RunCycleNeuronGPU(n int) { +// RunWtFromDWtSynGPU runs the WtFromDWtSyn kernel on the GPU. See [RunWtFromDWtSyn] for more info. +func RunWtFromDWtSynGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CycleNeuron"] + pl := sy.ComputePipelines["WtFromDWtSyn"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. -func RunCycleNeuronCPU(n int) { - gpu.VectorizeFunc(0, n, CycleNeuron) +// RunWtFromDWtSynCPU runs the WtFromDWtSyn kernel on the CPU. +func RunWtFromDWtSynCPU(n int) { + gpu.VectorizeFunc(0, n, WtFromDWtSyn) } -// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunOneWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCycleNeuron(n int, syncVars ...GPUVars) { +func RunOneWtFromDWtSyn(n int, syncVars ...GPUVars) { if UseGPU { - RunCycleNeuronGPU(n) + RunWtFromDWtSynGPU(n) RunDone(syncVars...) } else { - RunCycleNeuronCPU(n) + RunWtFromDWtSynCPU(n) } } -// RunCycleInc runs the CycleInc kernel with given number of elements, +// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneCycleInc call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a // single run-and-sync case. -func RunCycleInc(n int) { +func RunMinusPhasePool(n int) { if UseGPU { - RunCycleIncGPU(n) + RunMinusPhasePoolGPU(n) } else { - RunCycleIncCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunCycleIncGPU runs the CycleInc kernel on the GPU. See [RunCycleInc] for more info. 
-func RunCycleIncGPU(n int) { +// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. +func RunMinusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CycleInc"] + pl := sy.ComputePipelines["MinusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCycleIncCPU runs the CycleInc kernel on the CPU. -func RunCycleIncCPU(n int) { - gpu.VectorizeFunc(0, n, CycleInc) +// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. +func RunMinusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhasePool) } -// RunOneCycleInc runs the CycleInc kernel with given number of elements, +// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCycleInc(n int, syncVars ...GPUVars) { +func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { if UseGPU { - RunCycleIncGPU(n) + RunMinusPhasePoolGPU(n) RunDone(syncVars...) } else { - RunCycleIncCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a // single run-and-sync case. -func RunPlusPhasePool(n int) { +func RunMinusPhaseNeuron(n int) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunMinusPhaseNeuronGPU(n) } else { - RunPlusPhasePoolCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. -func RunPlusPhasePoolGPU(n int) { +// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. +func RunMinusPhaseNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhasePool"] + pl := sy.ComputePipelines["MinusPhaseNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. -func RunPlusPhasePoolCPU(n int) { - gpu.VectorizeFunc(0, n, PlusPhasePool) +// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. +func RunMinusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhaseNeuron) } -// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. 
-func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { +func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunMinusPhaseNeuronGPU(n) RunDone(syncVars...) } else { - RunPlusPhasePoolCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a +// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a // single run-and-sync case. -func RunDWtFromDiSyn(n int) { +func RunDWtSubMeanPath(n int) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } -// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. -func RunDWtFromDiSynGPU(n int) { +// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. +func RunDWtSubMeanPathGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtFromDiSyn"] + pl := sy.ComputePipelines["DWtSubMeanPath"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. -func RunDWtFromDiSynCPU(n int) { - gpu.VectorizeFunc(0, n, DWtFromDiSyn) +// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. +func RunDWtSubMeanPathCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSubMeanPath) } -// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { +func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) RunDone(syncVars...) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } -// RunPoolGi runs the PoolGi kernel with given number of elements, +// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a +// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a // single run-and-sync case. -func RunPoolGi(n int) { +func RunGatherSpikes(n int) { if UseGPU { - RunPoolGiGPU(n) + RunGatherSpikesGPU(n) } else { - RunPoolGiCPU(n) + RunGatherSpikesCPU(n) } } -// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. -func RunPoolGiGPU(n int) { +// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. 
See [RunGatherSpikes] for more info. +func RunGatherSpikesGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PoolGi"] + pl := sy.ComputePipelines["GatherSpikes"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPoolGiCPU runs the PoolGi kernel on the CPU. -func RunPoolGiCPU(n int) { - gpu.VectorizeFunc(0, n, PoolGi) +// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. +func RunGatherSpikesCPU(n int) { + gpu.VectorizeFunc(0, n, GatherSpikes) } -// RunOnePoolGi runs the PoolGi kernel with given number of elements, +// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePoolGi(n int, syncVars ...GPUVars) { +func RunOneGatherSpikes(n int, syncVars ...GPUVars) { if UseGPU { - RunPoolGiGPU(n) + RunGatherSpikesGPU(n) RunDone(syncVars...) } else { - RunPoolGiCPU(n) + RunGatherSpikesCPU(n) } } -// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a // single run-and-sync case. -func RunMinusPhasePool(n int) { +func RunCyclePost(n int) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunCyclePostGPU(n) } else { - RunMinusPhasePoolCPU(n) + RunCyclePostCPU(n) } } -// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. -func RunMinusPhasePoolGPU(n int) { +// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. +func RunCyclePostGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhasePool"] + pl := sy.ComputePipelines["CyclePost"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. -func RunMinusPhasePoolCPU(n int) { - gpu.VectorizeFunc(0, n, MinusPhasePool) +// RunCyclePostCPU runs the CyclePost kernel on the CPU. +func RunCyclePostCPU(n int) { + gpu.VectorizeFunc(0, n, CyclePost) } -// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunOneCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { +func RunOneCyclePost(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunCyclePostGPU(n) RunDone(syncVars...) 
} else { - RunMinusPhasePoolCPU(n) + RunCyclePostCPU(n) } } -// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunDWtSyn runs the DWtSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseNeuron(n int) { +func RunDWtSyn(n int) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunDWtSynGPU(n) } else { - RunPlusPhaseNeuronCPU(n) + RunDWtSynCPU(n) } } -// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. -func RunPlusPhaseNeuronGPU(n int) { +// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. +func RunDWtSynGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseNeuron"] + pl := sy.ComputePipelines["DWtSyn"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. -func RunPlusPhaseNeuronCPU(n int) { - gpu.VectorizeFunc(0, n, PlusPhaseNeuron) +// RunDWtSynCPU runs the DWtSyn kernel on the CPU. +func RunDWtSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSyn) } -// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOneDWtSyn(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunDWtSynGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseNeuronCPU(n) + RunDWtSynCPU(n) } } // RunDone must be called after Run* calls to start compute kernels. @@ -920,6 +920,9 @@ func RunDone(syncVars ...GPUVars) { // ToGPU copies given variables to the GPU for the system. 
func ToGPU(vars ...GPUVars) { + if !UseGPU { + return + } sy := GPUSystem syVars := sy.Vars() for _, vr := range vars { diff --git a/axon/init-net.go b/axon/init-net.go index 623e3aba6..bec2bc8e0 100644 --- a/axon/init-net.go +++ b/axon/init-net.go @@ -45,9 +45,7 @@ func (nt *Network) NewState(mode enums.Enum, testing bool) { } ly.NewState(ctx) } - // if nt.GPU.On { - // nt.GPU.SyncStateGBufToGPU() - // } + ToGPULayers() } // InitWeights initializes synaptic weights and all other associated long-term state variables @@ -75,8 +73,10 @@ func (nt *Network) InitWeights() { //types:add } // dur := time.Now().Sub(st) // fmt.Printf("sym: %v\n", dur) + ToGPUAll() + ToGPU(SynapseTracesVar) // only time we call this + ToGPU(PathGBufVar, PathGSynsVar) // and this // nt.GPU.SyncAllToGPU() - // nt.GPU.SyncSynCaToGPU() // only time we call this // nt.GPU.SyncGBufToGPU() } diff --git a/axon/init-net.goal b/axon/init-net.goal index bc7608801..15d2ae780 100644 --- a/axon/init-net.goal +++ b/axon/init-net.goal @@ -42,9 +42,7 @@ func (nt *Network) NewState(mode enums.Enum, testing bool) { } ly.NewState(ctx) } - // if nt.GPU.On { - // nt.GPU.SyncStateGBufToGPU() - // } + ToGPULayers() } // InitWeights initializes synaptic weights and all other associated long-term state variables @@ -72,8 +70,10 @@ func (nt *Network) InitWeights() { //types:add } // dur := time.Now().Sub(st) // fmt.Printf("sym: %v\n", dur) + ToGPUAll() + ToGPU(SynapseTracesVar) // only time we call this + ToGPU(PathGBufVar, PathGSynsVar) // and this // nt.GPU.SyncAllToGPU() - // nt.GPU.SyncSynCaToGPU() // only time we call this // nt.GPU.SyncGBufToGPU() } diff --git a/axon/layer_test.go b/axon/layer_test.go index d0f6f1613..28005823e 100644 --- a/axon/layer_test.go +++ b/axon/layer_test.go @@ -117,7 +117,7 @@ func TestLayerToJson(t *testing.T) { // from net B. TODO: Would be better if we ran a cycle first, to get more variance. net := createNetwork(ctx, shape, t) hiddenLayer := net.LayerByName("Hidden") - net.Cycle() // run one cycle to make the weights more different + net.Cycle(1, true) // run one cycle to make the weights more different netC := createNetwork(ctxC, shape, t) hiddenLayerC := netC.LayerByName("Hidden") diff --git a/axon/looper.go b/axon/looper.go index 6510b9258..41a61c7d2 100644 --- a/axon/looper.go +++ b/axon/looper.go @@ -13,13 +13,15 @@ import ( // LooperStandard adds all the standard Axon Trial and Cycle level processing calls // to the given Looper Stacks. cycle and trial are the enums for the looper levels, // trainMode is the training mode enum value. +// - fastNCycles is the number of cycles to run in one chunk, when single-cycle iteration +// is not otherwise required (based on step level, netview update level). // - minus and plus phases of the theta cycle (trial), at plusStart (150) and plusEnd (199) cycles. // - embedded beta phases within theta, that record St1 and St2 states. // - net.Cycle() at every cycle step. // - net.DWt() and net.WtFromDWt() learning calls in training mode, with netview update // between these two calls if it is visible and viewing synapse variables. 
// - netview update calls at appropriate levels (no-op if no GUI) -func LooperStandard(ls *looper.Stacks, net *Network, viewFunc func() *netview.NetView, plusStart, plusEnd int, cycle, trial, trainMode enums.Enum) { +func LooperStandard(ls *looper.Stacks, net *Network, viewFunc func(mode enums.Enum) *NetViewUpdate, fastNCycles, plusStart, plusEnd int, cycle, trial, trainMode enums.Enum) { ls.AddOnStartToAll("SetContextMode", func(md, tm enums.Enum) { ctx := net.Context() ctx.Mode = int32(md.Int64()) @@ -39,34 +41,38 @@ func LooperStandard(ls *looper.Stacks, net *Network, viewFunc func() *netview.Ne ctx.NewPhase(true) net.PlusPhaseStart() }) - for m, st := range ls.Stacks { + for mode, st := range ls.Stacks { cycLoop := st.Loops[cycle] cycLoop.OnStart.Add("Cycle", func() { - // TODO: - // if ls.ModeStack().StepLevel == cycle { - // net.GPU.CycleByCycle = true - // } else { - // if viewupdt.IsCycleUpdating() { - // net.GPU.CycleByCycle = true - // } else { - // net.GPU.CycleByCycle = false - // } - // } - net.Cycle() + nCycles := 10 + getNeurons := false + if ls.ModeStack().StepLevel.Int64() == cycle.Int64() { + nCycles = 1 + getNeurons = true + } else if view := viewFunc(mode); view != nil { + if view.IsCycleUpdating() { + nCycles = 1 + getNeurons = true + } + } + net.Cycle(nCycles, getNeurons) + if nCycles > 1 { + cycLoop.Counter.Cur += nCycles - 1 + } }) trlLoop := st.Loops[trial] trlLoop.OnStart.Add("NewState", func() { - testing := m.Int64() != trainMode.Int64() - net.NewState(m, testing) + testing := mode.Int64() != trainMode.Int64() + net.NewState(mode, testing) }) trlLoop.OnEnd.Add("PlusPhase:End", func() { net.PlusPhase() }) - if m.Int64() == trainMode.Int64() { + if mode.Int64() == trainMode.Int64() { trlLoop.OnEnd.Add("UpdateWeights", func() { - net.DWt() - if view := viewFunc(); view != nil && view.IsViewingSynapse() { + net.DWt() // todo: need to get synapses here, not after + if view := viewFunc(mode); view != nil && view.IsViewingSynapse() { //TODO: // net.GPU.SyncSynapsesFromGPU() // net.GPU.SyncSynCaFromGPU() // note: only time we call this @@ -79,23 +85,24 @@ func LooperStandard(ls *looper.Stacks, net *Network, viewFunc func() *netview.Ne } // LooperUpdateNetView adds netview update calls to the given -// trial and cycle levels for given NetViewUpdate associated with given mode. -// The netviewCountersFunc returns the counters and other stats -// to display at the bottom of the NetView, and is passed the CountersString() -// for the given mode's [looper.Stack]. -func LooperUpdateNetView(ls *looper.Stacks, mode, cycle, trial enums.Enum, viewupdt *NetViewUpdate, countersFunc func(md, tm enums.Enum) string) { - st := ls.Stacks[mode] - cycLoop := st.Loops[cycle] - cycLoop.OnEnd.Add("GUI:UpdateNetView", func() { - counters := countersFunc(mode, cycle) - viewupdt.UpdateCycle(cycLoop.Counter.Cur, counters) - }) - trlLoop := st.Loops[trial] - trlLoop.OnEnd.Add("GUI:UpdateNetView", func() { - counters := countersFunc(mode, trial) - viewupdt.GoUpdate(counters) - }) - +// trial and cycle levels for given NetViewUpdate associated with the mode, +// returned by the given viewFunc function. +// The countersFunc returns the counters and other stats to display at the +// bottom of the NetView, based on given mode and level. 
+func LooperUpdateNetView(ls *looper.Stacks, cycle, trial enums.Enum, viewFunc func(mode enums.Enum) *NetViewUpdate, countersFunc func(mode, level enums.Enum) string) { + for mode, st := range ls.Stacks { + viewUpdt := viewFunc(mode) + cycLoop := st.Loops[cycle] + cycLoop.OnEnd.Add("GUI:UpdateNetView", func() { + counters := countersFunc(mode, cycle) + viewUpdt.UpdateCycle(cycLoop.Counter.Cur, counters) + }) + trlLoop := st.Loops[trial] + trlLoop.OnEnd.Add("GUI:UpdateNetView", func() { + counters := countersFunc(mode, trial) + viewUpdt.GoUpdate(counters) + }) + } } //////// NetViewUpdate diff --git a/axon/network.go b/axon/network.go index df4613510..b935aa7ba 100644 --- a/axon/network.go +++ b/axon/network.go @@ -131,6 +131,8 @@ type Network struct { //////// Params + // todo: rename LayParams -> LayerParams + // LayParams are all the layer parameters. [NLayers] LayParams []LayerParams `display:"-"` @@ -921,30 +923,56 @@ func ToGPUIndexes() { ToGPU(NetworkIxsVar, NeuronIxsVar, SynapseIxsVar, PathSendConVar, RecvPathIxsVar, PathRecvConVar, RecvSynIxsVar) } -// ToGPULayers copies all the layer-level state to the GPU. +// ToGPUCtxGlobal copies Context and Global vars to the GPU. +// This is done at start of each Cycle update. +func ToGPUCtxGlobal() { + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar) +} + +// todo: probably don't need PoolsIntVar beyond the first init + +// ToGPULayers copies all the layer-level state to the GPU, including context and globals. func ToGPULayers() { - ToGPU(CtxVar, NeuronsVar, NeuronAvgsVar, LayerStatesVar, PoolsVar, PoolsIntVar) + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar) } -// RunDoneLayers finishes running and copies all the layer-level state to the GPU. -func RunDoneLayers() { - RunDone(CtxVar, NeuronsVar, NeuronAvgsVar, LayerStatesVar, PoolsVar, PoolsIntVar) +// ToGPUNeurons copies Neurons, NeuronAvgs to the GPU. +func ToGPUNeurons() { + ToGPU(NeuronsVar, NeuronAvgsVar) +} + +// ToGPULayersNeurons copies all the layer-level and neuron state to the GPU. +func ToGPULayersNeurons() { + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar) } // ToGPUSynapses copies the Synapse state to the GPU. func ToGPUSynapses() { - ToGPU(SynapsesVar, SynapseTracesVar) + ToGPU(SynapsesVar) } -// ToGPUAll copies all state up to the GPU. +// ToGPUAll copies all state up to the GPU. Only for InitWeights. func ToGPUAll() { ToGPUIndexes() - ToGPU(CtxVar) ToGPUParams() - ToGPULayers() + ToGPULayersNeurons() ToGPUSynapses() } +// note: RunDone can only be run once, so all vars need to be present in the one call. + +// RunDoneLayers finishes running and copies all the layer-level state from the GPU, +// (and Context, Globals) but NOT neurons. This is the minimal case for Cycle(). +func RunDoneLayers() { + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar) +} + +// RunDoneLayersNeurons finishes running and copies all the layer-level +// and neuron state from the GPU, including context and globals. +func RunDoneLayersNeurons() { + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar) +} + // BuildPathGBuf builds the PathGBuf, PathGSyns, // based on the MaxDelay values in the PathParams, // which should have been configured by this point. 
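// Aside (illustration only, not part of the patch): how the new download helpers
// are meant to combine with the Run* wrappers. phaseStepSketch is hypothetical;
// nd and pd follow the usual NNeurons*NData and NPools*NData element counts.
// Because RunDone can only be issued once per batch, the single finishing call
// must name everything the CPU needs back.
func phaseStepSketch(nd, pd int, getNeurons bool) {
	RunMinusPhasePool(pd)   // queued into one GPU command submission
	RunMinusPhaseNeuron(nd) // (runs immediately on the CPU when UseGPU is false)
	if getNeurons {
		RunDoneLayersNeurons() // layer state plus Neurons/NeuronAvgs, e.g. for display
	} else {
		RunDoneLayers() // minimal layer-level state only
	}
}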
diff --git a/axon/network.goal b/axon/network.goal index b7f828087..6ad069362 100644 --- a/axon/network.goal +++ b/axon/network.goal @@ -129,6 +129,8 @@ type Network struct { //////// Params + // todo: rename LayParams -> LayerParams + // LayParams are all the layer parameters. [NLayers] LayParams []LayerParams `display:"-"` @@ -918,30 +920,56 @@ func ToGPUIndexes() { ToGPU(NetworkIxsVar, NeuronIxsVar, SynapseIxsVar, PathSendConVar, RecvPathIxsVar, PathRecvConVar, RecvSynIxsVar) } -// ToGPULayers copies all the layer-level state to the GPU. +// ToGPUCtxGlobal copies Context and Global vars to the GPU. +// This is done at start of each Cycle update. +func ToGPUCtxGlobal() { + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar) +} + +// todo: probably don't need PoolsIntVar beyond the first init + +// ToGPULayers copies all the layer-level state to the GPU, including context and globals. func ToGPULayers() { - ToGPU(CtxVar, NeuronsVar, NeuronAvgsVar, LayerStatesVar, PoolsVar, PoolsIntVar) + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar) } -// RunDoneLayers finishes running and copies all the layer-level state to the GPU. -func RunDoneLayers() { - RunDone(CtxVar, NeuronsVar, NeuronAvgsVar, LayerStatesVar, PoolsVar, PoolsIntVar) +// ToGPUNeurons copies Neurons, NeuronAvgs to the GPU. +func ToGPUNeurons() { + ToGPU(NeuronsVar, NeuronAvgsVar) +} + +// ToGPULayersNeurons copies all the layer-level and neuron state to the GPU. +func ToGPULayersNeurons() { + ToGPU(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar) } // ToGPUSynapses copies the Synapse state to the GPU. func ToGPUSynapses() { - ToGPU(SynapsesVar, SynapseTracesVar) + ToGPU(SynapsesVar) } -// ToGPUAll copies all state up to the GPU. +// ToGPUAll copies all state up to the GPU. Only for InitWeights. func ToGPUAll() { ToGPUIndexes() - ToGPU(CtxVar) ToGPUParams() - ToGPULayers() + ToGPULayersNeurons() ToGPUSynapses() } +// note: RunDone can only be run once, so all vars need to be present in the one call. + +// RunDoneLayers finishes running and copies all the layer-level state from the GPU, +// (and Context, Globals) but NOT neurons. This is the minimal case for Cycle(). +func RunDoneLayers() { + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar) +} + +// RunDoneLayersNeurons finishes running and copies all the layer-level +// and neuron state from the GPU, including context and globals. +func RunDoneLayersNeurons() { + RunDone(CtxVar, GlobalScalarsVar, GlobalVectorsVar, LayerStatesVar, PoolsVar, PoolsIntVar, NeuronsVar, NeuronAvgsVar) +} + // BuildPathGBuf builds the PathGBuf, PathGSyns, // based on the MaxDelay values in the PathParams, // which should have been configured by this point. diff --git a/axon/params.go b/axon/params.go index 026431269..9a10a4998 100644 --- a/axon/params.go +++ b/axon/params.go @@ -23,10 +23,10 @@ type PathSheets = params.Sheets[*PathParams] type Params struct { // Layer has the parameters to apply to the [LayerParams] for layers. - Layer LayerSheets + Layer LayerSheets `display:"-"` // Path has the parameters to apply to the [PathParams] for paths. - Path PathSheets + Path PathSheets `display:"-"` // ExtraSheets has optional additional sheets of parameters // to apply after the default Base sheet. 
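// Aside (illustration only): the upload side now has matching granularity.
// initStateSketch is hypothetical; it just restates which helper this patch
// uses where: everything at weight init, layer-level state at a new trial.
func initStateSketch(fullWeightInit bool) {
	if fullWeightInit {
		ToGPUAll()                       // indexes, params, layers+neurons, synapses
		ToGPU(SynapseTracesVar)          // uploaded only at this point
		ToGPU(PathGBufVar, PathGSynsVar) // spike-conductance buffers, likewise
		return
	}
	ToGPULayers() // new state: Context, Globals, and layer-level vars only
}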
diff --git a/axon/shaders/CycleNeuron.wgsl b/axon/shaders/CycleNeuron.wgsl index 8486f4a08..080c8c861 100644 --- a/axon/shaders/CycleNeuron.wgsl +++ b/axon/shaders/CycleNeuron.wgsl @@ -97,10 +97,9 @@ fn SetNeuronExtPosNeg(ctx: ptr, ni: u32,di: u32, val: f32) { } } fn LayerParams_CycleNeuron(ly: ptr, ctx: ptr, ni: u32,di: u32) { - var pi = LayerParams_PoolIndex(ly, NeuronIxs[IndexU322D(NeuronIxs[0], NeuronIxs[1], u32(NrnSubPool),u32(ni))]); - var lpi = LayerParams_PoolIndex(ly, u32(u32(0))); + var pi = LayerParams_PoolIndex(ly, NeuronIxs[IndexU322D(NeuronIxs[0], NeuronIxs[1], + u32(NrnSubPool),u32(ni))]); LayerParams_GInteg(ly, ctx, pi, ni, di); - LayerParams_SpikeFromG(ly, ctx, lpi, ni, di); } fn LayerParams_PulvinarDriver(ly: ptr, ctx: ptr, lni: u32,di: u32, drvGe: ptr,nonDrivePct: ptr) { var dli = u32((*ly).Pulv.DriveLayIndex); @@ -125,8 +124,8 @@ fn LayerParams_GInteg(ly: ptr, ctx: ptr, LayerParams_SpecialPostGs(ly, ctx, ni, di, saveVal); } fn LayerParams_SpecialPreGs(ly: ptr, ctx: ptr, pi: u32,ni: u32,di: u32, drvGe: f32, nonDrivePct: f32) -> f32 { - var saveVal = f32(0); // sometimes we need to use a value computed here, for the post Gs step - var pi0 = pi - 1; // 0-n pool index + var saveVal = f32(0); // sometimes we need to use a value computed here, for the post Gs step + var pi0 = pi - (*ly).PoolSt - 1; // 0-n pool index var pnn = u32(PoolNNeurons(pi)); var pni = NeuronIxs[IndexU322D(NeuronIxs[0], NeuronIxs[1], u32(NrnNeurIndex),u32(ni))] - u32(PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(PoolNeurSt),u32(pi),u32(di))]); var nrnCtxtGe = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CtxtGe),u32(ni),u32(di))]; @@ -360,30 +359,6 @@ fn LayerParams_GNeuroMod(ly: ptr, ctx: ptr, ctx: ptr, lpi: u32,ni: u32,di: u32) { - ActParams_VmFromG(&(*ly).Acts, ctx, ni, di); - ActParams_SpikeFromVm(&(*ly).Acts, ctx, ni, di); - LearnNeurParams_CaFromSpike(&(*ly).Learn, ctx, ni, di); - var lmax = PoolAvgMax(AMGeInt, AMCycle, Max, lpi, di); - if (lmax > 0) { - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(GeIntNorm),u32(ni),u32(di))] = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(GeInt),u32(ni),u32(di))] / lmax; - } else { - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(GeIntNorm),u32(ni),u32(di))] = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(GeInt),u32(ni),u32(di))]; - } - if ((*ctx).Cycle >= (*ly).Acts.Dt.MaxCycStart) { - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkMaxCa),u32(ni),u32(di))] += (*ly).Learn.CaSpk.Dt.PDt * (Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkM),u32(ni),u32(di))] - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkMaxCa),u32(ni),u32(di))]); - var spkmax = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkMaxCa),u32(ni),u32(di))]; - if (spkmax > Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkMax),u32(ni),u32(di))]) { - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkMax),u32(ni),u32(di))] = spkmax; - } - } - var spk = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spike),u32(ni),u32(di))]; - if (spk > 0) { - var spksper = (*ctx).ThetaCycles / 8; - var bin = min((*ctx).Cycle/spksper, 7); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SpkBin0 + NeuronVars(bin)),u32(ni),u32(di))] += spk; - } -} ///////////// import: "act-net.go" fn CycleNeuron(i: u32) { //gosl:kernel @@ -436,23 +411,6 @@ struct SpikeParams { RDt: f32, pad: i32, } -fn SpikeParams_ActFromISI(sk: ptr, isi: f32,timeInc: 
f32,integ: f32) -> f32 { - if (isi <= 0) { - return f32(0); - } - var maxInt = 1.0 / (timeInc * integ * (*sk).MaxHz); // interval at max hz.. -return maxInt / isi; // normalized -} -fn SpikeParams_AvgFromISI(sk: ptr, avg: f32, isi: f32) -> f32 { - var av = avg; - if (av <= 0) { - av = isi; - } else if (isi < 0.8*av) { - av = isi; // if significantly less than we take that - } else { // integrate on slower - av += (*sk).ISIDt * (isi - av); // running avg updt - }return av; -} struct DendParams { GbarExp: f32, GbarR: f32, @@ -759,131 +717,6 @@ fn ActParams_GiFromSyn(ac: ptr, ctx: ptr, return f32(0); }return giSyn; } -fn ActParams_InetFromG(ac: ptr, vm: f32,ge: f32,gl: f32,gi: f32,gk: f32) -> f32 { - var inet = ge*((*ac).Erev.E-vm) + gl*(*ac).Gbar.L*((*ac).Erev.L-vm) + gi*((*ac).Erev.I-vm) + gk*((*ac).Erev.K-vm); - if (inet > (*ac).Dt.VmTau) { - inet = (*ac).Dt.VmTau; - } else if (inet < -(*ac).Dt.VmTau) { - inet = -(*ac).Dt.VmTau; - }return inet; -} -fn ActParams_VmFromInet(ac: ptr, vm: f32,dt: f32,inet: f32) -> f32 { - return F32_ClipValue(&(*ac).VmRange, vm + dt*inet); -} -fn ActParams_VmInteg(ac: ptr, vm: f32,dt: f32,ge: f32,gl: f32,gi: f32,gk: f32, nvm: ptr,inet: ptr) { - var dtEff = dt * (*ac).Dt.DtStep; - *nvm = vm; - for (var i = i32(0); i < (*ac).Dt.VmSteps; i++) { - *inet = ActParams_InetFromG(ac, *nvm, ge, gl, gi, gk); - *nvm = ActParams_VmFromInet(ac, *nvm, dtEff, *inet); - } -} -fn ActParams_VmFromG(ac: ptr, ctx: ptr, ni: u32,di: u32) { - var updtVm = true; - var isi = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(ISI),u32(ni),u32(di))]; - if ((*ac).Spikes.Tr > 0 && isi >= 0 && isi < f32((*ac).Spikes.Tr)) { - updtVm = false; // don't update the spiking vm during refract - } - var ge = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Ge),u32(ni),u32(di))] * (*ac).Gbar.E; - var gi = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Gi),u32(ni),u32(di))] * (*ac).Gbar.I; - var gk = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Gk),u32(ni),u32(di))] * (*ac).Gbar.K; - var nvm: f32; - var inet: f32; - var expi: f32; - if (updtVm) { - ActParams_VmInteg(ac, Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))], (*ac).Dt.VmDt, ge, f32(f32(1)), gi, gk, &nvm, &inet); - if (updtVm && (*ac).Spikes.Exp == 1) { // add spike current if relevant - var exVm: f32; - exVm = 0.5 * (nvm + Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[ // midpoint for this - 2], u32(Vm),u32(ni),u32(di))]); - expi = (*ac).Gbar.L * (*ac).Spikes.ExpSlope * - FastExp((exVm-(*ac).Spikes.Thr)/(*ac).Spikes.ExpSlope); - if (expi > (*ac).Dt.VmTau) { - expi = (*ac).Dt.VmTau; - } - inet += expi; - nvm = ActParams_VmFromInet(ac, nvm, (*ac).Dt.VmDt, expi); - } - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))] = nvm; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Inet),u32(ni),u32(di))] = inet; - } else { // decay back to VmR - var dvm: f32; - if (i32(isi) == (*ac).Spikes.Tr-1) { - dvm = (*ac).Spikes.VmR - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))]; - } else { - dvm = (*ac).Spikes.RDt * ((*ac).Spikes.VmR - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))]); - } - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))] += dvm; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Inet),u32(ni),u32(di))] = dvm * (*ac).Dt.VmTau; - } - var glEff = f32(1); - if (!updtVm) { - glEff += (*ac).Dend.GbarR; - } - var giEff: 
f32; - giEff = gi + (*ac).Gbar.I*Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(SSGiDend),u32(ni),u32(di))]; - ActParams_VmInteg(ac, Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VmDend),u32(ni),u32(di))], (*ac).Dt.VmDendDt, ge, glEff, giEff, gk, &nvm, &inet); - if (updtVm) { - nvm = ActParams_VmFromInet(ac, nvm, (*ac).Dt.VmDendDt, (*ac).Dend.GbarExp*expi); - } - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], - u32(VmDend),u32(ni),u32(di))] = nvm; -} -fn ActParams_SpikeFromVmVars(ac: ptr, nrnISI: ptr,nrnISIAvg: ptr,nrnSpike: ptr,nrnSpiked: ptr,nrnAct: ptr, nrnVm: f32) { - var thr: f32; - if ((*ac).Spikes.Exp == 1) { - thr = (*ac).Spikes.ExpThr; - } else { - thr = (*ac).Spikes.Thr; - } - if (nrnVm >= thr) { - *nrnSpike = f32(1); - if (*nrnISIAvg == -1) { - *nrnISIAvg = f32(-2); - } else if (*nrnISI > 0) { // must have spiked to update - *nrnISIAvg = SpikeParams_AvgFromISI(&(*ac).Spikes, *nrnISIAvg, *nrnISI+1); - } - *nrnISI = f32(0); - } else { - *nrnSpike = f32(0); - if (*nrnISI >= 0) { - *nrnISI += f32(1); - if (*nrnISI < 10) { - *nrnSpiked = f32(1); - } else { - *nrnSpiked = f32(0); - } - if (*nrnISI > 200) { // keep from growing infinitely large - *nrnISI = f32(-1); - } - } else { - *nrnSpiked = f32(0); - } - if (*nrnISIAvg >= 0 && *nrnISI > 0 && *nrnISI > 1.2**nrnISIAvg) { - *nrnISIAvg = SpikeParams_AvgFromISI(&(*ac).Spikes, *nrnISIAvg, *nrnISI); - } - } - var nwAct = SpikeParams_ActFromISI(&(*ac).Spikes, *nrnISIAvg, f32(.001), (*ac).Dt.Integ); - if (nwAct > 1) { - nwAct = f32(1); - } - nwAct = *nrnAct + (*ac).Dt.VmDt*(nwAct-*nrnAct); - *nrnAct = nwAct; -} -fn ActParams_SpikeFromVm(ac: ptr, ctx: ptr, ni: u32,di: u32) { - var nrnISI = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(ISI),u32(ni),u32(di))]; - var nrnISIAvg = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(ISIAvg),u32(ni),u32(di))]; - var nrnSpike = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spike),u32(ni),u32(di))]; - var nrnSpiked = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spiked),u32(ni),u32(di))]; - var nrnAct = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Act),u32(ni),u32(di))]; - var nrnVm = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Vm),u32(ni),u32(di))]; - ActParams_SpikeFromVmVars(ac, &nrnISI, &nrnISIAvg, &nrnSpike, &nrnSpiked, &nrnAct, nrnVm); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(ISI),u32(ni),u32(di))] = nrnISI; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(ISIAvg),u32(ni),u32(di))] = nrnISIAvg; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spike),u32(ni),u32(di))] = nrnSpike; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spiked),u32(ni),u32(di))] = nrnSpiked; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Act),u32(ni),u32(di))] = nrnAct; -} ///////////// import: "chans-ak.go" struct AKsParams { @@ -1436,11 +1269,6 @@ struct CaDtParams { //types:add pad: i32, pad1: i32, } -fn CaDtParams_FromCa(kp: ptr, ca: f32, caM: ptr,caP: ptr,caD: ptr) { - *caM += (*kp).MDt * (ca - *caM); - *caP += (*kp).PDt * (*caM - *caP); - *caD += (*kp).DDt * (*caP - *caD); -} struct NeurCaParams { SpikeG: f32, SynTau: f32, @@ -1448,11 +1276,6 @@ struct NeurCaParams { pad: i32, Dt: CaDtParams, } -fn NeurCaParams_CaFromSpike(np: ptr, spike: f32, caSyn: ptr,caM: ptr,caP: ptr,caD: ptr) { - var nsp = (*np).SpikeG * spike; - *caSyn += (*np).SynDt * (nsp - *caSyn); - CaDtParams_FromCa(&(*np).Dt, nsp, caM, caP, caD); -} struct SynCaParams { 
//types:add CaScale: f32, pad: i32, @@ -1589,20 +1412,6 @@ struct CaLrnParams { NormInv: f32, pad: i32, } -fn CaLrnParams_VgccCaFromSpike(np: ptr, ctx: ptr, ni: u32,di: u32) { - if ((*np).SpkVGCC == 1) { - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VgccCa),u32(ni),u32(di))] = (*np).SpkVgccCa * Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spike),u32(ni),u32(di))]; - } - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VgccCaInt),u32(ni),u32(di))] += Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VgccCa),u32(ni),u32(di))] - (*np).VgccDt*Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VgccCaInt),u32(ni),u32(di))]; -} -fn CaLrnParams_CaLrns(np: ptr, ctx: ptr, ni: u32,di: u32) { - CaLrnParams_VgccCaFromSpike(np, ctx, ni, di); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaLrn),u32(ni),u32(di))] = (*np).NormInv * (Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NmdaCa),u32(ni),u32(di))] + Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(VgccCaInt),u32(ni),u32(di))]); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaM),u32(ni),u32(di))] += (*np).Dt.MDt * (Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaLrn),u32(ni),u32(di))] - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaM),u32(ni),u32(di))]); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaP),u32(ni),u32(di))] += (*np).Dt.PDt * (Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaM),u32(ni),u32(di))] - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaP),u32(ni),u32(di))]); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaD),u32(ni),u32(di))] += (*np).Dt.DDt * (Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaP),u32(ni),u32(di))] - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaD),u32(ni),u32(di))]); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaDiff),u32(ni),u32(di))] = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaP),u32(ni),u32(di))] - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(NrnCaD),u32(ni),u32(di))]; -} struct TrgAvgActParams { GiBaseInit: f32, RescaleOn: i32, @@ -1639,17 +1448,6 @@ fn LearnNeurParams_LrnNMDAFromRaw(ln: ptr, ctx: ptr, ctx: ptr, ni: u32,di: u32) { - var caSyn: f32; - var caSpkM = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkM),u32(ni),u32(di))]; - var caSpkP = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkP),u32(ni),u32(di))]; - var caSpkD = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkD),u32(ni),u32(di))]; - NeurCaParams_CaFromSpike(&(*ln).CaSpk, Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Spike),u32(ni),u32(di))], &caSyn, &caSpkM, &caSpkP, &caSpkD); - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkM),u32(ni),u32(di))] = caSpkM; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkP),u32(ni),u32(di))] = caSpkP; - Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkD),u32(ni),u32(di))] = caSpkD; - CaLrnParams_CaLrns(&(*ln).CaLearn, ctx, ni, di); -} struct SWtInitParams { SPct: f32, Mean: f32, @@ -1744,14 +1542,6 @@ struct F32 { pad: i32, pad1: i32, // for gpu use } -fn F32_ClipValue(mr: ptr, val: f32) -> f32 { - if (val < (*mr).Min) { - return (*mr).Min; - } - if (val > (*mr).Max) { - return (*mr).Max; - }return val; -} ///////////// import: "network.go" struct NetworkIndexes { diff --git a/axon/shaders/SendSpike.wgsl 
b/axon/shaders/SendSpike.wgsl index a6df3c3db..97cc3369e 100644 --- a/axon/shaders/SendSpike.wgsl +++ b/axon/shaders/SendSpike.wgsl @@ -94,7 +94,7 @@ fn LayerParams_SendSpike(ly: ptr, ctx: ptr, ctx: ptr, lpi: u32,pi: u32,ni: u32,di: u32) { Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(Burst),u32(ni),u32(di))] = Neurons[IndexF323D(Neurons[0], Neurons[1], Neurons[2], u32(CaSpkP),u32(ni),u32(di))]; var li = (*ly).Index; - var pi0 = pi - 1; // 0-n pool index + var pi0 = pi - (*ly).PoolSt - 1; // 0-n pool index var pnn = u32(PoolNNeurons(pi)); var pni = NeuronIxs[IndexU322D(NeuronIxs[0], NeuronIxs[1], u32(NrnNeurIndex),u32(ni))] - u32(PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(PoolNeurSt),u32(pi),u32(di))]); var hasRew = GlobalScalars[IndexF322D(GlobalScalars[0], GlobalScalars[1], u32(GvHasRew),u32(di))] > 0; diff --git a/axon/vars.go b/axon/vars.go index 71ebfc038..4c3da5082 100644 --- a/axon/vars.go +++ b/axon/vars.go @@ -126,6 +126,8 @@ var ( //gosl:dims 3 Pools *tensor.Float32 + // todo: following should be read-only + // PoolsInt are the [PoolIntVars] int32 state values for layer and sub-pool // inhibition, AvgMax atomic integration, and other vars: use [AvgMaxIntVarIndex] // [PoolIntVars+AvgMax][Layer * Pools][Data] diff --git a/examples/ra25/ra25.go b/examples/ra25/ra25.go index cd99f6acc..f530e0b98 100644 --- a/examples/ra25/ra25.go +++ b/examples/ra25/ra25.go @@ -202,7 +202,7 @@ type Sim struct { Net *axon.Network `new-window:"+" display:"no-inline"` // network parameter management - Params axon.Params `display:"add-fields"` + Params axon.Params // contains looper control loops for running sim Loops *looper.Stacks `new-window:"+" display:"no-inline"` @@ -374,6 +374,14 @@ func (ss *Sim) CurrentMode() Modes { return md } +// NetViewUpdater returns the NetViewUpdate for given mode. +func (ss *Sim) NetViewUpdater(mode enums.Enum) *axon.NetViewUpdate { + if mode.Int64() == Train.Int64() { + return &ss.TrainUpdate + } + return &ss.TestUpdate +} + // ConfigLoops configures the control loops: Training, Testing func (ss *Sim) ConfigLoops() { ls := looper.NewStacks() @@ -391,7 +399,7 @@ func (ss *Sim) ConfigLoops() { AddLevelIncr(Trial, trls, ss.Config.Run.NData). 
AddLevel(Cycle, 200) - axon.LooperStandard(ls, ss.Net, ss.GUI.NetView, 150, 199, Cycle, Trial, Train) + axon.LooperStandard(ls, ss.Net, ss.NetViewUpdater, 10, 150, 199, Cycle, Trial, Train) ls.Stacks[Train].OnInit.Add("Init", func() { ss.Init() }) @@ -401,7 +409,8 @@ func (ss *Sim) ConfigLoops() { ls.Loop(Train, Run).OnStart.Add("NewRun", ss.NewRun) - ls.Loop(Train, Epoch).IsDone.AddBool("NZeroStop", func() bool { + trainEpoch := ls.Loop(Train, Epoch) + trainEpoch.IsDone.AddBool("NZeroStop", func() bool { stopNz := ss.Config.Run.NZero if stopNz <= 0 { return false @@ -413,8 +422,6 @@ func (ss *Sim) ConfigLoops() { return false }) - // Add Testing - trainEpoch := ls.Loop(Train, Epoch) trainEpoch.OnStart.Add("TestAtInterval", func() { if (ss.Config.Run.TestInterval > 0) && ((trainEpoch.Counter.Cur+1)%ss.Config.Run.TestInterval == 0) { ss.TestAll() @@ -426,10 +433,6 @@ func (ss *Sim) ConfigLoops() { ls.AddOnStartToAll("StatsStart", ss.StatsStart) ls.AddOnEndToAll("StatsStep", ss.StatsStep) - // ls.Loop(Test, Epoch).OnEnd.Add("LogTestErrors", func() { - // axon.LogTestErrors(&ss.Logs) - // }) - // Save weights to file, to look at later ls.Loop(Train, Run).OnEnd.Add("SaveWeights", func() { ctrString := fmt.Sprintf("%03d_%05d", ls.Loop(Train, Run).Counter.Cur, ls.Loop(Train, Epoch).Counter.Cur) @@ -439,8 +442,7 @@ func (ss *Sim) ConfigLoops() { //////// GUI if ss.Config.GUI { - axon.LooperUpdateNetView(ls, Train, Cycle, Trial, &ss.TrainUpdate, ss.StatCounters) - axon.LooperUpdateNetView(ls, Test, Cycle, Trial, &ss.TestUpdate, ss.StatCounters) + axon.LooperUpdateNetView(ls, Cycle, Trial, ss.NetViewUpdater, ss.StatCounters) ls.Stacks[Train].OnInit.Add("GUI-Init", func() { ss.GUI.UpdateWindow() }) ls.Stacks[Test].OnInit.Add("GUI-Init", func() { ss.GUI.UpdateWindow() })
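// Aside (illustration only): what the new fastNCycles argument amounts to when
// driving the network directly, outside the looper. runCyclesSketch is
// hypothetical; 200 matches the AddLevel(Cycle, 200) theta-cycle length above.
func runCyclesSketch(net *axon.Network, cycleView bool) {
	chunk := 10 // fastNCycles: sync only layer-level state after each chunk
	if cycleView {
		chunk = 1 // per-cycle display needs neuron state every cycle
	}
	for cyc := 0; cyc < 200; cyc += chunk {
		net.Cycle(chunk, cycleView)
	}
}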