diff --git a/axon/gosl.go b/axon/gosl.go index d154060a8..8d9def8c2 100644 --- a/axon/gosl.go +++ b/axon/gosl.go @@ -61,24 +61,24 @@ func GPUInit() { { sy := gpu.NewComputeSystem(gp, "Default") GPUSystem = sy - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/BetweenGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PoolGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleInc.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/LayerGi.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/CyclePost.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/ApplyExtsNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtFromDiSyn.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/DWtSubMeanPath.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/GatherSpikes.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/WtFromDWtSyn.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PoolGi.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhaseNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhaseStartNeuron.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/MinusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/BetweenGi.wgsl", sy) gpu.NewComputePipelineShaderFS(shaders, "shaders/CycleNeuron.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/SendSpike.wgsl", sy) - gpu.NewComputePipelineShaderFS(shaders, "shaders/PlusPhasePool.wgsl", sy) + gpu.NewComputePipelineShaderFS(shaders, "shaders/CyclePost.wgsl", sy) vars := sy.Vars() { sgp := vars.AddGroup(gpu.Storage) @@ -146,274 +146,172 @@ func GPURelease() { ComputeGPU.Release() } -// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// Can call multiple Run* kernels in a row, which are then all launched -// in the same command submission on the GPU, which is by far the most efficient. -// MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a -// single run-and-sync case. -func RunDWtSubMeanPath(n int) { - if UseGPU { - RunDWtSubMeanPathGPU(n) - } else { - RunDWtSubMeanPathCPU(n) - } -} - -// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. -func RunDWtSubMeanPathGPU(n int) { - sy := GPUSystem - pl := sy.ComputePipelines["DWtSubMeanPath"] - ce, _ := sy.BeginComputePass() - pl.Dispatch1D(ce, n, 64) -} - -// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. 
-func RunDWtSubMeanPathCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtSubMeanPath(uint32(i)) - } -} - -// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// This version then calls RunDone with the given variables to sync -// after the Run, for a single-shot Run-and-Done call. If multiple kernels -// can be run in sequence, it is much more efficient to do multiple Run* -// calls followed by a RunDone call. -func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { - if UseGPU { - RunDWtSubMeanPathGPU(n) - RunDone(syncVars...) - } else { - RunDWtSubMeanPathCPU(n) - } -} -// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// Can call multiple Run* kernels in a row, which are then all launched -// in the same command submission on the GPU, which is by far the most efficient. -// MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a -// single run-and-sync case. -func RunGatherSpikes(n int) { - if UseGPU { - RunGatherSpikesGPU(n) - } else { - RunGatherSpikesCPU(n) - } -} - -// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. See [RunGatherSpikes] for more info. -func RunGatherSpikesGPU(n int) { - sy := GPUSystem - pl := sy.ComputePipelines["GatherSpikes"] - ce, _ := sy.BeginComputePass() - pl.Dispatch1D(ce, n, 64) -} - -// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. -func RunGatherSpikesCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - GatherSpikes(uint32(i)) - } -} - -// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, -// on either the CPU or GPU depending on the UseGPU variable. -// This version then calls RunDone with the given variables to sync -// after the Run, for a single-shot Run-and-Done call. If multiple kernels -// can be run in sequence, it is much more efficient to do multiple Run* -// calls followed by a RunDone call. -func RunOneGatherSpikes(n int, syncVars ...GPUVars) { - if UseGPU { - RunGatherSpikesGPU(n) - RunDone(syncVars...) - } else { - RunGatherSpikesCPU(n) - } -} -// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunPoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a +// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a // single run-and-sync case. -func RunCycleNeuron(n int) { +func RunPoolGi(n int) { if UseGPU { - RunCycleNeuronGPU(n) + RunPoolGiGPU(n) } else { - RunCycleNeuronCPU(n) + RunPoolGiCPU(n) } } -// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. -func RunCycleNeuronGPU(n int) { +// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. 
+func RunPoolGiGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CycleNeuron"] + pl := sy.ComputePipelines["PoolGi"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. -func RunCycleNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CycleNeuron(uint32(i)) - } +// RunPoolGiCPU runs the PoolGi kernel on the CPU. +func RunPoolGiCPU(n int) { + gpu.VectorizeFunc(0, n, PoolGi) } -// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, +// RunOnePoolGi runs the PoolGi kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCycleNeuron(n int, syncVars ...GPUVars) { +func RunOnePoolGi(n int, syncVars ...GPUVars) { if UseGPU { - RunCycleNeuronGPU(n) + RunPoolGiGPU(n) RunDone(syncVars...) } else { - RunCycleNeuronCPU(n) + RunPoolGiCPU(n) } } -// RunSendSpike runs the SendSpike kernel with given number of elements, +// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a // single run-and-sync case. -func RunSendSpike(n int) { +func RunMinusPhaseNeuron(n int) { if UseGPU { - RunSendSpikeGPU(n) + RunMinusPhaseNeuronGPU(n) } else { - RunSendSpikeCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. -func RunSendSpikeGPU(n int) { +// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. +func RunMinusPhaseNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["SendSpike"] + pl := sy.ComputePipelines["MinusPhaseNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunSendSpikeCPU runs the SendSpike kernel on the CPU. -func RunSendSpikeCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - SendSpike(uint32(i)) - } +// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. +func RunMinusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhaseNeuron) } -// RunOneSendSpike runs the SendSpike kernel with given number of elements, +// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneSendSpike(n int, syncVars ...GPUVars) { +func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunSendSpikeGPU(n) + RunMinusPhaseNeuronGPU(n) RunDone(syncVars...) 
} else { - RunSendSpikeCPU(n) + RunMinusPhaseNeuronCPU(n) } } -// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a // single run-and-sync case. -func RunPlusPhasePool(n int) { +func RunPlusPhaseStartNeuron(n int) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunPlusPhaseStartNeuronGPU(n) } else { - RunPlusPhasePoolCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. -func RunPlusPhasePoolGPU(n int) { +// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. +func RunPlusPhaseStartNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhasePool"] + pl := sy.ComputePipelines["PlusPhaseStartNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. -func RunPlusPhasePoolCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhasePool(uint32(i)) - } +// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. +func RunPlusPhaseStartNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseStartNeuron) } -// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhasePoolGPU(n) + RunPlusPhaseStartNeuronGPU(n) RunDone(syncVars...) } else { - RunPlusPhasePoolCPU(n) + RunPlusPhaseStartNeuronCPU(n) } } -// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a +// Alternatively, a single-shot RunOneDWtSubMeanPath call does Run and Done for a // single run-and-sync case. -func RunDWtFromDiSyn(n int) { +func RunDWtSubMeanPath(n int) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } -// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. 
-func RunDWtFromDiSynGPU(n int) { +// RunDWtSubMeanPathGPU runs the DWtSubMeanPath kernel on the GPU. See [RunDWtSubMeanPath] for more info. +func RunDWtSubMeanPathGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtFromDiSyn"] + pl := sy.ComputePipelines["DWtSubMeanPath"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. -func RunDWtFromDiSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtFromDiSyn(uint32(i)) - } +// RunDWtSubMeanPathCPU runs the DWtSubMeanPath kernel on the CPU. +func RunDWtSubMeanPathCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSubMeanPath) } -// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, +// RunOneDWtSubMeanPath runs the DWtSubMeanPath kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { +func RunOneDWtSubMeanPath(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtFromDiSynGPU(n) + RunDWtSubMeanPathGPU(n) RunDone(syncVars...) } else { - RunDWtFromDiSynCPU(n) + RunDWtSubMeanPathCPU(n) } } // RunWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, @@ -441,10 +339,7 @@ func RunWtFromDWtSynGPU(n int) { // RunWtFromDWtSynCPU runs the WtFromDWtSyn kernel on the CPU. func RunWtFromDWtSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - WtFromDWtSyn(uint32(i)) - } + gpu.VectorizeFunc(0, n, WtFromDWtSyn) } // RunOneWtFromDWtSyn runs the WtFromDWtSyn kernel with given number of elements, @@ -486,10 +381,7 @@ func RunBetweenGiGPU(n int) { // RunBetweenGiCPU runs the BetweenGi kernel on the CPU. func RunBetweenGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - BetweenGi(uint32(i)) - } + gpu.VectorizeFunc(0, n, BetweenGi) } // RunOneBetweenGi runs the BetweenGi kernel with given number of elements, @@ -506,229 +398,214 @@ func RunOneBetweenGi(n int, syncVars ...GPUVars) { RunBetweenGiCPU(n) } } -// RunPoolGi runs the PoolGi kernel with given number of elements, +// RunCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePoolGi call does Run and Done for a +// Alternatively, a single-shot RunOneCycleNeuron call does Run and Done for a // single run-and-sync case. -func RunPoolGi(n int) { +func RunCycleNeuron(n int) { if UseGPU { - RunPoolGiGPU(n) + RunCycleNeuronGPU(n) } else { - RunPoolGiCPU(n) + RunCycleNeuronCPU(n) } } -// RunPoolGiGPU runs the PoolGi kernel on the GPU. See [RunPoolGi] for more info. -func RunPoolGiGPU(n int) { +// RunCycleNeuronGPU runs the CycleNeuron kernel on the GPU. See [RunCycleNeuron] for more info. +func RunCycleNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PoolGi"] + pl := sy.ComputePipelines["CycleNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPoolGiCPU runs the PoolGi kernel on the CPU. 
-func RunPoolGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PoolGi(uint32(i)) - } +// RunCycleNeuronCPU runs the CycleNeuron kernel on the CPU. +func RunCycleNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, CycleNeuron) } -// RunOnePoolGi runs the PoolGi kernel with given number of elements, +// RunOneCycleNeuron runs the CycleNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePoolGi(n int, syncVars ...GPUVars) { +func RunOneCycleNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunPoolGiGPU(n) + RunCycleNeuronGPU(n) RunDone(syncVars...) } else { - RunPoolGiCPU(n) + RunCycleNeuronCPU(n) } } -// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a // single run-and-sync case. -func RunApplyExtsNeuron(n int) { +func RunCyclePost(n int) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunCyclePostGPU(n) } else { - RunApplyExtsNeuronCPU(n) + RunCyclePostCPU(n) } } -// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. See [RunApplyExtsNeuron] for more info. -func RunApplyExtsNeuronGPU(n int) { +// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. +func RunCyclePostGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["ApplyExtsNeuron"] + pl := sy.ComputePipelines["CyclePost"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. -func RunApplyExtsNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - ApplyExtsNeuron(uint32(i)) - } +// RunCyclePostCPU runs the CyclePost kernel on the CPU. +func RunCyclePostCPU(n int) { + gpu.VectorizeFunc(0, n, CyclePost) } -// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, +// RunOneCyclePost runs the CyclePost kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { +func RunOneCyclePost(n int, syncVars ...GPUVars) { if UseGPU { - RunApplyExtsNeuronGPU(n) + RunCyclePostGPU(n) RunDone(syncVars...) } else { - RunApplyExtsNeuronCPU(n) + RunCyclePostCPU(n) } } -// RunPlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. 
// Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseStartNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseStartNeuron(n int) { +func RunMinusPhasePool(n int) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunMinusPhasePoolGPU(n) } else { - RunPlusPhaseStartNeuronCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunPlusPhaseStartNeuronGPU runs the PlusPhaseStartNeuron kernel on the GPU. See [RunPlusPhaseStartNeuron] for more info. -func RunPlusPhaseStartNeuronGPU(n int) { +// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. +func RunMinusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseStartNeuron"] + pl := sy.ComputePipelines["MinusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseStartNeuronCPU runs the PlusPhaseStartNeuron kernel on the CPU. -func RunPlusPhaseStartNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhaseStartNeuron(uint32(i)) - } +// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. +func RunMinusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, MinusPhasePool) } -// RunOnePlusPhaseStartNeuron runs the PlusPhaseStartNeuron kernel with given number of elements, +// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseStartNeuron(n int, syncVars ...GPUVars) { +func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseStartNeuronGPU(n) + RunMinusPhasePoolGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseStartNeuronCPU(n) + RunMinusPhasePoolCPU(n) } } -// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneGatherSpikes call does Run and Done for a // single run-and-sync case. -func RunPlusPhaseNeuron(n int) { +func RunGatherSpikes(n int) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunGatherSpikesGPU(n) } else { - RunPlusPhaseNeuronCPU(n) + RunGatherSpikesCPU(n) } } -// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. -func RunPlusPhaseNeuronGPU(n int) { +// RunGatherSpikesGPU runs the GatherSpikes kernel on the GPU. See [RunGatherSpikes] for more info. 
+func RunGatherSpikesGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["PlusPhaseNeuron"] + pl := sy.ComputePipelines["GatherSpikes"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. -func RunPlusPhaseNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - PlusPhaseNeuron(uint32(i)) - } +// RunGatherSpikesCPU runs the GatherSpikes kernel on the CPU. +func RunGatherSpikesCPU(n int) { + gpu.VectorizeFunc(0, n, GatherSpikes) } -// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// RunOneGatherSpikes runs the GatherSpikes kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOneGatherSpikes(n int, syncVars ...GPUVars) { if UseGPU { - RunPlusPhaseNeuronGPU(n) + RunGatherSpikesGPU(n) RunDone(syncVars...) } else { - RunPlusPhaseNeuronCPU(n) + RunGatherSpikesCPU(n) } } -// RunDWtSyn runs the DWtSyn kernel with given number of elements, +// RunSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a +// Alternatively, a single-shot RunOneSendSpike call does Run and Done for a // single run-and-sync case. -func RunDWtSyn(n int) { +func RunSendSpike(n int) { if UseGPU { - RunDWtSynGPU(n) + RunSendSpikeGPU(n) } else { - RunDWtSynCPU(n) + RunSendSpikeCPU(n) } } -// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. -func RunDWtSynGPU(n int) { +// RunSendSpikeGPU runs the SendSpike kernel on the GPU. See [RunSendSpike] for more info. +func RunSendSpikeGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["DWtSyn"] + pl := sy.ComputePipelines["SendSpike"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunDWtSynCPU runs the DWtSyn kernel on the CPU. -func RunDWtSynCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - DWtSyn(uint32(i)) - } +// RunSendSpikeCPU runs the SendSpike kernel on the CPU. +func RunSendSpikeCPU(n int) { + gpu.VectorizeFunc(0, n, SendSpike) } -// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, +// RunOneSendSpike runs the SendSpike kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneDWtSyn(n int, syncVars ...GPUVars) { +func RunOneSendSpike(n int, syncVars ...GPUVars) { if UseGPU { - RunDWtSynGPU(n) + RunSendSpikeGPU(n) RunDone(syncVars...) 
} else { - RunDWtSynCPU(n) + RunSendSpikeCPU(n) } } // RunCycleInc runs the CycleInc kernel with given number of elements, @@ -756,10 +633,7 @@ func RunCycleIncGPU(n int) { // RunCycleIncCPU runs the CycleInc kernel on the CPU. func RunCycleIncCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CycleInc(uint32(i)) - } + gpu.VectorizeFunc(0, n, CycleInc) } // RunOneCycleInc runs the CycleInc kernel with given number of elements, @@ -776,49 +650,130 @@ func RunOneCycleInc(n int, syncVars ...GPUVars) { RunCycleIncCPU(n) } } -// RunMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunPlusPhasePool runs the PlusPhasePool kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhasePool call does Run and Done for a +// Alternatively, a single-shot RunOnePlusPhasePool call does Run and Done for a // single run-and-sync case. -func RunMinusPhasePool(n int) { +func RunPlusPhasePool(n int) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunPlusPhasePoolGPU(n) } else { - RunMinusPhasePoolCPU(n) + RunPlusPhasePoolCPU(n) } } -// RunMinusPhasePoolGPU runs the MinusPhasePool kernel on the GPU. See [RunMinusPhasePool] for more info. -func RunMinusPhasePoolGPU(n int) { +// RunPlusPhasePoolGPU runs the PlusPhasePool kernel on the GPU. See [RunPlusPhasePool] for more info. +func RunPlusPhasePoolGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhasePool"] + pl := sy.ComputePipelines["PlusPhasePool"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhasePoolCPU runs the MinusPhasePool kernel on the CPU. -func RunMinusPhasePoolCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - MinusPhasePool(uint32(i)) +// RunPlusPhasePoolCPU runs the PlusPhasePool kernel on the CPU. +func RunPlusPhasePoolCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhasePool) +} + +// RunOnePlusPhasePool runs the PlusPhasePool kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOnePlusPhasePool(n int, syncVars ...GPUVars) { + if UseGPU { + RunPlusPhasePoolGPU(n) + RunDone(syncVars...) + } else { + RunPlusPhasePoolCPU(n) + } +} +// RunPlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOnePlusPhaseNeuron call does Run and Done for a +// single run-and-sync case. +func RunPlusPhaseNeuron(n int) { + if UseGPU { + RunPlusPhaseNeuronGPU(n) + } else { + RunPlusPhaseNeuronCPU(n) } } -// RunOneMinusPhasePool runs the MinusPhasePool kernel with given number of elements, +// RunPlusPhaseNeuronGPU runs the PlusPhaseNeuron kernel on the GPU. See [RunPlusPhaseNeuron] for more info. 
+func RunPlusPhaseNeuronGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["PlusPhaseNeuron"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunPlusPhaseNeuronCPU runs the PlusPhaseNeuron kernel on the CPU. +func RunPlusPhaseNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, PlusPhaseNeuron) +} + +// RunOnePlusPhaseNeuron runs the PlusPhaseNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneMinusPhasePool(n int, syncVars ...GPUVars) { +func RunOnePlusPhaseNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhasePoolGPU(n) + RunPlusPhaseNeuronGPU(n) RunDone(syncVars...) } else { - RunMinusPhasePoolCPU(n) + RunPlusPhaseNeuronCPU(n) + } +} +// RunDWtSyn runs the DWtSyn kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Can call multiple Run* kernels in a row, which are then all launched +// in the same command submission on the GPU, which is by far the most efficient. +// MUST call RunDone (with optional vars to sync) after all Run calls. +// Alternatively, a single-shot RunOneDWtSyn call does Run and Done for a +// single run-and-sync case. +func RunDWtSyn(n int) { + if UseGPU { + RunDWtSynGPU(n) + } else { + RunDWtSynCPU(n) + } +} + +// RunDWtSynGPU runs the DWtSyn kernel on the GPU. See [RunDWtSyn] for more info. +func RunDWtSynGPU(n int) { + sy := GPUSystem + pl := sy.ComputePipelines["DWtSyn"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) +} + +// RunDWtSynCPU runs the DWtSyn kernel on the CPU. +func RunDWtSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtSyn) +} + +// RunOneDWtSyn runs the DWtSyn kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// This version then calls RunDone with the given variables to sync +// after the Run, for a single-shot Run-and-Done call. If multiple kernels +// can be run in sequence, it is much more efficient to do multiple Run* +// calls followed by a RunDone call. +func RunOneDWtSyn(n int, syncVars ...GPUVars) { + if UseGPU { + RunDWtSynGPU(n) + RunDone(syncVars...) + } else { + RunDWtSynCPU(n) } } // RunLayerGi runs the LayerGi kernel with given number of elements, @@ -846,10 +801,7 @@ func RunLayerGiGPU(n int) { // RunLayerGiCPU runs the LayerGi kernel on the CPU. func RunLayerGiCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - LayerGi(uint32(i)) - } + gpu.VectorizeFunc(0, n, LayerGi) } // RunOneLayerGi runs the LayerGi kernel with given number of elements, @@ -866,94 +818,88 @@ func RunOneLayerGi(n int, syncVars ...GPUVars) { RunLayerGiCPU(n) } } -// RunCyclePost runs the CyclePost kernel with given number of elements, +// RunApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. 
-// Alternatively, a single-shot RunOneCyclePost call does Run and Done for a +// Alternatively, a single-shot RunOneApplyExtsNeuron call does Run and Done for a // single run-and-sync case. -func RunCyclePost(n int) { +func RunApplyExtsNeuron(n int) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunCyclePostGPU runs the CyclePost kernel on the GPU. See [RunCyclePost] for more info. -func RunCyclePostGPU(n int) { +// RunApplyExtsNeuronGPU runs the ApplyExtsNeuron kernel on the GPU. See [RunApplyExtsNeuron] for more info. +func RunApplyExtsNeuronGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["CyclePost"] + pl := sy.ComputePipelines["ApplyExtsNeuron"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunCyclePostCPU runs the CyclePost kernel on the CPU. -func RunCyclePostCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - CyclePost(uint32(i)) - } +// RunApplyExtsNeuronCPU runs the ApplyExtsNeuron kernel on the CPU. +func RunApplyExtsNeuronCPU(n int) { + gpu.VectorizeFunc(0, n, ApplyExtsNeuron) } -// RunOneCyclePost runs the CyclePost kernel with given number of elements, +// RunOneApplyExtsNeuron runs the ApplyExtsNeuron kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneCyclePost(n int, syncVars ...GPUVars) { +func RunOneApplyExtsNeuron(n int, syncVars ...GPUVars) { if UseGPU { - RunCyclePostGPU(n) + RunApplyExtsNeuronGPU(n) RunDone(syncVars...) } else { - RunCyclePostCPU(n) + RunApplyExtsNeuronCPU(n) } } -// RunMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // Can call multiple Run* kernels in a row, which are then all launched // in the same command submission on the GPU, which is by far the most efficient. // MUST call RunDone (with optional vars to sync) after all Run calls. -// Alternatively, a single-shot RunOneMinusPhaseNeuron call does Run and Done for a +// Alternatively, a single-shot RunOneDWtFromDiSyn call does Run and Done for a // single run-and-sync case. -func RunMinusPhaseNeuron(n int) { +func RunDWtFromDiSyn(n int) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunDWtFromDiSynGPU(n) } else { - RunMinusPhaseNeuronCPU(n) + RunDWtFromDiSynCPU(n) } } -// RunMinusPhaseNeuronGPU runs the MinusPhaseNeuron kernel on the GPU. See [RunMinusPhaseNeuron] for more info. -func RunMinusPhaseNeuronGPU(n int) { +// RunDWtFromDiSynGPU runs the DWtFromDiSyn kernel on the GPU. See [RunDWtFromDiSyn] for more info. +func RunDWtFromDiSynGPU(n int) { sy := GPUSystem - pl := sy.ComputePipelines["MinusPhaseNeuron"] + pl := sy.ComputePipelines["DWtFromDiSyn"] ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, 64) } -// RunMinusPhaseNeuronCPU runs the MinusPhaseNeuron kernel on the CPU. -func RunMinusPhaseNeuronCPU(n int) { - // todo: need threaded api -- not tensor - for i := range n { - MinusPhaseNeuron(uint32(i)) - } +// RunDWtFromDiSynCPU runs the DWtFromDiSyn kernel on the CPU. 
+func RunDWtFromDiSynCPU(n int) { + gpu.VectorizeFunc(0, n, DWtFromDiSyn) } -// RunOneMinusPhaseNeuron runs the MinusPhaseNeuron kernel with given number of elements, +// RunOneDWtFromDiSyn runs the DWtFromDiSyn kernel with given number of elements, // on either the CPU or GPU depending on the UseGPU variable. // This version then calls RunDone with the given variables to sync // after the Run, for a single-shot Run-and-Done call. If multiple kernels // can be run in sequence, it is much more efficient to do multiple Run* // calls followed by a RunDone call. -func RunOneMinusPhaseNeuron(n int, syncVars ...GPUVars) { +func RunOneDWtFromDiSyn(n int, syncVars ...GPUVars) { if UseGPU { - RunMinusPhaseNeuronGPU(n) + RunDWtFromDiSynGPU(n) RunDone(syncVars...) } else { - RunMinusPhaseNeuronCPU(n) + RunDWtFromDiSynCPU(n) } } // RunDone must be called after Run* calls to start compute kernels. diff --git a/axon/shaders/LayerGi.wgsl b/axon/shaders/LayerGi.wgsl index 4f7470073..83b3f2ca6 100644 --- a/axon/shaders/LayerGi.wgsl +++ b/axon/shaders/LayerGi.wgsl @@ -1234,16 +1234,19 @@ fn PoolNNeurons(pi: u32) -> i32 { } fn PoolAvgMaxCalcVar(vr: AvgMaxVars, pi: u32,di: u32) { var floatFromInt = f32(1.0) / f32(u32(1)<<20); - var vim = AvgMaxIntVarIndex(vr, Max); - var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; + var vis = AvgMaxIntVarIndex(vr, Avg); + var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))]; if (sum < 0) { sum = i32(u32(1) << 20); } Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Avg)),u32(pi),u32(di))] = f32(sum) * floatFromInt; - var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(AvgMaxIntVarIndex(vr, Max)),u32(pi),u32(di))]; + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))] = 0; + var vim = AvgMaxIntVarIndex(vr, Max); + var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; if (mx < 0) { mx = i32(u32(1) << 20); } + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))] = 0; Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Max)),u32(pi),u32(di))] = f32(mx) * floatFromInt; } fn PoolAvgMaxCalc(pi: u32,di: u32) { diff --git a/axon/shaders/PoolGi.wgsl b/axon/shaders/PoolGi.wgsl index b5fd7f8fd..58849710b 100644 --- a/axon/shaders/PoolGi.wgsl +++ b/axon/shaders/PoolGi.wgsl @@ -1240,16 +1240,19 @@ fn PoolNNeurons(pi: u32) -> i32 { } fn PoolAvgMaxCalcVar(vr: AvgMaxVars, pi: u32,di: u32) { var floatFromInt = f32(1.0) / f32(u32(1)<<20); - var vim = AvgMaxIntVarIndex(vr, Max); - var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; + var vis = AvgMaxIntVarIndex(vr, Avg); + var sum = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))]; if (sum < 0) { sum = i32(u32(1) << 20); } Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Avg)),u32(pi),u32(di))] = f32(sum) * floatFromInt; - var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(AvgMaxIntVarIndex(vr, Max)),u32(pi),u32(di))]; + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vis),u32(pi),u32(di))] = 0; + var vim = AvgMaxIntVarIndex(vr, Max); + var mx = PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], PoolsInt[2], u32(vim),u32(pi),u32(di))]; if (mx < 0) { mx = i32(u32(1) << 20); } + PoolsInt[IndexI323D(PoolsInt[0], PoolsInt[1], 
PoolsInt[2], u32(vim),u32(pi),u32(di))] = 0; Pools[IndexF323D(Pools[0], Pools[1], Pools[2], u32(AvgMaxVarIndex(vr, AMCycle, Max)),u32(pi),u32(di))] = f32(mx) * floatFromInt; } fn PoolAvgMaxCalc(pi: u32,di: u32) { diff --git a/axon/simstats.go b/axon/simstats.go new file mode 100644 index 000000000..e109e6847 --- /dev/null +++ b/axon/simstats.go @@ -0,0 +1,123 @@ +// Code generated by "goal build"; DO NOT EDIT. +//line simstats.goal:1 +// Copyright (c) 2024, The Emergent Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package axon + +import ( + "time" + + "cogentcore.org/core/base/timer" + "cogentcore.org/core/enums" + "cogentcore.org/core/plot" + "cogentcore.org/core/tensor/datafs" + "cogentcore.org/core/tensor/stats/stats" +) + +// StatPerTrialMSec returns a Stats function that reports the number of milliseconds +// per trial, for the given times and training mode enum values. +// The times should start at the Trial and go up from there: data will +// be recorded from the second time level. The statName is the name of another +// stat that is used to get the number of trials. +func StatPerTrialMSec(statDir *datafs.Data, statName string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + var epcTimer timer.Time + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() <= times[0].Int64() { + return + } + name := "PerTrialMSec" + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + continue + case 1: + epcTimer.Stop() + subd := modeDir.RecycleDir(times[0].String()) + trls := subd.Value(statName) // must be a stat + epcTimer.N = trls.Len() + pertrl := float64(epcTimer.Avg()) / float64(time.Millisecond) + tsr.AppendRowFloat(pertrl) + epcTimer.ResetStart() + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } +} + +// StatDiagnostics returns a Stats function that computes key +// statistics. 
+func StatDiagnostics(statDir *datafs.Data, net *Network, layerNames []string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + statNames := []string{"ActMAvg", "ActMMax", "MaxGeM"} + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() < times[0].Int64() { + return + } + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + ndata := net.Context().NData + for _, lnm := range layerNames { + for si, statName := range statNames { + ly := net.LayerByName(lnm) + lpi := ly.Params.PoolIndex(0) + name := lnm + "_" + statName + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + for di := range ndata { + var stat float32 + switch si { + case 0: + stat = PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + case 1: + stat = PoolAvgMax(AMAct, AMMinus, Max, lpi, di) + case 2: + stat = PoolAvgMax(AMGeInt, AMMinus, Max, lpi, di) + } + tsr.AppendRowFloat(float64(stat)) + } + // todo: last 5 here + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } + } + } +} diff --git a/axon/simstats.goal b/axon/simstats.goal new file mode 100644 index 000000000..75648666d --- /dev/null +++ b/axon/simstats.goal @@ -0,0 +1,121 @@ +// Copyright (c) 2024, The Emergent Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package axon + +import ( + "time" + + "cogentcore.org/core/base/timer" + "cogentcore.org/core/enums" + "cogentcore.org/core/plot" + "cogentcore.org/core/tensor/datafs" + "cogentcore.org/core/tensor/stats/stats" +) + +// StatPerTrialMSec returns a Stats function that reports the number of milliseconds +// per trial, for the given times and training mode enum values. +// The times should start at the Trial and go up from there: data will +// be recorded from the second time level. The statName is the name of another +// stat that is used to get the number of trials. 
+func StatPerTrialMSec(statDir *datafs.Data, statName string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + var epcTimer timer.Time + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() <= times[0].Int64() { + return + } + name := "PerTrialMSec" + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + continue + case 1: + epcTimer.Stop() + subd := modeDir.RecycleDir(times[0].String()) + trls := subd.Value(statName) // must be a stat + epcTimer.N = trls.Len() + pertrl := float64(epcTimer.Avg()) / float64(time.Millisecond) + tsr.AppendRowFloat(pertrl) + epcTimer.ResetStart() + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } +} + +// StatDiagnostics returns a Stats function that computes key +// statistics. +func StatDiagnostics(statDir *datafs.Data, net *Network, layerNames []string, trainMode enums.Enum, times ...enums.Enum) func(lmode enums.Enum, ltime enums.Enum, start bool) { + statNames := []string{"ActMAvg", "ActMMax", "MaxGeM"} + return func(lmode enums.Enum, ltime enums.Enum, start bool) { + if lmode.Int64() != trainMode.Int64() || ltime.Int64() < times[0].Int64() { + return + } + modeDir := statDir.RecycleDir(lmode.String()) + timeDir := modeDir.RecycleDir(ltime.String()) + ndata := net.Context().NData + for _, lnm := range layerNames { + for si, statName := range statNames { + ly := net.LayerByName(lnm) + lpi := ly.Params.PoolIndex(0) + name := lnm + "_" + statName + tsr := datafs.Value[float64](timeDir, name) + if start { + tsr.SetNumRows(0) + if ps := plot.GetStylersFrom(tsr); ps == nil { + ps.Add(func(s *plot.Style) { + s.Range.SetMin(0).SetMax(1) + }) + plot.SetStylersTo(tsr, ps) + } + return + } + for i, tm := range times { + if ltime.Int64() != tm.Int64() { + continue + } + switch i { + case 0: + for di := range ndata { + var stat float32 + switch si { + case 0: + stat = PoolAvgMax(AMAct, AMMinus, Avg, lpi, di) + case 1: + stat = PoolAvgMax(AMAct, AMMinus, Max, lpi, di) + case 2: + stat = PoolAvgMax(AMGeInt, AMMinus, Max, lpi, di) + } + tsr.AppendRowFloat(float64(stat)) + } + // todo: last 5 here + default: + subd := modeDir.RecycleDir(times[i-1].String()) + stat := stats.StatMean.Call(subd.Value(name)) + tsr.AppendRow(stat) + } + } + } + } + } +} diff --git a/examples/ra25/ra25.go b/examples/ra25/ra25.go index 289f94b45..979f35a9c 100644 --- a/examples/ra25/ra25.go +++ b/examples/ra25/ra25.go @@ -596,14 +596,25 @@ func (ss *Sim) RunStats(lmode Modes, ltime Times, lphase StatsPhase) { } } +func (ss *Sim) StatsData(lmode Modes, ltime Times) *datafs.Data { + modeDir := ss.Stats.RecycleDir(lmode.String()) + return modeDir.RecycleDir(ltime.String()) +} + func (ss *Sim) InitStats() { for mode, st := range ss.Loops.Stacks { + cmd := mode.(Modes) for _, tm := range st.Order { ctm := tm.(Times) if ctm == Cycle { continue } ss.RunStats(mode.(Modes), ctm, Start) + if ss.GUI.Tabs != nil { + if cmd == Train && ctm == Epoch { + ss.GUI.Tabs.PlotDataFS(ss.StatsData(cmd, ctm)) + } + } } } } @@ -611,8 
+622,10 @@ func (ss *Sim) InitStats() { // ConfigStats handles configures functions to do all stats computation // in the datafs system. func (ss *Sim) ConfigStats() { + net := ss.Net ss.Stats, _ = ss.Root.Mkdir("Stats") ss.Current, _ = ss.Stats.Mkdir("Current") + // todo: move this to simstats: for md, st := range ss.Loops.Stacks { cmd := md.(Modes) for _, tm := range st.Order { @@ -625,7 +638,7 @@ func (ss *Sim) ConfigStats() { return } name := tm.String() // name of stat = time - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[int](timeDir, name) @@ -655,12 +668,14 @@ func (ss *Sim) ConfigStats() { }) } } + // todo: loop over stat names as in diagnostics + // and include NZero, and stopping just grabs that from current. ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { name := "UnitErr" modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[float64](timeDir, name) - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) if lphase == Start { tsr.SetNumRows(0) if ps := plot.GetStylersFrom(tsr); ps == nil { @@ -763,6 +778,15 @@ func (ss *Sim) ConfigStats() { tsr.AppendRow(stat) } }) + perTrlFunc := axon.StatPerTrialMSec(ss.Stats, "Err", Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + perTrlFunc(lmode, ltime, lphase == Start) + }) + lays := net.LayersByType(axon.SuperLayer, axon.CTLayer, axon.TargetLayer) + diagsFunc := axon.StatDiagnostics(ss.Stats, net, lays, Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + diagsFunc(lmode, ltime, lphase == Start) + }) } // StatCounters returns counters string to show at bottom of netview. @@ -790,25 +814,6 @@ func (ss *Sim) StatCounters(md, tm enums.Enum) string { // ss.ViewUpdate.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Di", "TrialName", "Cycle", "UnitErr", "TrlErr", "PhaseDiff"}) } -// TrialStats computes the trial-level statistics. -// Aggregation is done directly from log data. -func (ss *Sim) TrialStats(di int) { - // out := ss.Net.LayerByName("Output") - // - // ss.Stats.SetFloat("PhaseDiff", float64(out.Values[di].PhaseDiff.Cor)) - // ss.Stats.SetFloat("UnitErr", out.PctUnitErr(&ss.Context)[di]) - // - // if ss.Stats.Float("UnitErr") > 0 { - // ss.Stats.SetFloat("TrlErr", 1) - // } else { - // - // ss.Stats.SetFloat("TrlErr", 0) - // } -} - -////////////////////////////////////////////////////////////////////////////// -// Logging - func (ss *Sim) ConfigLogs() { // ss.Stats.SetString("RunName", ss.Params.RunName(0)) // used for naming logs, stats, etc // @@ -844,40 +849,6 @@ func (ss *Sim) ConfigLogs() { // ss.Logs.SetMeta(Train, Run, "LegendCol", "RunName") } -// Log is the main logging function, handles special things for different scopes -func (ss *Sim) Log(mode Modes, time Times) { - // ctx := ss.Net.Context() - // - // if mode != Analyze { - // ctx.Mode = mode // Also set specifically in a Loop callback. 
- // } - // - // dt := ss.Logs.Table(mode, time) - // - // if dt == nil { - // return - // } - // - // row := dt.Rows - // - // switch { - // case time == Cycle: - // - // return - // - // case time == Trial: - // - // for di := 0; di < int(ctx.NData); di++ { - // ss.TrialStats(di) - // ss.StatCounters(di) - // ss.Logs.LogRowDi(mode, time, row, di) - // } - // return // don't do reg below - // } - // - // ss.Logs.LogRow(mode, time, row) // also logs to file, etc -} - //////////////////////////////////////////////////////////////////////////////////////////// // Gui @@ -902,6 +873,7 @@ func (ss *Sim) ConfigGUI() { nv.SceneXYZ().Camera.LookAt(math32.Vec3(0, 0, 0), math32.Vec3(0, 1, 0)) ss.GUI.UpdateFiles() + ss.InitStats() // ss.GUI.AddPlots(title, &ss.Logs) ss.GUI.FinalizeGUI(false) // if ss.Config.Run.GPU { diff --git a/examples/ra25/ra25.goal b/examples/ra25/ra25.goal index c65bb11e8..2c154a233 100644 --- a/examples/ra25/ra25.goal +++ b/examples/ra25/ra25.goal @@ -594,14 +594,25 @@ func (ss *Sim) RunStats(lmode Modes, ltime Times, lphase StatsPhase) { } } +func (ss *Sim) StatsData(lmode Modes, ltime Times) *datafs.Data { + modeDir := ss.Stats.RecycleDir(lmode.String()) + return modeDir.RecycleDir(ltime.String()) +} + func (ss *Sim) InitStats() { for mode, st := range ss.Loops.Stacks { + cmd := mode.(Modes) for _, tm := range st.Order { ctm := tm.(Times) if ctm == Cycle { continue } ss.RunStats(mode.(Modes), ctm, Start) + if ss.GUI.Tabs != nil { + if cmd == Train && ctm == Epoch { + ss.GUI.Tabs.PlotDataFS(ss.StatsData(cmd, ctm)) + } + } } } } @@ -609,8 +620,10 @@ func (ss *Sim) InitStats() { // ConfigStats handles configures functions to do all stats computation // in the datafs system. func (ss *Sim) ConfigStats() { + net := ss.Net ss.Stats, _ = ss.Root.Mkdir("Stats") ss.Current, _ = ss.Stats.Mkdir("Current") + // todo: move this to simstats: for md, st := range ss.Loops.Stacks { cmd := md.(Modes) for _, tm := range st.Order { @@ -623,7 +636,7 @@ func (ss *Sim) ConfigStats() { return } name := tm.String() // name of stat = time - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[int](timeDir, name) @@ -653,12 +666,14 @@ func (ss *Sim) ConfigStats() { }) } } + // todo: loop over stat names as in diagnostics + // and include NZero, and stopping just grabs that from current. ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { name := "UnitErr" modeDir := ss.Stats.RecycleDir(lmode.String()) timeDir := modeDir.RecycleDir(ltime.String()) tsr := datafs.Value[float64](timeDir, name) - ndata := ss.Config.Run.NData + ndata := int(ss.Net.Context().NData) if lphase == Start { tsr.SetNumRows(0) if ps := plot.GetStylersFrom(tsr); ps == nil { @@ -761,6 +776,15 @@ func (ss *Sim) ConfigStats() { tsr.AppendRow(stat) } }) + perTrlFunc := axon.StatPerTrialMSec(ss.Stats, "Err", Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + perTrlFunc(lmode, ltime, lphase == Start) + }) + lays := net.LayersByType(axon.SuperLayer, axon.CTLayer, axon.TargetLayer) + diagsFunc := axon.StatDiagnostics(ss.Stats, net, lays, Train, Trial, Epoch, Run) + ss.AddStat(func(lmode Modes, ltime Times, lphase StatsPhase) { + diagsFunc(lmode, ltime, lphase == Start) + }) } // StatCounters returns counters string to show at bottom of netview. 
@@ -786,24 +810,6 @@ func (ss *Sim) StatCounters(md, tm enums.Enum) string { // ss.ViewUpdate.Text = ss.Stats.Print([]string{"Run", "Epoch", "Trial", "Di", "TrialName", "Cycle", "UnitErr", "TrlErr", "PhaseDiff"}) } -// TrialStats computes the trial-level statistics. -// Aggregation is done directly from log data. -func (ss *Sim) TrialStats(di int) { - // out := ss.Net.LayerByName("Output") - // - // ss.Stats.SetFloat("PhaseDiff", float64(out.Values[di].PhaseDiff.Cor)) - // ss.Stats.SetFloat("UnitErr", out.PctUnitErr(&ss.Context)[di]) - // - // if ss.Stats.Float("UnitErr") > 0 { - // ss.Stats.SetFloat("TrlErr", 1) - // } else { - // ss.Stats.SetFloat("TrlErr", 0) - // } -} - -////////////////////////////////////////////////////////////////////////////// -// Logging - func (ss *Sim) ConfigLogs() { // ss.Stats.SetString("RunName", ss.Params.RunName(0)) // used for naming logs, stats, etc // @@ -839,33 +845,6 @@ func (ss *Sim) ConfigLogs() { // ss.Logs.SetMeta(Train, Run, "LegendCol", "RunName") } -// Log is the main logging function, handles special things for different scopes -func (ss *Sim) Log(mode Modes, time Times) { - // ctx := ss.Net.Context() - // if mode != Analyze { - // ctx.Mode = mode // Also set specifically in a Loop callback. - // } - // dt := ss.Logs.Table(mode, time) - // if dt == nil { - // return - // } - // row := dt.Rows - // - // switch { - // case time == Cycle: - // return - // case time == Trial: - // for di := 0; di < int(ctx.NData); di++ { - // ss.TrialStats(di) - // ss.StatCounters(di) - // ss.Logs.LogRowDi(mode, time, row, di) - // } - // return // don't do reg below - // } - // - // ss.Logs.LogRow(mode, time, row) // also logs to file, etc -} - //////////////////////////////////////////////////////////////////////////////////////////// // Gui @@ -890,6 +869,7 @@ func (ss *Sim) ConfigGUI() { nv.SceneXYZ().Camera.LookAt(math32.Vec3(0, 0, 0), math32.Vec3(0, 1, 0)) ss.GUI.UpdateFiles() + ss.InitStats() // ss.GUI.AddPlots(title, &ss.Logs) ss.GUI.FinalizeGUI(false) // if ss.Config.Run.GPU {
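A minimal usage sketch of the Run*/RunDone contract documented in the gosl.go kernel wrappers above: multiple Run* calls are recorded and then launched together in a single GPU command submission when RunDone is called, while on the CPU each Run* executes immediately. Everything here is illustrative only: the import path is assumed to be github.com/emer/axon/v2/axon, the element counts and kernel ordering are placeholders rather than axon's actual cycle code, and the optional GPUVars arguments to RunDone (naming values to sync back to the CPU) are omitted.

// Sketch only: placeholder counts and ordering, not axon's real cycle.
package main

import "github.com/emer/axon/v2/axon"

// runCycleSketch batches several kernels into one command submission.
func runCycleSketch(nNeurons, nLayers, nPools int) {
	// With axon.UseGPU set, these calls only record dispatches;
	// nothing starts on the GPU until RunDone below.
	axon.RunGatherSpikes(nNeurons)
	axon.RunLayerGi(nLayers)
	axon.RunPoolGi(nPools)
	axon.RunCycleNeuron(nNeurons)
	axon.RunSendSpike(nNeurons)
	// RunDone submits the batch; pass GPUVars values here for any
	// variables that must be synced back from the GPU.
	axon.RunDone()
}

func main() {
	runCycleSketch(100, 4, 8) // placeholder sizes
}

For a single kernel followed by an immediate sync, the generated RunOne* variants (for example RunOneWtFromDWtSyn) do the Run and the RunDone in one call; as the doc comments note, batching several Run* calls before one RunDone is much more efficient when a sequence of kernels can be run together.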