diff --git a/.github/workflows/test_basic.yaml b/.github/workflows/test_basic.yaml index 4a98e09d..6d354c14 100644 --- a/.github/workflows/test_basic.yaml +++ b/.github/workflows/test_basic.yaml @@ -1,16 +1,12 @@ name: Regression tests on: + push: + branches: [ "main" ] pull_request: branches: [ "main" ] - types: [ opened, synchronize, labeled ] jobs: examples_dry_run: name: Dry Run (${{ matrix.target }}) - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest strategy: matrix: @@ -22,11 +18,6 @@ jobs: run: | python3 example.py --dry-run --only-target=${{ matrix.target }} tutorial: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -35,11 +26,6 @@ jobs: run: | (cd tutorial && ./tutorial_all.sh) examples_basic: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -48,11 +34,6 @@ jobs: run: | python3 example.py --examples simple0,simple1,simple0_loop,simple1_loop examples_ntt_kyber_dilithium_helium_core: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -61,11 +42,6 @@ jobs: run: | python3 example.py --examples ntt_kyber_1_23_45_67_m55,ntt_dilithium_12_34_56_78_m55 --timeout=300 examples_ntt_kyber_dilithium_neon_core: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -74,11 +50,6 @@ jobs: run: | python3 example.py --examples ntt_kyber_123_4567_a55,ntt_dilithium_123_45678_a55 --timeout=300 sqmag: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -87,11 +58,6 @@ jobs: run: | (cd paper/scripts && NO_LOG=Y ./slothy_sqmag.sh) fft: - if: ${{ github.event.label.name == 'needs-ci' || - github.event.pull_request.user.login == 'hanno-becker' || - github.event.pull_request.user.login == 'dop-amin' || - github.event.pull_request.user.login == 'mkannwischer' - }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/example.py b/example.py index c398799a..ce5bb75e 100644 --- a/example.py +++ b/example.py @@ -669,7 +669,9 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): def core(self,slothy): slothy.config.variable_size=True slothy.config.inputs_are_outputs = True + slothy.fusion_region("start", "end", ssa=False) slothy.optimize(start="start", end="end") + class Armv7mExample0Func(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): @@ -1569,7 +1571,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 16 - slothy.config.unsafe_address_offset_fixup = False + slothy.config.unsafe_address_offset_fixup = True slothy.config.variable_size = True slothy.config.inputs_are_outputs = True @@ -1605,7 +1607,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 16 - slothy.config.unsafe_address_offset_fixup = False + slothy.config.unsafe_address_offset_fixup = True slothy.config.variable_size = True @@ -1616,12 +1618,12 @@ def core(self, slothy): slothy.config.sw_pipelining.optimize_postamble = True slothy.config.sw_pipelining.allow_pre = True - slothy.optimize_loop("layer123_loop") + slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.optimize_loop("layer456_first_loop") slothy.optimize_loop("layer456_loop") slothy.config.inputs_are_outputs = True - slothy.optimize_loop("layer78_loop") + slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop) class pointwise_montgomery_dilithium(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): @@ -1814,6 +1816,7 @@ def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 32 r = slothy.config.reserved_regs + r.add("r1") r = r.union(f"s{i}" for i in range(31)) # reserve FPR slothy.config.reserved_regs = r @@ -1825,13 +1828,12 @@ def core(self, slothy): slothy.config.variable_size = True slothy.config.split_heuristic = True slothy.config.timeout = 360 # Not more than 2min per step - slothy.config.split_heuristic_factor = 1 slothy.config.visualize_expected_performance = False - slothy.config.split_heuristic_factor = 4 + slothy.config.split_heuristic_factor = 5 slothy.config.split_heuristic_stepsize = 0.15 - slothy.optimize_loop("layer1234_loop") + slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.config.split_heuristic_optimize_seam = 6 - slothy.optimize_loop("layer1234_loop") + slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.config.outputs = ["r14"] @@ -2179,12 +2181,11 @@ def core(self, slothy): slothy.config.variable_size = True r = slothy.config.reserved_regs - r.add("r14") slothy.config.reserved_regs = r slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class basemul_acc_32_16_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): @@ -2278,14 +2279,10 @@ def core(self, slothy): slothy.config.inputs_are_outputs = True slothy.config.variable_size = True - r = slothy.config.reserved_regs - r.add("r14") - slothy.config.reserved_regs = r - slothy.config.unsafe_address_offset_fixup = False slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class add_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): @@ -2484,16 +2481,14 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non def core(self, slothy): slothy.config.inputs_are_outputs = True slothy.config.variable_size = True - slothy.config.outputs = ["r14"] slothy.config.unsafe_address_offset_fixup = False r = slothy.config.reserved_regs - r.add("r14") r = r.union(f"s{i}" for i in range(32)) # reserve FPR slothy.config.reserved_regs = r slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class matacc_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): diff --git a/examples/naive/armv7m/armv7m_simple0.s b/examples/naive/armv7m/armv7m_simple0.s index 311b016d..504062ac 100644 --- a/examples/naive/armv7m/armv7m_simple0.s +++ b/examples/naive/armv7m/armv7m_simple0.s @@ -29,4 +29,8 @@ smlabt r3,r2, r2, r1 asrs r3, r3,#1 str r3, [r0,#4] // @slothy:writes=a +ldrd r0, r3, [r0, #4] +ldm r0 ,{r0-r2} +add r2,r3,r2 +str r1, [sp, #0] end: \ No newline at end of file diff --git a/examples/naive/armv7m/basemul_acc_32_32_kyber.s b/examples/naive/armv7m/basemul_acc_32_32_kyber.s index 6d6b3e5d..ca30e75a 100644 --- a/examples/naive/armv7m/basemul_acc_32_32_kyber.s +++ b/examples/naive/armv7m/basemul_acc_32_32_kyber.s @@ -32,31 +32,31 @@ basemul_asm_acc_opt_32_32: movw loop, #64 1: - ldr poly0, [aptr], #8 - ldr poly1, [bptr], #8 + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 ldr.w res0, [rptr_tmp] - ldr tmp2, [aprimeptr], #8 + ldr tmp2, [aprimeptr], #4 ldr.w res1, [rptr_tmp, #4] // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res smlad tmp2, tmp2, poly1, res0 - str tmp2, [rptr_tmp], #16 + str tmp2, [rptr_tmp], #4 // poly1_t * poly0_b + poly1_b * poly0_t + res smladx tmp, poly0, poly1, res1 - str tmp, [rptr_tmp, #-12] + str tmp, [rptr_tmp], #4 - ldr poly0, [aptr, #-4] - ldr poly1, [bptr, #-4] - ldr res0, [rptr_tmp, #-8] - ldr tmp2, [aprimeptr, #-4] - ldr res1, [rptr_tmp, #-4] + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp] + ldr tmp2, [aprimeptr], #4 + ldr.w res1, [rptr_tmp, #4] smlad tmp2, tmp2, poly1, res0 - str tmp2, [rptr_tmp, #-8] + str tmp2, [rptr_tmp], #4 smladx tmp, poly0, poly1, res1 - str tmp, [rptr_tmp, #-4] + str tmp, [rptr_tmp], #4 subs.w loop, loop, #1 bne.w 1b diff --git a/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s index bf02e568..3070a36b 100644 --- a/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s +++ b/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s @@ -12,7 +12,7 @@ .macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv ldr \poly0, [\bptr], #8 - ldr \res0, [\rptr_tmp], #16 // @slothy:core=True + ldr \res0, [\rptr_tmp], #16 // @slothy:core=True // @slothy:before=cmp smulwt \tmp, \zeta, \poly1 smlabt \tmp, \tmp, \q, \qa @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16: push {r4-r11, r14} rptr .req r0 - bptr .req r1 + bptr .req r3 aptr .req r2 zetaptr .req r3 t0 .req r4 @@ -85,7 +85,7 @@ frombytes_mul_asm_acc_32_16: qinv .req r11 zeta .req r12 ctr .req r14 - rptr_tmp .req r3 + rptr_tmp .req r1 movw qa, #26632 movt q, #3329 @@ -93,35 +93,20 @@ frombytes_mul_asm_acc_32_16: movw qinv, #62209 movt qinv, #27560 - vmov s2, zetaptr + vmov s1, r1 ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack - vmov s1, rptr_tmp + add ctr, rptr_tmp, #64*4*4 1: + ldr.w zeta, [zetaptr], #4 deserialize aptr, tmp, tmp2, tmp3, t0, t1 - vmov tmp, s2 - ldr zeta, [tmp], #4 - vmov s2, tmp + vmov s2, zetaptr + vmov bptr, s1 doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - cmp.w rptr_tmp, ctr + vmov s1, bptr // @slothy:core=True + cmp.w rptr_tmp, ctr // @slothy:id=cmp + vmov zetaptr, s2 bne.w 1b - // Original code - // ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack - // vmov s1, tmp - // vmov s2, zetaptr - // add ctr, tmp, #64*4*4 - // 1: - // vmov zetaptr, s2 - // ldr.w zeta, [zetaptr], #4 - // deserialize aptr, tmp, tmp2, tmp3, t0, t1 - // vmov s2, zetaptr - // vmov rptr_tmp, s1 - // doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - // vmov s1, rptr_tmp - // cmp.w rptr_tmp, ctr - // bne.w 1b - pop {r4-r11, pc} - .size frombytes_mul_asm_acc_32_16, .-frombytes_mul_asm_acc_32_16 \ No newline at end of file diff --git a/examples/naive/armv7m/frombytes_mul_acc_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_kyber.s index 563e0151..084a65d9 100644 --- a/examples/naive/armv7m/frombytes_mul_acc_kyber.s +++ b/examples/naive/armv7m/frombytes_mul_acc_kyber.s @@ -32,7 +32,7 @@ // r[1] in upper half of tmp2 pkhtb \tmp, \tmp2, \tmp, asr #16 uadd16 \res0, \res0, \tmp - str \res0, [\rptr], #8 // @slothy:core=True + str \res0, [\rptr], #8 // @slothy:core=True // @slothy:before=cmp neg \zeta, \zeta @@ -101,13 +101,13 @@ frombytes_mul_asm_acc: movt qinv, #27560 add ctr, rptr, #64*4*2 - vmov s0, ctr 1: ldr.w zeta, [zetaptr], #4 deserialize aptr, tmp, tmp2, tmp3, t0, t1 + vmov s0, ctr doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv vmov ctr, s0 - cmp.w rptr, ctr + cmp.w rptr, ctr // @slothy:id=cmp bne.w 1b pop {r4-r11, pc} diff --git a/examples/naive/armv7m/intt_dilithium_123_456_78.s b/examples/naive/armv7m/intt_dilithium_123_456_78.s index cd92e1d3..6cd3e27a 100644 --- a/examples/naive/armv7m/intt_dilithium_123_456_78.s +++ b/examples/naive/armv7m/intt_dilithium_123_456_78.s @@ -221,9 +221,9 @@ pqcrystals_dilithium_invntt_tomont: str.w pol5, [ptr_p, #5*distance/4] str.w pol6, [ptr_p, #6*distance/4] str.w pol7, [ptr_p, #7*distance/4] - str.w pol0, [ptr_p], #strincr + str.w pol0, [ptr_p], #strincr // @slothy:before=cmp vmov temp_l, s9 - cmp.w ptr_p, temp_l + cmp.w ptr_p, temp_l // @slothy:id=cmp bne.w layer123_loop sub ptr_p, #32*strincr @@ -248,21 +248,21 @@ pqcrystals_dilithium_invntt_tomont: ldr.w pol3, [ptr_p, #7*distance2/4] _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - ldr.w pol0, [ptr_p], #128 - ldr pol1, [ptr_p, #1*distance2/4-128] - ldr pol2, [ptr_p, #2*distance2/4-128] - ldr pol3, [ptr_p, #3*distance2/4-128] + ldr.w pol0, [ptr_p] + ldr pol1, [ptr_p, #1*distance2/4] + ldr pol2, [ptr_p, #2*distance2/4] + ldr pol3, [ptr_p, #3*distance2/4] _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - str pol1, [ptr_p, #1*distance2/4-128] - str pol2, [ptr_p, #2*distance2/4-128] - str pol3, [ptr_p, #3*distance2/4-128] - str.w pol5, [ptr_p, #5*distance2/4-128] - str.w pol6, [ptr_p, #6*distance2/4-128] - str.w pol7, [ptr_p, #7*distance2/4-128] - str pol0, [ptr_p, #-128] - str.w pol4, [ptr_p], #128 - //add.w ptr_p, #strincr2 + str pol1, [ptr_p, #1*distance2/4] + str pol2, [ptr_p, #2*distance2/4] + str pol3, [ptr_p, #3*distance2/4] + str.w pol4, [ptr_p, #4*distance2/4] + str.w pol5, [ptr_p, #5*distance2/4] + str.w pol6, [ptr_p, #6*distance2/4] + str.w pol7, [ptr_p, #7*distance2/4] + str pol0, [ptr_p] + add.w ptr_p, ptr_p, #strincr2 vmov temp_l, s10 cmp.w ptr_p, temp_l @@ -281,26 +281,26 @@ pqcrystals_dilithium_invntt_tomont: vldm ptr_zeta!, {s2-s8} vmov s0, ptr_zeta layer456_loop: - ldr.w pol0, [ptr_p], #128 - ldr pol1, [ptr_p, #1*distance2/4-128] - ldr pol2, [ptr_p, #2*distance2/4-128] - ldr pol3, [ptr_p, #3*distance2/4-128] - ldr.w pol4, [ptr_p, #4*distance2/4-128] - ldr.w pol5, [ptr_p, #5*distance2/4-128] - ldr.w pol6, [ptr_p, #6*distance2/4-128] - ldr.w pol7, [ptr_p, #7*distance2/4-128] + ldr.w pol0, [ptr_p] + ldr pol1, [ptr_p, #1*distance2/4] + ldr pol2, [ptr_p, #2*distance2/4] + ldr pol3, [ptr_p, #3*distance2/4] + ldr.w pol4, [ptr_p, #4*distance2/4] + ldr.w pol5, [ptr_p, #5*distance2/4] + ldr.w pol6, [ptr_p, #6*distance2/4] + ldr.w pol7, [ptr_p, #7*distance2/4] _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - str pol1, [ptr_p, #1*distance2/4-128] - str pol2, [ptr_p, #2*distance2/4-128] - str pol3, [ptr_p, #3*distance2/4-128] - str.w pol5, [ptr_p, #5*distance2/4-128] - str.w pol6, [ptr_p, #6*distance2/4-128] - str.w pol7, [ptr_p, #7*distance2/4-128] - str pol0, [ptr_p, #-128] - str.w pol4, [ptr_p], #128 - //add.w ptr_p, #strincr2 + str pol1, [ptr_p, #1*distance2/4] + str pol2, [ptr_p, #2*distance2/4] + str pol3, [ptr_p, #3*distance2/4] + str.w pol4, [ptr_p, #4*distance2/4] + str.w pol5, [ptr_p, #5*distance2/4] + str.w pol6, [ptr_p, #6*distance2/4] + str.w pol7, [ptr_p, #7*distance2/4] + str pol0, [ptr_p] + add.w ptr_p, ptr_p, #strincr2 vmov temp_l, s10 cmp.w ptr_p, temp_l @@ -342,10 +342,10 @@ pqcrystals_dilithium_invntt_tomont: str.w pol1, [ptr_p, #256] str.w pol2, [ptr_p, #512] str.w pol3, [ptr_p, #768] - str pol0, [ptr_p], #strincr3 // @slothy:core + str pol0, [ptr_p], #strincr3 // @slothy:core // @slothy:before=cmp vmov cntr, s9 - cmp.w ptr_p, cntr + cmp.w ptr_p, cntr // @slothy:id=cmp bne.w layer78_loop //restore registers diff --git a/examples/naive/armv7m/keccakf1600_adomnicai_m7.s b/examples/naive/armv7m/keccakf1600_adomnicai_m7.s index c0c88a9f..20698a54 100644 --- a/examples/naive/armv7m/keccakf1600_adomnicai_m7.s +++ b/examples/naive/armv7m/keccakf1600_adomnicai_m7.s @@ -1144,4 +1144,4 @@ KeccakP1600_Permute_Round1Mod4: add sp, #mSize pop { r4 - r12, pc } -.size KeccakP1600_Permute, .-KeccakP1600_Permute +.size KeccakF1600_StatePermute_adomnicai_m7, .-KeccakF1600_StatePermute_adomnicai_m7 diff --git a/examples/naive/armv7m/ntt_769_dilithium.s b/examples/naive/armv7m/ntt_769_dilithium.s index 6f7c51a7..ee67b44a 100644 --- a/examples/naive/armv7m/ntt_769_dilithium.s +++ b/examples/naive/armv7m/ntt_769_dilithium.s @@ -146,7 +146,6 @@ small_ntt_asm_769: // s24: tmp // s25: twiddle_ptr vmov s24, tmp - vmov s25, twiddle_ptr layer1234_loop: // load a1, a3, ..., a15 vmov s23, poly @@ -251,10 +250,10 @@ small_ntt_asm_769: uadd16 tmp, poly0, poly1 usub16 twiddle1, poly0, poly1 str.w twiddle1, [poly, #offset] - str.w tmp, [poly], #4 // @slothy:core + str.w tmp, [poly], #4 // @slothy:core // @slothy:before=cmp vmov tmp, s24 - cmp.w poly, tmp + cmp.w poly, tmp // @slothy:id=cmp bne.w layer1234_loop sub.w poly, #8*strincr @@ -266,7 +265,6 @@ small_ntt_asm_769: add.w tmp, poly, #strincr2*16 vmov s13, tmp - vmov twiddle_ptr, s25 layer567_loop: vmov s23, poly load poly, poly0, poly1, poly2, poly3, #0, #distance2/4, #2*distance2/4, #3*distance2/4 diff --git a/examples/naive/armv7m/ntt_dilithium.s b/examples/naive/armv7m/ntt_dilithium.s index 2dcaaa4d..3c353952 100644 --- a/examples/naive/armv7m/ntt_dilithium.s +++ b/examples/naive/armv7m/ntt_dilithium.s @@ -262,10 +262,10 @@ pqcrystals_dilithium_ntt: str.w pol5, [ptr_p, #5*distance/4] str.w pol6, [ptr_p, #6*distance/4] str.w pol7, [ptr_p, #7*distance/4] - str pol0, [ptr_p], #strincr // @slothy:core=True + str pol0, [ptr_p], #strincr // @slothy:core=True // @slothy:before=cmp vmov temp_l, s9 - cmp.w ptr_p, temp_l + cmp.w ptr_p, temp_l // @slothy:id=cmp bne layer123_loop sub ptr_p, #32*4 @@ -299,9 +299,9 @@ pqcrystals_dilithium_ntt: str.w pol5, [ptr_p, #5*distance2/4] str.w pol6, [ptr_p, #6*distance2/4] str.w pol7, [ptr_p, #7*distance2/4] - str pol0, [ptr_p], #4 // @slothy:core=True + str pol0, [ptr_p], #4 // @slothy:core=True // @slothy:before=cmp vmov temp_l, s10 - cmp.w ptr_p, temp_l + cmp.w ptr_p, temp_l // @slothy:id=cmp bne layer456_loop add.w ptr_p, #112 @@ -328,8 +328,8 @@ pqcrystals_dilithium_ntt: str.w pol1, [ptr_p, #4] str.w pol2, [ptr_p, #8] str.w pol3, [ptr_p, #12] - str pol0, [ptr_p], #16 // @slothy:core=True - cmp.w ptr_p, cntr + str pol0, [ptr_p], #16 // @slothy:core=True // @slothy:before=cmp + cmp.w ptr_p, cntr // @slothy:id=cmp bne.w layer78_loop //restore registers diff --git a/examples/opt/armv7m/armv7m_simple0_func_opt_m7.s b/examples/opt/armv7m/armv7m_simple0_func_opt_m7.s index 3f31d469..f795225b 100644 --- a/examples/opt/armv7m/armv7m_simple0_func_opt_m7.s +++ b/examples/opt/armv7m/armv7m_simple0_func_opt_m7.s @@ -16,8 +16,8 @@ my_func: // Cycle bound: 5.0 // IPC bound: 1.20 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.00s + // User time: 0.00s // // ----- cycle (expected) ------> // 0 25 diff --git a/examples/opt/armv7m/armv7m_simple0_opt_m7.s b/examples/opt/armv7m/armv7m_simple0_opt_m7.s index 9e9d3fa0..f31e2af4 100644 --- a/examples/opt/armv7m/armv7m_simple0_opt_m7.s +++ b/examples/opt/armv7m/armv7m_simple0_opt_m7.s @@ -1,69 +1,96 @@ start: - // Instructions: 24 - // Expected cycles: 26 - // Expected IPC: 0.92 - // - // Cycle bound: 26.0 - // IPC bound: 0.92 - // - // Wall time: 0.20s - // User time: 0.20s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r6, [r0, #4] // *............................. // @slothy:reads=a - add r10, r2, r6 // .*............................ - eor.w r1, r10, r3 // ..*........................... - smlabt r7, r2, r2, r1 // ..*........................... - asrs r5, r7, #1 // ....*......................... - str r5, [r0, #4] // ....*......................... // @slothy:writes=a - ldm r0, {r7,r9,r11} // .....*........................ // @slothy:reads=a - add r8, r9, r7 // ........*..................... - eor.w r2, r8, r11 // .........*.................... - smlabt r12, r9, r9, r2 // .........*.................... - asrs r11, r12, #1 // ...........*.................. - str r11, [r0, #4] // ...........*.................. // @slothy:writes=a - ldm r0, {r7,r8,r10} // ............*................. // @slothy:reads=a - add r6, r8, r7 // ...............*.............. - eor.w r5, r6, r10 // ................*............. - smlabt r12, r8, r8, r5 // ................*............. - asrs r9, r12, #1 // ..................*........... - str r9, [r0, #4] // ..................*........... // @slothy:writes=a - ldm r0, {r1,r2,r8} // ...................*.......... // @slothy:reads=a - add r14, r2, r1 // ......................*....... - eor.w r5, r14, r8 // .......................*...... - smlabt r10, r2, r2, r5 // .......................*...... - asrs r3, r10, #1 // .........................*.... - str r3, [r0, #4] // .........................*.... // @slothy:writes=a - - // ------ cycle (expected) ------> + // Instructions: 37 + // Expected cycles: 36 + // Expected IPC: 1.03 + // + // Cycle bound: 36.0 + // IPC bound: 1.03 + // + // Wall time: 0.09s + // User time: 0.09s + // + // -------- cycle (expected) ---------> // 0 25 - // |------------------------|----- - // ldr r1, [r0, #4] // *.............................. - // add r1, r2,r1 // .*............................. - // eor.w r1,r1, r3 // ..*............................ - // smlabt r3,r2, r2, r1 // ..*............................ - // asrs r3, r3,#1 // ....*.......................... - // str r3, [r0,#4] // ....*.......................... - // ldm r0, {r1-r2,r14} // .....*......................... - // add r1, r2,r1 // ........*...................... - // eor.w r1,r1, r14 // .........*..................... - // smlabt r3,r2, r2, r1 // .........*..................... - // asrs r3, r3,#1 // ...........*................... - // str r3, [r0,#4] // ...........*................... - // ldm r0, {r1-r3} // ............*.................. - // add r1, r2,r1 // ...............*............... - // eor.w r1,r1, r3 // ................*.............. - // smlabt r3,r2, r2, r1 // ................*.............. - // asrs r3, r3,#1 // ..................*............ - // str r3, [r0,#4] // ..................*............ - // ldm r0, {r1,r2,r3} // ...................*........... - // add r1, r2,r1 // ......................*........ - // eor.w r1,r1, r3 // .......................*....... - // smlabt r3,r2, r2, r1 // .......................*....... - // asrs r3, r3,#1 // .........................*..... - // str r3, [r0,#4] // .........................*..... + // |------------------------|---------- + ldr r4, [r0, #4] // *................................... // @slothy:reads=a + add r4, r2, r4 // .*.................................. + eor.w r4, r4, r3 // ..*................................. + smlabt r2, r2, r2, r4 // ..*................................. + asrs r2, r2, #1 // ....*............................... + str r2, [r0, #4] // ....*............................... // @slothy:writes=a + ldr r3, [r0, #0] // ........*........................... // @slothy:reads=a + ldr r2, [r0, #8] // .........*.......................... // @slothy:reads=a + ldr r6, [r0, #4] // .........*.......................... // @slothy:reads=a + add r5, r6, r3 // ..........*......................... + eor.w r4, r5, r2 // ...........*........................ + smlabt r2, r6, r6, r4 // ...........*........................ + asrs r2, r2, #1 // .............*...................... + str r2, [r0, #4] // .............*...................... // @slothy:writes=a + ldr r4, [r0, #0] // .................*.................. // @slothy:reads=a + ldr r14, [r0, #8] // ..................*................. // @slothy:reads=a + ldr r2, [r0, #4] // ..................*................. // @slothy:reads=a + add r6, r2, r4 // ...................*................ + ldr r4, [r0, #4] // ...................*................ + eor.w r6, r6, r14 // ....................*............... + smlabt r2, r2, r2, r6 // ....................*............... + ldr r14, [r4, #4] // .....................*.............. + asrs r2, r2, #1 // ......................*............. + str r2, [r0, #4] // ......................*............. // @slothy:writes=a + ldr r3, [r0, #0] // ..........................*......... // @slothy:reads=a + ldr r5, [r0, #8] // ...........................*........ // @slothy:reads=a + ldr r6, [r0, #4] // ...........................*........ // @slothy:reads=a + add r2, r6, r3 // ............................*....... + ldr r3, [r0, #8] // ............................*....... + eor.w r2, r2, r5 // .............................*...... + smlabt r6, r6, r6, r2 // .............................*...... + ldr r2, [r4, #8] // ..............................*..... + str r14, [sp, #0] // ..............................*..... + asrs r14, r6, #1 // ...............................*.... + str r14, [r0, #4] // ...............................*.... // @slothy:writes=a + add r2, r3, r2 // ................................*... + ldr r0, [r4, #0] // ...................................* + + // -------- cycle (expected) ---------> + // 0 25 + // |------------------------|---------- + // ldr r1, [r0, #4] // *................................... + // add r1, r2, r1 // .*.................................. + // eor.w r1, r1, r3 // ..*................................. + // smlabt r3, r2, r2, r1 // ..*................................. + // asrs r3, r3, #1 // ....*............................... + // str r3, [r0, #4] // ....*............................... + // ldr r1, [r0, #0] // ........*........................... + // ldr r2, [r0, #4] // .........*.......................... + // ldr r14, [r0, #8] // .........*.......................... + // add r1, r2, r1 // ..........*......................... + // eor.w r1, r1, r14 // ...........*........................ + // smlabt r3, r2, r2, r1 // ...........*........................ + // asrs r3, r3, #1 // .............*...................... + // str r3, [r0, #4] // .............*...................... + // ldr r1, [r0, #0] // .................*.................. + // ldr r2, [r0, #4] // ..................*................. + // ldr r3, [r0, #8] // ..................*................. + // add r1, r2, r1 // ...................*................ + // eor.w r1, r1, r3 // ....................*............... + // smlabt r3, r2, r2, r1 // ....................*............... + // asrs r3, r3, #1 // ......................*............. + // str r3, [r0, #4] // ......................*............. + // ldr r1, [r0, #0] // ..........................*......... + // ldr r2, [r0, #4] // ...........................*........ + // ldr r3, [r0, #8] // ...........................*........ + // add r1, r2, r1 // ............................*....... + // eor.w r1, r1, r3 // .............................*...... + // smlabt r3, r2, r2, r1 // .............................*...... + // asrs r3, r3, #1 // ...............................*.... + // str r3, [r0, #4] // ...............................*.... + // ldr r3, [r0, #8] // ............................*....... + // ldr r0, [r0, #4] // ...................*................ + // ldr r1, [r0, #4] // .....................*.............. + // ldr r2, [r0, #8] // ..............................*..... + // ldr r0, [r0, #0] // ...................................* + // add r2, r3, r2 // ................................*... + // str r1, [sp, #0] // ..............................*..... end: + diff --git a/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s b/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s index 855e6e69..6c1ab50e 100644 --- a/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s +++ b/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s @@ -31,85 +31,89 @@ basemul_asm_acc_opt_32_32_opt_m7: // movt qinv, #27560 movw loop, #64 - // Instructions: 1 - // Expected cycles: 1 - // Expected IPC: 1.00 - // - // Cycle bound: 1.0 - // IPC bound: 1.00 - // - // Wall time: 0.00s - // User time: 0.00s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r10, [r2], #8 // *............................. + // Instructions: 2 + // Expected cycles: 2 + // Expected IPC: 1.00 + // + // Cycle bound: 2.0 + // IPC bound: 1.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r4, [r3], #4 // *............................. + ldr r8, [r3], #4 // .*............................ - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr r10, [r2], #8 // *.............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r4, [r3], #4 // *.............................. + // ldr r8, [r3], #4 // .*............................. sub r14, r14, #1 1: - // Instructions: 19 - // Expected cycles: 10 - // Expected IPC: 1.90 - // - // Cycle bound: 11.0 - // IPC bound: 1.73 - // - // Wall time: 0.32s - // User time: 0.32s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r11, [r1], #8 // *............................. - ldr.w r4, [r0, #4] // *............................. - ldr r6, [r3, #4] // .*............................ - ldr r12, [r0, #8] // .*............................ - ldr r8, [r2, #-4] // ..*........................... - smladx r5, r11, r10, r4 // ..*........................... - ldr r9, [r1, #-4] // ...*.......................... - str r5, [r0, #4] // ...*.......................... - smlad r12, r6, r8, r12 // ....*......................... - ldr r7, [r0, #12] // ....*......................... - ldr r11, [r3], #8 // .....*........................ - str r12, [r0, #8] // .....*........................ - ldr.w r4, [r0] // ......*....................... - smladx r9, r9, r8, r7 // ......*....................... - str r9, [r0, #12] // .......*...................... - subs.w r14, r14, #1 // .......*...................... - smlad r4, r11, r10, r4 // ........*..................... - ldr r10, [r2], #8 // ........e..................... - str r4, [r0], #16 // .........*.................... + // Instructions: 20 + // Expected cycles: 10 + // Expected IPC: 2.00 + // + // Cycle bound: 12.0 + // IPC bound: 1.67 + // + // Wall time: 0.48s + // User time: 0.48s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r10, [r2], #4 // *............................. + ldr.w r7, [r0] // *............................. + ldr r5, [r1], #4 // .*............................ + ldr.w r12, [r0, #8] // .*............................ + ldr.w r9, [r0, #4] // ..*........................... + smlad r7, r4, r10, r7 // ..*........................... + ldr r6, [r2], #4 // ...*.......................... + str r7, [r0], #4 // ...*.......................... + ldr.w r7, [r0, #8] // ....*......................... + smladx r5, r5, r10, r9 // ....*......................... + ldr r11, [r1], #4 // .....*........................ + smlad r10, r8, r6, r12 // .....*........................ + subs.w r14, r14, #1 // ......*....................... + str r5, [r0], #4 // ......*....................... + ldr r4, [r3], #4 // .......e...................... + smladx r7, r11, r6, r7 // .......*...................... + str r10, [r0], #4 // ........*..................... + ldr r8, [r3], #4 // ........e..................... + str r7, [r0], #4 // .........*.................... + bne.w 1b // .........*.................... // @slothy:branch // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r4, [r1], #8 // ..*.........~.........~........ - // ldr r6, [r2], #8 // e.'.......~.'.......~.'........ - // ldr.w r5, [r0] // ..'.....*...'.....~...'.....~.. - // ldr r12, [r3], #8 // ..'....*....'....~....'....~... - // ldr.w r7, [r0, #4] // ..*.........~.........~........ - // smlad r12, r12, r6, r5 // ~.'.......*.'.......~.'........ - // str r12, [r0], #16 // .~'........*'........~'........ - // smladx r11, r4, r6, r7 // ..'.*.......'.~.......'.~...... - // str r11, [r0, #-12] // ..'..*......'..~......'..~..... - // ldr r4, [r1, #-4] // ..'..*......'..~......'..~..... - // ldr r6, [r2, #-4] // ..'.*.......'.~.......'.~...... - // ldr r5, [r0, #-8] // ..'*........'~........'~....... - // ldr r12, [r3, #-4] // ..'*........'~........'~....... - // ldr r7, [r0, #-4] // ..'...*.....'...~.....'...~.... - // smlad r12, r12, r6, r5 // ..'...*.....'...~.....'...~.... - // str r12, [r0, #-8] // ..'....*....'....~....'....~... - // smladx r11, r4, r6, r7 // ..'.....*...'.....~...'.....~.. - // str r11, [r0, #-4] // ..'......*..'......~..'......~. - // subs.w r14, r14, #1 // ..'......*..'......~..'......~. + // ldr r4, [r1], #4 // ...'*........'~........'~...... + // ldr r6, [r2], #4 // ...*.........~.........~....... + // ldr.w r5, [r0] // ...*.........~.........~....... + // ldr r12, [r3], #4 // e..'......~..'......~..'....... + // ldr.w r7, [r0, #4] // ...'.*.......'.~.......'.~..... + // smlad r12, r12, r6, r5 // ...'.*.......'.~.......'.~..... + // str r12, [r0], #4 // ...'..*......'..~......'..~.... + // smladx r11, r4, r6, r7 // ...'...*.....'...~.....'...~... + // str r11, [r0], #4 // ...'.....*...'.....~...'.....~. + // ldr r4, [r1], #4 // ...'....*....'....~....'....~.. + // ldr r6, [r2], #4 // ...'..*......'..~......'..~.... + // ldr.w r5, [r0] // ...'*........'~........'~...... + // ldr r12, [r3], #4 // .e.'.......~.'.......~.'....... + // ldr.w r7, [r0, #4] // ...'...*.....'...~.....'...~... + // smlad r12, r12, r6, r5 // ...'....*....'....~....'....~.. + // str r12, [r0], #4 // .~.'.......*.'.......~.'....... + // smladx r11, r4, r6, r7 // ~..'......*..'......~..'....... + // str r11, [r0], #4 // ..~'........*'........~'....... + // subs.w r14, r14, #1 // ...'.....*...'.....~...'.....~. + // bne.w 1b // ..~'........*'........~'....... + - bne 1b // Instructions: 18 // Expected cycles: 10 // Expected IPC: 1.80 @@ -117,52 +121,51 @@ basemul_asm_acc_opt_32_32_opt_m7: // Cycle bound: 10.0 // IPC bound: 1.80 // - // Wall time: 0.02s - // User time: 0.02s + // Wall time: 0.05s + // User time: 0.05s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r8, [r1], #8 // *............................. - ldr.w r11, [r0, #4] // *............................. - ldr r7, [r3, #4] // .*............................ - ldr r9, [r3], #8 // .*............................ - smladx r12, r8, r10, r11 // ..*........................... - ldr.w r4, [r0] // ..*........................... - ldr r8, [r0, #8] // ...*.......................... - str r12, [r0, #4] // ...*.......................... - smlad r10, r9, r10, r4 // ....*......................... - ldr r11, [r2, #-4] // ....*......................... - str r10, [r0], #16 // .....*........................ - ldr r10, [r1, #-4] // .....*........................ - smlad r4, r7, r11, r8 // ......*....................... - ldr r5, [r0, #-4] // ......*....................... - subs.w r14, r14, #1 // .......*...................... - str r4, [r0, #-8] // .......*...................... - smladx r6, r10, r11, r5 // ........*..................... - str r6, [r0, #-4] // .........*.................... + ldr.w r6, [r0] // *............................. + ldr r10, [r2], #4 // *............................. + ldr.w r9, [r0, #8] // .*............................ + ldr r11, [r1], #4 // .*............................ + ldr r12, [r2], #4 // ..*........................... + smlad r5, r4, r10, r6 // ..*........................... + str r5, [r0], #4 // ...*.......................... + ldr.w r7, [r0, #0] // ...*.......................... + subs.w r14, r14, #1 // ....*......................... + smlad r5, r8, r12, r9 // ....*......................... + ldr r6, [r1], #4 // .....*........................ + smladx r10, r11, r10, r7 // .....*........................ + str r10, [r0], #4 // ......*....................... + ldr.w r10, [r0, #4] // ......*....................... + str r5, [r0], #4 // .......*...................... + smladx r9, r6, r12, r10 // ........*..................... + str r9, [r0], #4 // .........*.................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr r11, [r1], #8 // *.............................. - // ldr.w r4, [r0, #4] // *.............................. - // ldr r6, [r3, #4] // .*............................. - // ldr r12, [r0, #8] // ...*........................... - // ldr r8, [r2, #-4] // ....*.......................... - // smladx r5, r11, r10, r4 // ..*............................ - // ldr r9, [r1, #-4] // .....*......................... - // str r5, [r0, #4] // ...*........................... - // smlad r12, r6, r8, r12 // ......*........................ - // ldr r7, [r0, #12] // ......*........................ - // ldr r11, [r3], #8 // .*............................. - // str r12, [r0, #8] // .......*....................... - // ldr.w r4, [r0] // ..*............................ - // smladx r9, r9, r8, r7 // ........*...................... - // str r9, [r0, #12] // .........*..................... - // subs.w r14, r14, #1 // .......*....................... - // smlad r4, r11, r10, r4 // ....*.......................... - // str r4, [r0], #16 // .....*......................... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r10, [r2], #4 // *.............................. + // ldr.w r7, [r0] // *.............................. + // ldr r5, [r1], #4 // .*............................. + // ldr.w r12, [r0, #8] // .*............................. + // ldr.w r9, [r0, #4] // ...*........................... + // smlad r7, r4, r10, r7 // ..*............................ + // ldr r6, [r2], #4 // ..*............................ + // str r7, [r0], #4 // ...*........................... + // ldr.w r7, [r0, #8] // ......*........................ + // smladx r5, r5, r10, r9 // .....*......................... + // ldr r11, [r1], #4 // .....*......................... + // smlad r10, r8, r6, r12 // ....*.......................... + // subs.w r14, r14, #1 // ....*.......................... + // str r5, [r0], #4 // ......*........................ + // smladx r7, r11, r6, r7 // ........*...................... + // str r10, [r0], #4 // .......*....................... + // str r7, [r0], #4 // .........*..................... + // bne.w 1b // .........*..................... pop {r4-r11, pc} diff --git a/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s b/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s index 63e37d6e..4cd64405 100644 --- a/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s +++ b/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s @@ -12,7 +12,7 @@ .macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv ldr \poly0, [\bptr], #8 - ldr \res0, [\rptr_tmp], #16 // @slothy:core + ldr \res0, [\rptr_tmp], #16 // @slothy:core // @slothy:before=cmp smulwt \tmp, \zeta, \poly1 smlabt \tmp, \tmp, \q, \qa @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16_opt_m7: push {r4-r11, r14} rptr .req r0 - bptr .req r1 + bptr .req r3 aptr .req r2 zetaptr .req r3 t0 .req r4 @@ -85,7 +85,7 @@ frombytes_mul_asm_acc_32_16_opt_m7: qinv .req r11 zeta .req r12 ctr .req r14 - rptr_tmp .req r3 + rptr_tmp .req r1 movw qa, #26632 movt q, #3329 @@ -93,264 +93,258 @@ frombytes_mul_asm_acc_32_16_opt_m7: movw qinv, #62209 movt qinv, #27560 - vmov s2, zetaptr + vmov s1, r1 ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack - vmov s1, rptr_tmp + add ctr, rptr_tmp, #64*4*4 - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldrb.w r8, [r2, #5] // *............................. - ldrh.w r12, [r2, #3] // *............................. - ldrb.w r4, [r2, #2] // .*............................ - ldrh.w r6, [r2], #6 // .*............................ - ubfx r7, r12, #12, #4 // ...*.......................... - orr r5, r7, r8, lsl #4 // ....*......................... + // Instructions: 5 + // Expected cycles: 5 + // Expected IPC: 1.00 + // + // Cycle bound: 5.0 + // IPC bound: 1.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldrb.w r8, [r2, #5] // *............................. + ldrh.w r7, [r2, #3] // *............................. + ldrb.w r12, [r2, #2] // .*............................ + ldrh.w r5, [r2], #6 // .*............................ + ubfx r4, r5, #12, #4 // ....*......................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldrh.w r12, [r2, #3] // *.............................. - // ldrb.w r7, [r2, #5] // *.............................. - // ldrb.w r4, [r2, #2] // .*............................. - // ubfx r5, r12, #12, #4 // ...*........................... - // orr r5, r5, r7, lsl #4 // ....*.......................... - // ldrh.w r6, [r2], #6 // .*............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldrb.w r8, [r2, #5] // *.............................. + // ldrh.w r7, [r2, #3] // *.............................. + // ldrb.w r12, [r2, #2] // .*............................. + // ldrh.w r5, [r2], #6 // .*............................. + // ubfx r4, r5, #12, #4 // ....*.......................... sub r14, r14, #16 1: - // Instructions: 45 - // Expected cycles: 26 - // Expected IPC: 1.73 - // - // Cycle bound: 29.0 - // IPC bound: 1.55 - // - // Wall time: 10.31s - // User time: 10.31s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ubfx r7, r6, #12, #4 // *............................. - vmov r8, s2 // *............................. - orr r7, r7, r4, lsl #4 // .*............................ - ldr r4, [r8], #4 // .*............................ - ubfx r6, r6, #0, #12 // ..*........................... - orr r6, r6, r7, lsl #16 // ...*.......................... - vmov s2, r8 // ...*.......................... - ubfx r8, r12, #0, #12 // ....*......................... - smulwt r7, r4, r6 // ....*......................... - orr r5, r8, r5, lsl #16 // .....*........................ - ldr r12, [r1], #8 // .....*........................ - neg r4, r4 // ......*....................... - smlabt r8, r7, r9, r10 // ......*....................... - ldr r7, [r3], #16 // .......*...................... // @slothy:core - smulwt r4, r4, r5 // .......*...................... - smlatt r7, r12, r8, r7 // ........*..................... - smlabt r4, r4, r9, r10 // .........*.................... - ldr r8, [r3, #-12] // .........*.................... - smlabb r7, r12, r6, r7 // ..........*................... - smladx r8, r12, r6, r8 // ...........*.................. - mul r7, r7, r11 // ............*................. - ldr r6, [r1, #-4] // .............*................ - mul r8, r8, r11 // .............*................ - smlatt r12, r7, r9, r10 // ..............*............... - ldr r7, [r3, #-8] // ..............*............... - smlatt r8, r8, r9, r10 // ...............*.............. - smlatt r7, r6, r4, r7 // ................*............. - ldr r4, [r3, #-4] // ................*............. - pkhtb r12, r8, r12, asr #16 // .................*............ - str r12, [r0], #8 // .................*............ - ldrh.w r12, [r2, #3] // ..................e........... - smlabb r8, r6, r5, r7 // ..................*........... - smladx r6, r6, r5, r4 // ...................*.......... - ldrb.w r7, [r2, #5] // ...................e.......... - ldrb.w r4, [r2, #2] // ....................e......... - mul r8, r8, r11 // ....................*......... - ubfx r5, r12, #12, #4 // .....................e........ - mul r6, r6, r11 // .....................*........ - orr r5, r5, r7, lsl #4 // ......................e....... - smlatt r7, r8, r9, r10 // ......................*....... - smlatt r8, r6, r9, r10 // .......................*...... - ldrh.w r6, [r2], #6 // .......................e...... - cmp.w r3, r14 // ........................*..... - pkhtb r8, r8, r7, asr #16 // .........................*.... - str r8, [r0, #-4] // .........................*.... - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - // ldrb.w r6, [r2, #2] // ..e.....'...................~..... - // ldrh.w r7, [r2, #3] // e.......'.................~....... - // ldrb.w r8, [r2, #5] // .e......'..................~...... - // ldrh.w r4, [r2], #6 // .....e..'......................~.. - // ubfx r5, r4, #12, #4 // ........*......................... - // ubfx r4, r4, #0, #12 // ........'.*....................... - // orr r5, r5, r6, lsl #4 // ........'*........................ - // orr r4, r4, r5, lsl #16 // ........'..*...................... - // ubfx r5, r7, #12, #4 // ...e....'....................~.... - // ubfx r6, r7, #0, #12 // ........'...*..................... - // orr r5, r5, r8, lsl #4 // ....e...'.....................~... - // orr r5, r6, r5, lsl #16 // ........'....*.................... - // vmov r6, s2 // ........*......................... - // ldr r12, [r6], #4 // ........'*........................ - // vmov s2, r6 // ........'..*...................... - // ldr r8, [r1], #8 // ........'....*.................... - // ldr r6, [r3], #16 // ........'......*.................. - // smulwt r7, r12, r4 // ........'...*..................... - // smlabt r7, r7, r9, r10 // ........'.....*................... - // smlatt r7, r8, r7, r6 // ........'.......*................. - // smlabb r7, r8, r4, r7 // ........'.........*............... - // mul r7, r7, r11 // ........'...........*............. - // smlatt r7, r7, r9, r10 // ........'.............*........... - // ldr r6, [r3, #-12] // ........'........*................ - // smladx r6, r8, r4, r6 // ........'..........*.............. - // mul r6, r6, r11 // ........'............*............ - // smlatt r6, r6, r9, r10 // ........'..............*.......... - // pkhtb r6, r6, r7, asr #16 // ........'................*........ - // str r6, [r0], #8 // ........'................*........ - // neg r12, r12 // ........'.....*................... - // ldr r8, [r1, #-4] // ........'............*............ - // ldr r6, [r3, #-8] // ........'.............*........... - // smulwt r7, r12, r5 // ........'......*.................. - // smlabt r7, r7, r9, r10 // ........'........*................ - // smlatt r7, r8, r7, r6 // ........'...............*......... - // smlabb r7, r8, r5, r7 // ~.......'.................*....... - // mul r7, r7, r11 // ..~.....'...................*..... - // smlatt r7, r7, r9, r10 // ....~...'.....................*... - // ldr r6, [r3, #-4] // ........'...............*......... - // smladx r6, r8, r5, r6 // .~......'..................*...... - // mul r6, r6, r11 // ...~....'....................*.... - // smlatt r6, r6, r9, r10 // .....~..'......................*.. - // pkhtb r6, r6, r7, asr #16 // .......~'........................* - // str r6, [r0, #-4] // .......~'........................* - // cmp.w r3, r14 // ......~.'.......................*. - - bne 1b - // Instructions: 39 - // Expected cycles: 25 - // Expected IPC: 1.56 + // Instructions: 48 + // Expected cycles: 26 + // Expected IPC: 1.85 // - // Cycle bound: 25.0 - // IPC bound: 1.56 + // Cycle bound: 29.0 + // IPC bound: 1.66 // - // Wall time: 0.95s - // User time: 0.95s + // Wall time: 11.37s + // User time: 11.37s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ubfx r7, r6, #12, #4 // *............................. - vmov r8, s2 // *............................. - orr r4, r7, r4, lsl #4 // .*............................ - ldr r7, [r8], #4 // .*............................ - ubfx r6, r6, #0, #12 // ..*........................... - vmov s2, r8 // ...*.......................... - orr r8, r6, r4, lsl #16 // ...*.......................... - ubfx r6, r12, #0, #12 // ....*......................... - smulwt r4, r7, r8 // ....*......................... - neg r7, r7 // .....*........................ - orr r5, r6, r5, lsl #16 // .....*........................ - smulwt r6, r7, r5 // ......*....................... - ldr r12, [r1], #8 // ......*....................... - smlabt r4, r4, r9, r10 // .......*...................... - ldr r7, [r3], #16 // .......*...................... // @slothy:core - smlabt r6, r6, r9, r10 // ........*..................... - smlatt r4, r12, r4, r7 // .........*.................... - ldr r7, [r3, #-12] // .........*.................... - cmp.w r3, r14 // ..........*................... - smlabb r4, r12, r8, r4 // ..........*................... - smladx r7, r12, r8, r7 // ...........*.................. - mul r4, r4, r11 // ............*................. - ldr r8, [r1, #-4] // ............*................. - mul r12, r7, r11 // .............*................ - smlatt r7, r4, r9, r10 // ..............*............... - smlatt r4, r12, r9, r10 // ...............*.............. - ldr r12, [r3, #-8] // ...............*.............. - smlatt r12, r8, r6, r12 // ................*............. - ldr r6, [r3, #-4] // ................*............. - pkhtb r4, r4, r7, asr #16 // .................*............ - smlabb r12, r8, r5, r12 // .................*............ - smladx r6, r8, r5, r6 // ..................*........... - mul r7, r12, r11 // ...................*.......... - mul r12, r6, r11 // ....................*......... - smlatt r5, r7, r9, r10 // .....................*........ - smlatt r7, r12, r9, r10 // ......................*....... - str r4, [r0], #8 // .......................*...... - pkhtb r8, r7, r5, asr #16 // ........................*..... - str r8, [r0, #-4] // ........................*..... + orr r6, r4, r12, lsl #4 // *............................. + ldr.w r4, [r3], #4 // *............................. + ubfx r5, r5, #0, #12 // .*............................ + vmov r12, s1 // .*............................ + vmov s27, r3 // ..*........................... + orr r5, r5, r6, lsl #16 // ..*........................... + ubfx r3, r7, #12, #4 // ...*.......................... + smulwt r6, r4, r5 // ...*.......................... + neg r4, r4 // ....*......................... + orr r8, r3, r8, lsl #4 // ....*......................... + ubfx r7, r7, #0, #12 // .....*........................ + smlabt r6, r6, r9, r10 // .....*........................ + orr r8, r7, r8, lsl #16 // ......*....................... + ldr r3, [r12], #8 // ......*....................... + ldr r7, [r1], #16 // .......*...................... // @slothy:core // @slothy:before=cmp + smulwt r4, r4, r8 // .......*...................... + smlatt r6, r3, r6, r7 // ........*..................... + ldr r7, [r1, #-12] // .........*.................... + smlabt r4, r4, r9, r10 // .........*.................... + vmov s1, r12 // ..........*................... // @slothy:core + smlabb r6, r3, r5, r6 // ..........*................... + cmp.w r1, r14 // ...........*.................. // @slothy:id=cmp + smladx r7, r3, r5, r7 // ...........*.................. + ldr r5, [r12, #-4] // ............*................. + mul r6, r6, r11 // ............*................. + ldr r3, [r1, #-8] // .............*................ + mul r7, r7, r11 // .............*................ + ldr r12, [r1, #-4] // ..............*............... + smlatt r4, r5, r4, r3 // ..............*............... + smlabb r4, r5, r8, r4 // ...............*.............. + smladx r12, r5, r8, r12 // ................*............. + ldrb.w r8, [r2, #5] // .................e............ + smlatt r6, r6, r9, r10 // .................*............ + smlatt r3, r7, r9, r10 // ..................*........... + ldrh.w r7, [r2, #3] // ...................e.......... + mul r4, r4, r11 // ...................*.......... + pkhtb r3, r3, r6, asr #16 // ....................*......... + mul r6, r12, r11 // ....................*......... + ldrb.w r12, [r2, #2] // .....................e........ + smlatt r4, r4, r9, r10 // .....................*........ + ldrh.w r5, [r2], #6 // ......................e....... + smlatt r6, r6, r9, r10 // ......................*....... + str r3, [r0], #8 // .......................*...... + vmov r3, s27 // .......................*...... + pkhtb r4, r6, r4, asr #16 // ........................*..... + str r4, [r0, #-4] // ........................*..... + ubfx r4, r5, #12, #4 // .........................e.... + bne.w 1b // .........................*.... // @slothy:branch + + // -------- cycle (expected) --------> + // 0 25 + // |------------------------|--------- + // ldr.w r12, [r3], #4 // .........*......................... + // ldrb.w r6, [r2, #2] // ....e....'....................~.... + // ldrh.w r7, [r2, #3] // ..e......'..................~...... + // ldrb.w r8, [r2, #5] // e........'................~........ + // ldrh.w r4, [r2], #6 // .....e...'.....................~... + // ubfx r5, r4, #12, #4 // ........e'......................... + // ubfx r4, r4, #0, #12 // .........'*........................ + // orr r5, r5, r6, lsl #4 // .........*......................... + // orr r4, r4, r5, lsl #16 // .........'.*....................... + // ubfx r5, r7, #12, #4 // .........'..*...................... + // ubfx r6, r7, #0, #12 // .........'....*.................... + // orr r5, r5, r8, lsl #4 // .........'...*..................... + // orr r5, r6, r5, lsl #16 // .........'.....*................... + // vmov s2, r3 // .........'.*....................... + // vmov r3, s1 // .........'*........................ + // ldr r8, [r3], #8 // .........'.....*................... + // ldr r6, [r1], #16 // .........'......*.................. + // smulwt r7, r12, r4 // .........'..*...................... + // smlabt r7, r7, r9, r10 // .........'....*.................... + // smlatt r7, r8, r7, r6 // .........'.......*................. + // smlabb r7, r8, r4, r7 // .........'.........*............... + // mul r7, r7, r11 // .........'...........*............. + // smlatt r7, r7, r9, r10 // ~........'................*........ + // ldr r6, [r1, #-12] // .........'........*................ + // smladx r6, r8, r4, r6 // .........'..........*.............. + // mul r6, r6, r11 // .........'............*............ + // smlatt r6, r6, r9, r10 // .~.......'.................*....... + // pkhtb r6, r6, r7, asr #16 // ...~.....'...................*..... + // str r6, [r0], #8 // ......~..'......................*.. + // neg r12, r12 // .........'...*..................... + // ldr r8, [r3, #-4] // .........'...........*............. + // ldr r6, [r1, #-8] // .........'............*............ + // smulwt r7, r12, r5 // .........'......*.................. + // smlabt r7, r7, r9, r10 // .........'........*................ + // smlatt r7, r8, r7, r6 // .........'.............*........... + // smlabb r7, r8, r5, r7 // .........'..............*.......... + // mul r7, r7, r11 // ..~......'..................*...... + // smlatt r7, r7, r9, r10 // ....~....'....................*.... + // ldr r6, [r1, #-4] // .........'.............*........... + // smladx r6, r8, r5, r6 // .........'...............*......... + // mul r6, r6, r11 // ...~.....'...................*..... + // smlatt r6, r6, r9, r10 // .....~...'.....................*... + // pkhtb r6, r6, r7, asr #16 // .......~.'.......................*. + // str r6, [r0, #-4] // .......~.'.......................*. + // vmov s1, r3 // .........'.........*............... + // cmp.w r1, r14 // .........'..........*.............. + // vmov r3, s2 // ......~..'......................*.. + // bne.w 1b // ........~'........................* - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ubfx r7, r6, #12, #4 // *.............................. - // vmov r8, s2 // *.............................. - // orr r7, r7, r4, lsl #4 // .*............................. - // ldr r4, [r8], #4 // .*............................. - // ubfx r6, r6, #0, #12 // ..*............................ - // orr r6, r6, r7, lsl #16 // ...*........................... - // vmov s2, r8 // ...*........................... - // ubfx r8, r12, #0, #12 // ....*.......................... - // smulwt r7, r4, r6 // ....*.......................... - // orr r5, r8, r5, lsl #16 // .....*......................... - // ldr r12, [r1], #8 // ......*........................ - // neg r4, r4 // .....*......................... - // smlabt r8, r7, r9, r10 // .......*....................... - // ldr r7, [r3], #16 // .......*....................... - // smulwt r4, r4, r5 // ......*........................ - // smlatt r7, r12, r8, r7 // .........*..................... - // smlabt r4, r4, r9, r10 // ........*...................... - // ldr r8, [r3, #-12] // .........*..................... - // smlabb r7, r12, r6, r7 // ..........*.................... - // smladx r8, r12, r6, r8 // ...........*................... - // mul r7, r7, r11 // ............*.................. - // ldr r6, [r1, #-4] // ............*.................. - // mul r8, r8, r11 // .............*................. - // smlatt r12, r7, r9, r10 // ..............*................ - // ldr r7, [r3, #-8] // ...............*............... - // smlatt r8, r8, r9, r10 // ...............*............... - // smlatt r7, r6, r4, r7 // ................*.............. - // ldr r4, [r3, #-4] // ................*.............. - // pkhtb r12, r8, r12, asr #16 // .................*............. - // str r12, [r0], #8 // .......................*....... - // smlabb r8, r6, r5, r7 // .................*............. - // smladx r6, r6, r5, r4 // ..................*............ - // mul r8, r8, r11 // ...................*........... - // mul r6, r6, r11 // ....................*.......... - // smlatt r7, r8, r9, r10 // .....................*......... - // smlatt r8, r6, r9, r10 // ......................*........ - // cmp.w r3, r14 // ..........*.................... - // pkhtb r8, r8, r7, asr #16 // ........................*...... - // str r8, [r0, #-4] // ........................*...... + // Instructions: 43 + // Expected cycles: 26 + // Expected IPC: 1.65 + // + // Cycle bound: 26.0 + // IPC bound: 1.65 + // + // Wall time: 1.77s + // User time: 1.77s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + orr r12, r4, r12, lsl #4 // *............................. + ldr.w r4, [r3], #4 // *............................. + ubfx r5, r5, #0, #12 // .*............................ + vmov r6, s1 // .*............................ + orr r12, r5, r12, lsl #16 // ..*........................... + vmov s27, r3 // ..*........................... + ubfx r5, r7, #12, #4 // ...*.......................... + smulwt r3, r4, r12 // ...*.......................... + orr r8, r5, r8, lsl #4 // ....*......................... + ldr r5, [r6], #8 // ....*......................... + ubfx r7, r7, #0, #12 // .....*........................ + smlabt r3, r3, r9, r10 // .....*........................ + orr r7, r7, r8, lsl #16 // ......*....................... + ldr r8, [r1], #16 // ......*....................... // @slothy:core // @slothy:before=cmp + neg r4, r4 // .......*...................... + smlatt r8, r5, r3, r8 // .......*...................... + ldr r3, [r1, #-12] // ........*..................... + smlabb r8, r5, r12, r8 // ........*..................... + cmp.w r1, r14 // .........*.................... // @slothy:id=cmp + smulwt r4, r4, r7 // .........*.................... + vmov s1, r6 // ..........*................... // @slothy:core + smladx r5, r5, r12, r3 // ..........*................... + ldr r12, [r6, #-4] // ...........*.................. + smlabt r6, r4, r9, r10 // ...........*.................. + ldr r3, [r1, #-8] // ............*................. + mul r8, r8, r11 // ............*................. + ldr r4, [r1, #-4] // .............*................ + smlatt r6, r12, r6, r3 // .............*................ + vmov r3, s27 // ..............*............... + smlabb r6, r12, r7, r6 // ..............*............... + smladx r7, r12, r7, r4 // ...............*.............. + mul r12, r6, r11 // ................*............. + mul r6, r7, r11 // .................*............ + smlatt r12, r12, r9, r10 // ..................*........... + smlatt r7, r6, r9, r10 // ...................*.......... + mul r4, r5, r11 // ....................*......... + pkhtb r5, r7, r12, asr #16 // .....................*........ + smlatt r8, r8, r9, r10 // .....................*........ + smlatt r4, r4, r9, r10 // ......................*....... + pkhtb r6, r4, r8, asr #16 // ........................*..... + str r6, [r0], #8 // ........................*..... + str r5, [r0, #-4] // .........................*.... - // Original code - // ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack - // vmov s1, tmp - // vmov s2, zetaptr - // add ctr, tmp, #64*4*4 - // 1: - // vmov zetaptr, s2 - // ldr.w zeta, [zetaptr], #4 - // deserialize aptr, tmp, tmp2, tmp3, t0, t1 - // vmov s2, zetaptr - // vmov rptr_tmp, s1 - // doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - // vmov s1, rptr_tmp - // cmp.w rptr_tmp, ctr - // bne.w 1b + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // orr r6, r4, r12, lsl #4 // *.............................. + // ldr.w r4, [r3], #4 // *.............................. + // ubfx r5, r5, #0, #12 // .*............................. + // vmov r12, s1 // .*............................. + // vmov s27, r3 // ..*............................ + // orr r5, r5, r6, lsl #16 // ..*............................ + // ubfx r3, r7, #12, #4 // ...*........................... + // smulwt r6, r4, r5 // ...*........................... + // neg r4, r4 // .......*....................... + // orr r8, r3, r8, lsl #4 // ....*.......................... + // ubfx r7, r7, #0, #12 // .....*......................... + // smlabt r6, r6, r9, r10 // .....*......................... + // orr r8, r7, r8, lsl #16 // ......*........................ + // ldr r3, [r12], #8 // ....*.......................... + // ldr r7, [r1], #16 // ......*........................ + // smulwt r4, r4, r8 // .........*..................... + // smlatt r6, r3, r6, r7 // .......*....................... + // ldr r7, [r1, #-12] // ........*...................... + // smlabt r4, r4, r9, r10 // ...........*................... + // vmov s1, r12 // ..........*.................... + // smlabb r6, r3, r5, r6 // ........*...................... + // cmp.w r1, r14 // .........*..................... + // smladx r7, r3, r5, r7 // ..........*.................... + // ldr r5, [r12, #-4] // ...........*................... + // mul r6, r6, r11 // ............*.................. + // ldr r3, [r1, #-8] // ............*.................. + // mul r7, r7, r11 // ....................*.......... + // ldr r12, [r1, #-4] // .............*................. + // smlatt r4, r5, r4, r3 // .............*................. + // smlabb r4, r5, r8, r4 // ..............*................ + // smladx r12, r5, r8, r12 // ...............*............... + // smlatt r6, r6, r9, r10 // .....................*......... + // smlatt r3, r7, r9, r10 // ......................*........ + // mul r4, r4, r11 // ................*.............. + // pkhtb r3, r3, r6, asr #16 // ........................*...... + // mul r6, r12, r11 // .................*............. + // smlatt r4, r4, r9, r10 // ..................*............ + // smlatt r6, r6, r9, r10 // ...................*........... + // str r3, [r0], #8 // ........................*...... + // vmov r3, s27 // ..............*................ + // pkhtb r4, r6, r4, asr #16 // .....................*......... + // str r4, [r0, #-4] // .........................*..... + // bne.w 1b // .........................*..... -pop {r4-r11, pc} +pop {r4-r11, pc} .size frombytes_mul_asm_acc_32_16_opt_m7, .-frombytes_mul_asm_acc_32_16_opt_m7 \ No newline at end of file diff --git a/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s b/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s index 0f35011c..df797e2c 100644 --- a/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s +++ b/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s @@ -32,7 +32,7 @@ // r[1] in upper half of tmp2 pkhtb \tmp, \tmp2, \tmp, asr #16 uadd16 \res0, \res0, \tmp - str \res0, [\rptr], #8 // @slothy:core + str \res0, [\rptr], #8 // @slothy:core // @slothy:before=cmp neg \zeta, \zeta @@ -101,237 +101,249 @@ frombytes_mul_asm_acc_opt_m7: movt qinv, #27560 add ctr, rptr, #64*4*2 - vmov s0, ctr - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldrh.w r8, [r2, #3] // *............................. - ldrb.w r5, [r2, #5] // .*............................ - ldrb.w r4, [r2, #2] // ..*........................... - ldrh.w r7, [r2], #6 // ..*........................... - ubfx r12, r8, #12, #4 // ...*.......................... - orr r12, r12, r5, lsl #4 // ....*......................... + // Instructions: 5 + // Expected cycles: 5 + // Expected IPC: 1.00 + // + // Cycle bound: 5.0 + // IPC bound: 1.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldrb.w r8, [r2, #2] // *............................. + ldrh.w r5, [r2, #3] // *............................. + ldrb.w r7, [r2, #5] // .*............................ + ldrh.w r4, [r2], #6 // .*............................ + ubfx r6, r4, #12, #4 // ....*......................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldrh.w r8, [r2, #3] // *.............................. - // ldrb.w r4, [r2, #5] // .*............................. - // ubfx r12, r8, #12, #4 // ...*........................... - // orr r12, r12, r4, lsl #4 // ....*.......................... - // ldrb.w r4, [r2, #2] // ..*............................ - // ldrh.w r7, [r2], #6 // ..*............................ + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldrb.w r8, [r2, #2] // *.............................. + // ldrh.w r5, [r2, #3] // *.............................. + // ldrb.w r7, [r2, #5] // .*............................. + // ldrh.w r4, [r2], #6 // .*............................. + // ubfx r6, r4, #12, #4 // ....*.......................... - push {ctr} - vmov ctr, s0 - sub ctr, ctr, #8 - vmov s0, ctr - pop {ctr} + push {r14} + vmov r14, s0 + sub r14, r14, #8 + vmov s0, r14 + pop {r14} 1: - // Instructions: 42 - // Expected cycles: 25 - // Expected IPC: 1.68 - // - // Cycle bound: 27.0 - // IPC bound: 1.56 - // - // Wall time: 3.82s - // User time: 3.82s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ubfx r6, r7, #12, #4 // *............................. - ldr.w r5, [r3], #4 // *............................. - orr r6, r6, r4, lsl #4 // .*............................ - ldr r14, [r0] // .*............................ - ubfx r4, r7, #0, #12 // ..*........................... - orr r4, r4, r6, lsl #16 // ...*.......................... - ldr r7, [r1], #8 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smulwt r6, r5, r4 // ....*......................... - orr r12, r8, r12, lsl #16 // .....*........................ - smuadx r8, r7, r4 // .....*........................ - neg r5, r5 // ......*....................... - smlabt r6, r6, r9, r10 // ......*....................... - mul r8, r8, r11 // .......*...................... - smultt r6, r7, r6 // ........*..................... - smlabb r7, r7, r4, r6 // .........*.................... - smulwt r6, r5, r12 // ..........*................... - mul r4, r7, r11 // ...........*.................. - smlabt r5, r6, r9, r10 // ............*................. - ldr r6, [r1, #-4] // .............*................ - smlatt r7, r4, r9, r10 // .............*................ - smlatt r4, r8, r9, r10 // ..............*............... - ldrh.w r8, [r2, #3] // ...............e.............. - smultt r5, r6, r5 // ...............*.............. - pkhtb r7, r4, r7, asr #16 // ................*............. - smlabb r5, r6, r12, r5 // ................*............. - ldrb.w r4, [r2, #5] // .................e............ - smuadx r6, r6, r12 // .................*............ - uadd16 r14, r14, r7 // ..................*........... - mul r7, r5, r11 // ..................*........... - ubfx r12, r8, #12, #4 // ...................e.......... - mul r5, r6, r11 // ...................*.......... - orr r12, r12, r4, lsl #4 // ....................e......... - smlatt r6, r7, r9, r10 // ....................*......... - ldrb.w r4, [r2, #2] // .....................e........ - smlatt r5, r5, r9, r10 // .....................*........ - str r14, [r0], #8 // ......................*....... // @slothy:core - ldrh.w r7, [r2], #6 // ......................e....... - pkhtb r6, r5, r6, asr #16 // .......................*...... - ldr r14, [r0, #-4] // .......................*...... - uadd16 r14, r14, r6 // ........................*..... - str r14, [r0, #-4] // ........................*..... + // Instructions: 46 + // Expected cycles: 26 + // Expected IPC: 1.77 + // + // Cycle bound: 28.0 + // IPC bound: 1.64 + // + // Wall time: 8.16s + // User time: 8.16s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + orr r12, r6, r8, lsl #4 // *............................. + ldr.w r6, [r3], #4 // *............................. + ubfx r8, r4, #0, #12 // .*............................ + ldr r4, [r1], #8 // .*............................ + orr r8, r8, r12, lsl #16 // ..*........................... + vmov s0, r14 // ..*........................... + ubfx r12, r5, #12, #4 // ...*.......................... + smuadx r14, r4, r8 // ...*.......................... + orr r12, r12, r7, lsl #4 // ....*......................... + smulwt r7, r6, r8 // ....*......................... + ubfx r5, r5, #0, #12 // .....*........................ + mul r14, r14, r11 // .....*........................ + orr r12, r5, r12, lsl #16 // ......*....................... + smlabt r5, r7, r9, r10 // ......*....................... + neg r6, r6 // .......*...................... + smlatt r7, r14, r9, r10 // .......*...................... + smultt r5, r4, r5 // ........*..................... + smlabb r4, r4, r8, r5 // .........*.................... + ldr r8, [r1, #-4] // ..........*................... + smulwt r5, r6, r12 // ..........*................... + mul r6, r4, r11 // ...........*.................. + ldr r4, [r0] // ............*................. + smlabt r5, r5, r9, r10 // ............*................. + smlatt r6, r6, r9, r10 // .............*................ + smultt r5, r8, r5 // ..............*............... + vmov r14, s0 // ...............*.............. + smlabb r5, r8, r12, r5 // ...............*.............. + pkhtb r6, r7, r6, asr #16 // ................*............. + smuadx r7, r8, r12 // ................*............. + uadd16 r4, r4, r6 // .................*............ + mul r6, r5, r11 // .................*............ + ldrb.w r8, [r2, #2] // ..................e........... + mul r12, r7, r11 // ..................*........... + ldrh.w r5, [r2, #3] // ...................e.......... + smlatt r6, r6, r9, r10 // ...................*.......... + ldrb.w r7, [r2, #5] // ....................e......... + smlatt r12, r12, r9, r10 // ....................*......... + str r4, [r0], #8 // .....................*........ // @slothy:core // @slothy:before=cmp + ldrh.w r4, [r2], #6 // .....................e........ + pkhtb r6, r12, r6, asr #16 // ......................*....... + ldr r12, [r0, #-4] // ......................*....... + uadd16 r12, r12, r6 // .......................*...... + cmp.w r0, r14 // .......................*...... // @slothy:id=cmp + ubfx r6, r4, #12, #4 // ........................e..... + str r12, [r0, #-4] // ........................*..... + bne.w 1b // .........................*.... // @slothy:branch - // -------- cycle (expected) --------> + // ------- cycle (expected) --------> // 0 25 - // |------------------------|--------- - // ldr.w r12, [r3], #4 // ..........*........................ - // ldrb.w r6, [r2, #2] // ......e...'....................~... - // ldrh.w r7, [r2, #3] // e.........'..............~......... - // ldrb.w r8, [r2, #5] // ..e.......'................~....... - // ldrh.w r4, [r2], #6 // .......e..'.....................~.. - // ubfx r5, r4, #12, #4 // ..........*........................ - // ubfx r4, r4, #0, #12 // ..........'.*...................... - // orr r5, r5, r6, lsl #4 // ..........'*....................... - // orr r4, r4, r5, lsl #16 // ..........'..*..................... - // ubfx r5, r7, #12, #4 // ....e.....'..................~..... - // ubfx r6, r7, #0, #12 // ..........'...*.................... - // orr r5, r5, r8, lsl #4 // .....e....'...................~.... - // orr r5, r6, r5, lsl #16 // ..........'....*................... - // ldr r8, [r1], #8 // ..........'..*..................... - // ldr r14, [r0] // ..........'*....................... - // smulwt r6, r12, r4 // ..........'...*.................... - // smlabt r6, r6, r9, r10 // ..........'.....*.................. - // smultt r6, r8, r6 // ..........'.......*................ - // smlabb r6, r8, r4, r6 // ..........'........*............... - // mul r6, r6, r11 // ..........'..........*............. - // smlatt r6, r6, r9, r10 // ..........'............*........... - // smuadx r7, r8, r4 // ..........'....*................... - // mul r7, r7, r11 // ..........'......*................. - // smlatt r7, r7, r9, r10 // ..........'.............*.......... - // pkhtb r6, r7, r6, asr #16 // .~........'...............*........ - // uadd16 r14, r14, r6 // ...~......'.................*...... - // str r14, [r0], #8 // .......~..'.....................*.. - // neg r12, r12 // ..........'.....*.................. - // ldr r8, [r1, #-4] // ..........'............*........... - // ldr r14, [r0, #-4] // ........~.'......................*. - // smulwt r6, r12, r5 // ..........'.........*.............. - // smlabt r6, r6, r9, r10 // ..........'...........*............ - // smultt r6, r8, r6 // ~.........'..............*......... - // smlabb r6, r8, r5, r6 // .~........'...............*........ - // mul r6, r6, r11 // ...~......'.................*...... - // smlatt r6, r6, r9, r10 // .....~....'...................*.... - // smuadx r7, r8, r5 // ..~.......'................*....... - // mul r7, r7, r11 // ....~.....'..................*..... - // smlatt r7, r7, r9, r10 // ......~...'....................*... - // pkhtb r6, r7, r6, asr #16 // ........~.'......................*. - // uadd16 r14, r14, r6 // .........~'.......................* - // str r14, [r0, #-4] // .........~'.......................* + // |------------------------|-------- + // ldr.w r12, [r3], #4 // ........*......................... + // ldrb.w r6, [r2, #2] // e.......'.................~....... + // ldrh.w r7, [r2, #3] // .e......'..................~...... + // ldrb.w r8, [r2, #5] // ..e.....'...................~..... + // ldrh.w r4, [r2], #6 // ...e....'....................~.... + // ubfx r5, r4, #12, #4 // ......e.'.......................~. + // ubfx r4, r4, #0, #12 // ........'*........................ + // orr r5, r5, r6, lsl #4 // ........*......................... + // orr r4, r4, r5, lsl #16 // ........'.*....................... + // ubfx r5, r7, #12, #4 // ........'..*...................... + // ubfx r6, r7, #0, #12 // ........'....*.................... + // orr r5, r5, r8, lsl #4 // ........'...*..................... + // orr r5, r6, r5, lsl #16 // ........'.....*................... + // vmov s0, r14 // ........'.*....................... + // ldr r8, [r1], #8 // ........'*........................ + // ldr r14, [r0] // ........'...........*............. + // smulwt r6, r12, r4 // ........'...*..................... + // smlabt r6, r6, r9, r10 // ........'.....*................... + // smultt r6, r8, r6 // ........'.......*................. + // smlabb r6, r8, r4, r6 // ........'........*................ + // mul r6, r6, r11 // ........'..........*.............. + // smlatt r6, r6, r9, r10 // ........'............*............ + // smuadx r7, r8, r4 // ........'..*...................... + // mul r7, r7, r11 // ........'....*.................... + // smlatt r7, r7, r9, r10 // ........'......*.................. + // pkhtb r6, r7, r6, asr #16 // ........'...............*......... + // uadd16 r14, r14, r6 // ........'................*........ + // str r14, [r0], #8 // ...~....'....................*.... + // neg r12, r12 // ........'......*.................. + // ldr r8, [r1, #-4] // ........'.........*............... + // ldr r14, [r0, #-4] // ....~...'.....................*... + // smulwt r6, r12, r5 // ........'.........*............... + // smlabt r6, r6, r9, r10 // ........'...........*............. + // smultt r6, r8, r6 // ........'.............*........... + // smlabb r6, r8, r5, r6 // ........'..............*.......... + // mul r6, r6, r11 // ........'................*........ + // smlatt r6, r6, r9, r10 // .~......'..................*...... + // smuadx r7, r8, r5 // ........'...............*......... + // mul r7, r7, r11 // ~.......'.................*....... + // smlatt r7, r7, r9, r10 // ..~.....'...................*..... + // pkhtb r6, r7, r6, asr #16 // ....~...'.....................*... + // uadd16 r14, r14, r6 // .....~..'......................*.. + // str r14, [r0, #-4] // ......~.'.......................*. + // vmov r14, s0 // ........'..............*.......... + // cmp.w r0, r14 // .....~..'......................*.. + // bne.w 1b // .......~'........................* - vmov ctr, s0 - cmp rptr, ctr - bne 1b - // Instructions: 36 - // Expected cycles: 25 - // Expected IPC: 1.44 + + // Instructions: 41 + // Expected cycles: 26 + // Expected IPC: 1.58 // - // Cycle bound: 25.0 - // IPC bound: 1.44 + // Cycle bound: 26.0 + // IPC bound: 1.58 // - // Wall time: 0.44s - // User time: 0.44s + // Wall time: 1.19s + // User time: 1.19s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ubfx r6, r7, #12, #4 // *............................. - ldr r5, [r1], #8 // *............................. - orr r4, r6, r4, lsl #4 // .*............................ + ldr r12, [r1], #8 // *............................. + orr r8, r6, r8, lsl #4 // *............................. + ubfx r4, r4, #0, #12 // .*............................ ldr.w r6, [r3], #4 // .*............................ - ubfx r7, r7, #0, #12 // ..*........................... - ldr r14, [r0] // ..*........................... - orr r7, r7, r4, lsl #16 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smulwt r4, r6, r7 // ....*......................... - orr r8, r8, r12, lsl #16 // .....*........................ - smuadx r12, r5, r7 // .....*........................ - neg r6, r6 // ......*....................... - smlabt r4, r4, r9, r10 // ......*....................... - mul r12, r12, r11 // .......*...................... - smultt r4, r5, r4 // ........*..................... - smlabb r5, r5, r7, r4 // .........*.................... - ldr r7, [r1, #-4] // ..........*................... - smulwt r6, r6, r8 // ..........*................... - mul r4, r5, r11 // ...........*.................. - smlabt r5, r6, r9, r10 // ............*................. - smlatt r4, r4, r9, r10 // .............*................ - smlatt r6, r12, r9, r10 // ..............*............... - smultt r5, r7, r5 // ...............*.............. - pkhtb r12, r6, r4, asr #16 // ................*............. - smlabb r4, r7, r8, r5 // ................*............. - uadd16 r14, r14, r12 // .................*............ - smuadx r8, r7, r8 // .................*............ - mul r4, r4, r11 // ..................*........... - mul r8, r8, r11 // ...................*.......... - smlatt r6, r4, r9, r10 // ....................*......... - smlatt r4, r8, r9, r10 // .....................*........ - str r14, [r0], #8 // ......................*....... // @slothy:core - pkhtb r4, r4, r6, asr #16 // .......................*...... - ldr r14, [r0, #-4] // .......................*...... - uadd16 r14, r14, r4 // ........................*..... - str r14, [r0, #-4] // ........................*..... + vmov s0, r14 // ..*........................... + orr r8, r4, r8, lsl #16 // ..*........................... + ubfx r14, r5, #12, #4 // ...*.......................... + smulwt r4, r6, r8 // ...*.......................... + orr r7, r14, r7, lsl #4 // ....*......................... + smuadx r14, r12, r8 // ....*......................... + ubfx r5, r5, #0, #12 // .....*........................ + smlabt r4, r4, r9, r10 // .....*........................ + orr r5, r5, r7, lsl #16 // ......*....................... + mul r14, r14, r11 // ......*....................... + neg r6, r6 // .......*...................... + smultt r4, r12, r4 // .......*...................... + smlabb r8, r12, r8, r4 // ........*..................... + ldr r12, [r1, #-4] // .........*.................... + smulwt r7, r6, r5 // .........*.................... + mul r8, r8, r11 // ..........*................... + smlabt r7, r7, r9, r10 // ...........*.................. + smlatt r8, r8, r9, r10 // ............*................. + ldr r6, [r0] // .............*................ + smlatt r4, r14, r9, r10 // .............*................ + vmov r14, s0 // ..............*............... + smultt r7, r12, r7 // ..............*............... + pkhtb r4, r4, r8, asr #16 // ...............*.............. + smlabb r8, r12, r5, r7 // ...............*.............. + uadd16 r4, r6, r4 // ................*............. + str r4, [r0], #8 // ................*............. // @slothy:core // @slothy:before=cmp + cmp.w r0, r14 // .................*............ // @slothy:id=cmp + smuadx r4, r12, r5 // .................*............ + ldr r7, [r0, #-4] // ..................*........... + mul r12, r8, r11 // ..................*........... + mul r5, r4, r11 // ...................*.......... + smlatt r12, r12, r9, r10 // ....................*......... + smlatt r5, r5, r9, r10 // .....................*........ + pkhtb r8, r5, r12, asr #16 // .......................*...... + uadd16 r8, r7, r8 // ........................*..... + str r8, [r0, #-4] // ........................*..... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ubfx r6, r7, #12, #4 // *.............................. - // ldr.w r5, [r3], #4 // .*............................. - // orr r6, r6, r4, lsl #4 // .*............................. - // ldr r14, [r0] // ..*............................ - // ubfx r4, r7, #0, #12 // ..*............................ - // orr r4, r4, r6, lsl #16 // ...*........................... - // ldr r7, [r1], #8 // *.............................. - // ubfx r8, r8, #0, #12 // ....*.......................... - // smulwt r6, r5, r4 // ....*.......................... - // orr r12, r8, r12, lsl #16 // .....*......................... - // smuadx r8, r7, r4 // .....*......................... - // neg r5, r5 // ......*........................ - // smlabt r6, r6, r9, r10 // ......*........................ - // mul r8, r8, r11 // .......*....................... - // smultt r6, r7, r6 // ........*...................... - // smlabb r7, r7, r4, r6 // .........*..................... - // smulwt r6, r5, r12 // ..........*.................... - // mul r4, r7, r11 // ...........*................... - // smlabt r5, r6, r9, r10 // ............*.................. - // ldr r6, [r1, #-4] // ..........*.................... - // smlatt r7, r4, r9, r10 // .............*................. - // smlatt r4, r8, r9, r10 // ..............*................ - // smultt r5, r6, r5 // ...............*............... - // pkhtb r7, r4, r7, asr #16 // ................*.............. - // smlabb r5, r6, r12, r5 // ................*.............. - // smuadx r6, r6, r12 // .................*............. - // uadd16 r14, r14, r7 // .................*............. - // mul r7, r5, r11 // ..................*............ - // mul r5, r6, r11 // ...................*........... - // smlatt r6, r7, r9, r10 // ....................*.......... - // smlatt r5, r5, r9, r10 // .....................*......... - // str r14, [r0], #8 // ......................*........ - // pkhtb r6, r5, r6, asr #16 // .......................*....... - // ldr r14, [r0, #-4] // .......................*....... - // uadd16 r14, r14, r6 // ........................*...... - // str r14, [r0, #-4] // ........................*...... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // orr r12, r6, r8, lsl #4 // *.............................. + // ldr.w r6, [r3], #4 // .*............................. + // ubfx r8, r4, #0, #12 // .*............................. + // ldr r4, [r1], #8 // *.............................. + // orr r8, r8, r12, lsl #16 // ..*............................ + // vmov s0, r14 // ..*............................ + // ubfx r12, r5, #12, #4 // ...*........................... + // smuadx r14, r4, r8 // ....*.......................... + // orr r12, r12, r7, lsl #4 // ....*.......................... + // smulwt r7, r6, r8 // ...*........................... + // ubfx r5, r5, #0, #12 // .....*......................... + // mul r14, r14, r11 // ......*........................ + // orr r12, r5, r12, lsl #16 // ......*........................ + // smlabt r5, r7, r9, r10 // .....*......................... + // neg r6, r6 // .......*....................... + // smlatt r7, r14, r9, r10 // .............*................. + // smultt r5, r4, r5 // .......*....................... + // smlabb r4, r4, r8, r5 // ........*...................... + // ldr r8, [r1, #-4] // .........*..................... + // smulwt r5, r6, r12 // .........*..................... + // mul r6, r4, r11 // ..........*.................... + // ldr r4, [r0] // .............*................. + // smlabt r5, r5, r9, r10 // ...........*................... + // smlatt r6, r6, r9, r10 // ............*.................. + // smultt r5, r8, r5 // ..............*................ + // vmov r14, s0 // ..............*................ + // smlabb r5, r8, r12, r5 // ...............*............... + // pkhtb r6, r7, r6, asr #16 // ...............*............... + // smuadx r7, r8, r12 // .................*............. + // uadd16 r4, r4, r6 // ................*.............. + // mul r6, r5, r11 // ..................*............ + // mul r12, r7, r11 // ...................*........... + // smlatt r6, r6, r9, r10 // ....................*.......... + // smlatt r12, r12, r9, r10 // .....................*......... + // str r4, [r0], #8 // ................*.............. + // pkhtb r6, r12, r6, asr #16 // .......................*....... + // ldr r12, [r0, #-4] // ..................*............ + // uadd16 r12, r12, r6 // ........................*...... + // cmp.w r0, r14 // .................*............. + // str r12, [r0, #-4] // ........................*...... + // bne.w 1b // .........................*..... pop {r4-r11, pc} diff --git a/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s b/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s index 3ae57ae4..3cdbce34 100644 --- a/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s +++ b/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s @@ -200,288 +200,297 @@ pqcrystals_dilithium_invntt_tomont_opt_m7: add.w temp_l, ptr_p, #32*strincr // 32 iterations vmov s9, temp_l - // Instructions: 2 - // Expected cycles: 2 - // Expected IPC: 1.00 - // - // Cycle bound: 2.0 - // IPC bound: 1.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - vmov r1, s6 // *............................. - vmov r12, s8 // .*............................ + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Cycle bound: 1.0 + // IPC bound: 1.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r7, [r0, #16] // *............................. - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // vmov r1, s6 // *.............................. - // vmov r12, s8 // .*............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r7, [r0, #16] // *.............................. - push {temp_l} - vmov temp_l, s9 - sub temp_l, temp_l, #32 - vmov s9, temp_l - pop {temp_l} + push {r10} + vmov r10, s9 + sub r10, r10, #32 + vmov s9, r10 + pop {r10} layer123_loop: - // Instructions: 55 - // Expected cycles: 28 - // Expected IPC: 1.96 - // - // Cycle bound: 28.0 - // IPC bound: 1.96 - // - // Wall time: 68.83s - // User time: 68.83s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r5, [r0, #16] // *............................. - ldr.w r6, [r0, #20] // *............................. - ldr.w r4, [r0, #12] // .*............................ - ldr.w r9, [r0, #24] // .*............................ - ldr.w r7, [r0, #28] // ..*........................... - add r5, r6 // ..*........................... - add r9, r7 // ...*.......................... - sub.w r8, r5, r6, lsl #1 // ...*.......................... - add.w r6, r5, r9 // ....*......................... - smull r5, r11, r8, r1 // ....*......................... - sub.w r7, r9, r7, lsl #1 // .....*........................ - smull r14, r10, r8, r12 // .....*........................ - sub.w r8, r6, r9, lsl #1 // ......*....................... - smlal r5, r11, r7, r12 // ......*....................... - ldr.w r12, [r0, #8] // .......*...................... - smlal r14, r10, r7, r1 // .......*...................... - ldr.w r9, [r0] // ........*..................... - mul r7, r5, r2 // ........*..................... - add r12, r4 // .........*.................... - mul r1, r14, r2 // .........*.................... - sub.w r4, r12, r4, lsl #1 // ..........*................... - smlal r5, r11, r7, r3 // ..........*................... - vmov r7, s4 // ...........*.................. - smlal r14, r10, r1, r3 // ...........*.................. - ldr.w r14, [r0, #4] // ............*................. - smull r1, r4, r4, r7 // ............*................. - add r9, r14 // .............*................ - smull r5, r8, r8, r7 // .............*................ - mul r7, r1, r2 // ..............*............... - sub.w r14, r9, r14, lsl #1 // ...............*.............. - add r9, r12 // ...............*.............. - sub.w r12, r9, r12, lsl #1 // ................*............. - smlal r1, r4, r7, r3 // ................*............. - add r9, r6 // .................*............ - mul r1, r5, r2 // .................*............ - add r14, r4 // ..................*........... - sub.w r7, r9, r6, lsl #1 // ..................*........... - sub.w r4, r14, r4, lsl #1 // ...................*.......... - smlal r5, r8, r1, r3 // ...................*.......... - add r4, r10 // ....................*......... - str.w r4, [r0, #12] // ....................*......... - add r14, r11 // .....................*........ - str.w r7, [r0, #16] // .....................*........ - add r12, r8 // ......................*....... - str.w r12, [r0, #8] // ......................*....... - sub.w r11, r14, r11, lsl #1 // .......................*...... - str.w r11, [r0, #20] // .......................*...... - sub.w r8, r12, r8, lsl #1 // ........................*..... - str.w r8, [r0, #24] // ........................*..... - sub.w r6, r4, r10, lsl #1 // .........................*.... - str.w r6, [r0, #28] // .........................*.... - vmov r1, s6 // ..........................e... - str.w r14, [r0, #4] // ..........................*... - vmov r12, s8 // ...........................e.. - str.w r9, [r0], #32 // ...........................*.. + // Instructions: 58 + // Expected cycles: 29 + // Expected IPC: 2.00 + // + // Cycle bound: 29.0 + // IPC bound: 2.00 + // + // Wall time: 300.10s + // User time: 300.10s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r9, [r0, #20] // *............................. + ldr.w r12, [r0, #24] // *............................. + vmov r8, s8 // .*............................ + ldr.w r10, [r0, #28] // .*............................ + add r7, r9 // ..*........................... + add r12, r10 // ..*........................... + vmov r11, s6 // ...*.......................... + sub.w r14, r7, r9, lsl #1 // ...*.......................... + sub.w r1, r12, r10, lsl #1 // ....*......................... + smull r4, r6, r14, r11 // ....*......................... + ldr.w r10, [r0, #8] // .....*........................ + smlal r4, r6, r1, r8 // .....*........................ + add.w r5, r7, r12 // ......*....................... + smull r14, r7, r14, r8 // ......*....................... + ldr.w r9, [r0, #12] // .......*...................... + mul r8, r4, r2 // .......*...................... + add r10, r9 // ........*..................... + smlal r14, r7, r1, r11 // ........*..................... + ldr.w r11, [r0] // .........*.................... + smlal r4, r6, r8, r3 // .........*.................... + ldr.w r8, [r0, #4] // ..........*................... + mul r4, r14, r2 // ..........*................... + sub.w r1, r5, r12, lsl #1 // ...........*.................. + vmov r12, s4 // ...........*.................. + sub.w r9, r10, r9, lsl #1 // ............*................. + smlal r14, r7, r4, r3 // ............*................. + add r11, r8 // .............*................ + smull r4, r9, r9, r12 // .............*................ + sub.w r14, r11, r8, lsl #1 // ..............*............... + add r11, r10 // ..............*............... + sub.w r8, r11, r10, lsl #1 // ...............*.............. + mul r10, r4, r2 // ...............*.............. + add r11, r5 // ................*............. + smull r1, r12, r1, r12 // ................*............. + sub.w r5, r11, r5, lsl #1 // .................*............ + smlal r4, r9, r10, r3 // .................*............ + vmov r10, s9 // ..................*........... + mul r4, r1, r2 // ..................*........... + str.w r11, [r0], #32 // ...................*.......... // @slothy:before=cmp + add r14, r9 // ...................*.......... + sub.w r11, r14, r9, lsl #1 // ....................*......... + add r14, r6 // ....................*......... + sub.w r9, r14, r6, lsl #1 // .....................*........ + smlal r1, r12, r4, r3 // .....................*........ + add r11, r7 // ......................*....... + str r9, [r0, #-12] // ......................*....... + add r8, r12 // .......................*...... + str r5, [r0, #-16] // .......................*...... + sub.w r4, r8, r12, lsl #1 // ........................*..... + str r4, [r0, #-8] // ........................*..... + sub.w r7, r11, r7, lsl #1 // .........................*.... + str r7, [r0, #-4] // .........................*.... + cmp.w r0, r10 // ..........................*... // @slothy:id=cmp + str r14, [r0, #-28] // ..........................*... + str r8, [r0, #-24] // ...........................*.. + ldr.w r7, [r0, #16] // ...........................e.. + str r11, [r0, #-20] // ............................*. + bne.w layer123_loop // ............................*. // @slothy:branch // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w R4, [R0, #4*16/4] // ..*............................ + // ldr.w R4, [R0, #4*16/4] // e.'..........................~. // ldr.w R6, [R0, #5*16/4] // ..*............................ - // ldr.w R12, [R0, #6*16/4] // ..'*........................... - // ldr.w R8, [R0, #7*16/4] // ..'.*.......................... + // ldr.w R12, [R0, #6*16/4] // ..*............................ + // ldr.w R8, [R0, #7*16/4] // ..'*........................... // add R4, R6 // ..'.*.......................... - // add R12, R8 // ..'..*......................... + // add R12, R8 // ..'.*.......................... // sub.w R6, R4, R6, lsl #1 // ..'..*......................... - // sub.w R8, R12, R8, lsl #1 // ..'....*....................... - // add.w R4, R4, R12 // ..'...*........................ - // sub.w R12, R4, R12, lsl #1 // ..'.....*...................... - // vmov R9, s6 // e.'.........................~.. - // vmov R10, s8 // .e'..........................~. + // sub.w R8, R12, R8, lsl #1 // ..'...*........................ + // add.w R4, R4, R12 // ..'.....*...................... + // sub.w R12, R4, R12, lsl #1 // ..'..........*................. + // vmov R9, s6 // ..'..*......................... + // vmov R10, s8 // ..'*........................... // smull R5, R11, R6, R9 // ..'...*........................ - // smlal R5, R11, R8, R10 // ..'.....*...................... - // mul R1, R5, R2 // ..'.......*.................... - // smlal R5, R11, R1, R3 // ..'.........*.................. - // smull R7, R14, R6, R10 // ..'....*....................... - // smlal R7, R14, R8, R9 // ..'......*..................... - // mul R1, R7, R2 // ..'........*................... - // smlal R7, R14, R1, R3 // ..'..........*................. - // ldr.w R5, [R0] // ..'.......*.................... - // ldr.w R6, [R0, #1*16/4] // ..'...........*................ - // ldr.w R7, [R0, #2*16/4] // ..'......*..................... - // ldr.w R8, [R0, #3*16/4] // ..'*........................... + // smlal R5, R11, R8, R10 // ..'....*....................... + // mul R1, R5, R2 // ..'......*..................... + // smlal R5, R11, R1, R3 // ..'........*................... + // smull R7, R14, R6, R10 // ..'.....*...................... + // smlal R7, R14, R8, R9 // ..'.......*.................... + // mul R1, R7, R2 // ..'.........*.................. + // smlal R7, R14, R1, R3 // ..'...........*................ + // ldr.w R5, [R0] // ..'........*................... + // ldr.w R6, [R0, #1*16/4] // ..'.........*.................. + // ldr.w R7, [R0, #2*16/4] // ..'....*....................... + // ldr.w R8, [R0, #3*16/4] // ..'......*..................... // add R5, R6 // ..'............*............... - // add R7, R8 // ..'........*................... - // sub.w R6, R5, R6, lsl #1 // ..'..............*............. - // sub.w R8, R7, R8, lsl #1 // ..'.........*.................. + // add R7, R8 // ..'.......*.................... + // sub.w R6, R5, R6, lsl #1 // ..'.............*.............. + // sub.w R8, R7, R8, lsl #1 // ..'...........*................ // vmov R1, s4 // ..'..........*................. - // smull R9, R8, R8, R1 // ..'...........*................ - // mul R10, R9, R2 // ..'.............*.............. - // smlal R9, R8, R10, R3 // ..'...............*............ - // add R5, R7 // ..'..............*............. - // add R6, R8 // ..'.................*.......... - // sub.w R7, R5, R7, lsl #1 // ..'...............*............ - // sub.w R8, R6, R8, lsl #1 // ..'..................*......... - // smull R9, R12, R12, R1 // ..'............*............... - // mul R10, R9, R2 // ..'................*........... - // smlal R9, R12, R10, R3 // ..'..................*......... - // add R5, R4 // ..'................*........... - // add R6, R11 // ..'....................*....... - // add R7, R12 // ..'.....................*...... - // add R8, R14 // ..'...................*........ - // sub.w R4, R5, R4, lsl #1 // ..'.................*.......... - // sub.w R11, R6, R11, lsl #1 // ..'......................*..... + // smull R9, R8, R8, R1 // ..'............*............... + // mul R10, R9, R2 // ..'..............*............. + // smlal R9, R8, R10, R3 // ..'................*........... + // add R5, R7 // ..'.............*.............. + // add R6, R8 // ..'..................*......... + // sub.w R7, R5, R7, lsl #1 // ..'..............*............. + // sub.w R8, R6, R8, lsl #1 // ..'...................*........ + // smull R9, R12, R12, R1 // ..'...............*............ + // mul R10, R9, R2 // ..'.................*.......... + // smlal R9, R12, R10, R3 // ..'....................*....... + // add R5, R4 // ..'...............*............ + // add R6, R11 // ..'...................*........ + // add R7, R12 // ..'......................*..... + // add R8, R14 // ..'.....................*...... + // sub.w R4, R5, R4, lsl #1 // ..'................*........... + // sub.w R11, R6, R11, lsl #1 // ..'....................*....... // sub.w R12, R7, R12, lsl #1 // ..'.......................*.... // sub.w R14, R8, R14, lsl #1 // ..'........................*... - // str.w R6, [R0, #1*16/4] // ~.'.........................*.. - // str.w R7, [R0, #2*16/4] // ..'.....................*...... - // str.w R8, [R0, #3*16/4] // ..'...................*........ - // str.w R4, [R0, #4*16/4] // ..'....................*....... - // str.w R11, [R0, #5*16/4] // ..'......................*..... + // str.w R6, [R0, #1*16/4] // ..'.........................*.. + // str.w R7, [R0, #2*16/4] // ~.'..........................*. + // str.w R8, [R0, #3*16/4] // .~'...........................* + // str.w R4, [R0, #4*16/4] // ..'......................*..... + // str.w R11, [R0, #5*16/4] // ..'.....................*...... // str.w R12, [R0, #6*16/4] // ..'.......................*.... // str.w R14, [R0, #7*16/4] // ..'........................*... - // str.w R5, [R0], #32 // .~'..........................*. + // str.w R5, [R0], #32 // ..'..................*......... + // vmov R10, s9 // ..'.................*.......... + // cmp.w R0, R10 // ..'.........................*.. + // bne.w layer123_loop // .~'...........................* - vmov temp_l, s9 - cmp ptr_p, temp_l - bne layer123_loop - // Instructions: 53 - // Expected cycles: 28 - // Expected IPC: 1.89 + + // Instructions: 57 + // Expected cycles: 29 + // Expected IPC: 1.97 // - // Cycle bound: 27.0 - // IPC bound: 1.96 + // Cycle bound: 29.0 + // IPC bound: 1.97 // - // Wall time: 289.59s - // User time: 289.59s + // Wall time: 10.94s + // User time: 10.94s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr.w r5, [r0, #16] // *............................. + ldr.w r10, [r0, #24] // *............................. ldr.w r11, [r0, #20] // *............................. - ldr.w r7, [r0, #12] // .*............................ - ldr.w r10, [r0, #24] // .*............................ - ldr.w r14, [r0, #28] // ..*........................... - add r5, r11 // ..*........................... - add r10, r14 // ...*.......................... - sub.w r11, r5, r11, lsl #1 // ...*.......................... - add.w r8, r5, r10 // ....*......................... - smull r9, r5, r11, r1 // ....*......................... - sub.w r4, r10, r14, lsl #1 // .....*........................ - smull r14, r11, r11, r12 // .....*........................ - ldr.w r6, [r0, #8] // ......*....................... - smlal r9, r5, r4, r12 // ......*....................... - sub.w r10, r8, r10, lsl #1 // .......*...................... - smlal r14, r11, r4, r1 // .......*...................... - add r6, r7 // ........*..................... - mul r12, r9, r2 // ........*..................... - sub.w r1, r6, r7, lsl #1 // .........*.................... - mul r7, r14, r2 // .........*.................... - ldr.w r4, [r0, #4] // ..........*................... - smlal r9, r5, r12, r3 // ..........*................... - vmov r9, s4 // ...........*.................. - smlal r14, r11, r7, r3 // ...........*.................. - ldr.w r12, [r0] // ............*................. - smull r1, r14, r1, r9 // ............*................. - smull r10, r7, r10, r9 // .............*................ - add r12, r4 // ..............*............... - mul r9, r1, r2 // ..............*............... - sub.w r4, r12, r4, lsl #1 // ...............*.............. - add r12, r6 // ...............*.............. - sub.w r6, r12, r6, lsl #1 // ................*............. - add r12, r8 // ................*............. - sub.w r8, r12, r8, lsl #1 // .................*............ - smlal r1, r14, r9, r3 // .................*............ - mul r1, r10, r2 // ..................*........... - str.w r8, [r0, #16] // ...................*.......... - add r4, r14 // ...................*.......... - sub.w r8, r4, r14, lsl #1 // ....................*......... - smlal r10, r7, r1, r3 // ....................*......... - add r4, r5 // .....................*........ - str.w r4, [r0, #4] // .....................*........ - add r6, r7 // ......................*....... - str.w r6, [r0, #8] // ......................*....... - sub.w r9, r4, r5, lsl #1 // .......................*...... - str.w r9, [r0, #20] // .......................*...... - sub.w r9, r6, r7, lsl #1 // ........................*..... - str.w r9, [r0, #24] // ........................*..... - add r8, r11 // .........................*.... - str.w r8, [r0, #12] // .........................*.... - sub.w r9, r8, r11, lsl #1 // ..........................*... - str.w r9, [r0, #28] // ..........................*... - str.w r12, [r0], #32 // ...........................*.. + vmov r1, s6 // .*............................ + ldr.w r5, [r0, #28] // .*............................ + add r10, r5 // ..*........................... + add r7, r11 // ..*........................... + sub.w r14, r7, r11, lsl #1 // ...*.......................... + vmov r12, s8 // ...*.......................... + sub.w r5, r10, r5, lsl #1 // ....*......................... + smull r4, r6, r14, r1 // ....*......................... + ldr.w r11, [r0] // .....*........................ + smlal r4, r6, r5, r12 // .....*........................ + ldr.w r9, [r0, #4] // ......*....................... + smull r14, r12, r14, r12 // ......*....................... + add r11, r9 // .......*...................... + mul r8, r4, r2 // .......*...................... + add.w r7, r7, r10 // ........*..................... + smlal r14, r12, r5, r1 // ........*..................... + sub.w r1, r11, r9, lsl #1 // .........*.................... + smlal r4, r6, r8, r3 // .........*.................... + sub.w r10, r7, r10, lsl #1 // ..........*................... + mul r8, r14, r2 // ..........*................... + ldr.w r9, [r0, #12] // ...........*.................. + ldr.w r4, [r0, #8] // ...........*.................. + vmov r5, s4 // ............*................. + smlal r14, r12, r8, r3 // ............*................. + add r4, r9 // .............*................ + smull r8, r10, r10, r5 // .............*................ + sub.w r9, r4, r9, lsl #1 // ..............*............... + add r11, r4 // ..............*............... + sub.w r4, r11, r4, lsl #1 // ...............*.............. + smull r9, r5, r9, r5 // ...............*.............. + add r11, r7 // ................*............. + mul r14, r8, r2 // ................*............. + sub.w r7, r11, r7, lsl #1 // .................*............ + str.w r11, [r0], #32 // .................*............ // @slothy:before=cmp + smlal r8, r10, r14, r3 // ..................*........... + vmov r8, s9 // ...................*.......... + mul r14, r9, r2 // ...................*.......... + str r7, [r0, #-16] // ....................*......... + add r4, r10 // ....................*......... + cmp.w r0, r8 // .....................*........ // @slothy:id=cmp + smlal r9, r5, r14, r3 // .....................*........ + sub.w r10, r4, r10, lsl #1 // ......................*....... + str r4, [r0, #-24] // ......................*....... + add r1, r5 // .......................*...... + str r10, [r0, #-8] // .......................*...... + sub.w r9, r1, r5, lsl #1 // ........................*..... + add r1, r6 // ........................*..... + add r9, r12 // .........................*.... + str r9, [r0, #-20] // .........................*.... + sub.w r7, r1, r6, lsl #1 // ..........................*... + str r1, [r0, #-28] // ..........................*... + str r7, [r0, #-12] // ...........................*.. + sub.w r14, r9, r12, lsl #1 // ...........................*.. + str r14, [r0, #-4] // ............................*. - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w r5, [r0, #16] // *.............................. - // ldr.w r6, [r0, #20] // *.............................. - // ldr.w r4, [r0, #12] // .*............................. - // ldr.w r9, [r0, #24] // .*............................. - // ldr.w r7, [r0, #28] // ..*............................ - // add r5, r6 // ..*............................ - // add r9, r7 // ...*........................... - // sub.w r8, r5, r6, lsl #1 // ...*........................... - // add.w r6, r5, r9 // ....*.......................... - // smull r5, r11, r8, r1 // ....*.......................... - // sub.w r7, r9, r7, lsl #1 // .....*......................... - // smull r14, r10, r8, r12 // .....*......................... - // sub.w r8, r6, r9, lsl #1 // .......*....................... - // smlal r5, r11, r7, r12 // ......*........................ - // ldr.w r12, [r0, #8] // ......*........................ - // smlal r14, r10, r7, r1 // .......*....................... - // ldr.w r9, [r0] // ............*.................. - // mul r7, r5, r2 // ........*...................... - // add r12, r4 // ........*...................... - // mul r1, r14, r2 // .........*..................... - // sub.w r4, r12, r4, lsl #1 // .........*..................... - // smlal r5, r11, r7, r3 // ..........*.................... - // vmov r7, s4 // ...........*................... - // smlal r14, r10, r1, r3 // ...........*................... - // ldr.w r14, [r0, #4] // ..........*.................... - // smull r1, r4, r4, r7 // ............*.................. - // add r9, r14 // ..............*................ - // smull r5, r8, r8, r7 // .............*................. - // mul r7, r1, r2 // ..............*................ - // sub.w r14, r9, r14, lsl #1 // ...............*............... - // add r9, r12 // ...............*............... - // sub.w r12, r9, r12, lsl #1 // ................*.............. - // smlal r1, r4, r7, r3 // .................*............. - // add r9, r6 // ................*.............. - // mul r1, r5, r2 // ..................*............ - // add r14, r4 // ...................*........... - // sub.w r7, r9, r6, lsl #1 // .................*............. - // sub.w r4, r14, r4, lsl #1 // ....................*.......... - // smlal r5, r8, r1, r3 // ....................*.......... - // add r4, r10 // .........................*..... - // str.w r4, [r0, #12] // .........................*..... - // add r14, r11 // .....................*......... - // str.w r7, [r0, #16] // ...................*........... - // add r12, r8 // ......................*........ - // str.w r12, [r0, #8] // ......................*........ - // sub.w r11, r14, r11, lsl #1 // .......................*....... - // str.w r11, [r0, #20] // .......................*....... - // sub.w r8, r12, r8, lsl #1 // ........................*...... - // str.w r8, [r0, #24] // ........................*...... - // sub.w r6, r4, r10, lsl #1 // ..........................*.... - // str.w r6, [r0, #28] // ..........................*.... - // str.w r14, [r0, #4] // .....................*......... - // str.w r9, [r0], #32 // ...........................*... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r9, [r0, #20] // *.............................. + // ldr.w r12, [r0, #24] // *.............................. + // vmov r8, s8 // ...*........................... + // ldr.w r10, [r0, #28] // .*............................. + // add r7, r9 // ..*............................ + // add r12, r10 // ..*............................ + // vmov r11, s6 // .*............................. + // sub.w r14, r7, r9, lsl #1 // ...*........................... + // sub.w r1, r12, r10, lsl #1 // ....*.......................... + // smull r4, r6, r14, r11 // ....*.......................... + // ldr.w r10, [r0, #8] // ...........*................... + // smlal r4, r6, r1, r8 // .....*......................... + // add.w r5, r7, r12 // ........*...................... + // smull r14, r7, r14, r8 // ......*........................ + // ldr.w r9, [r0, #12] // ...........*................... + // mul r8, r4, r2 // .......*....................... + // add r10, r9 // .............*................. + // smlal r14, r7, r1, r11 // ........*...................... + // ldr.w r11, [r0] // .....*......................... + // smlal r4, r6, r8, r3 // .........*..................... + // ldr.w r8, [r0, #4] // ......*........................ + // mul r4, r14, r2 // ..........*.................... + // sub.w r1, r5, r12, lsl #1 // ..........*.................... + // vmov r12, s4 // ............*.................. + // sub.w r9, r10, r9, lsl #1 // ..............*................ + // smlal r14, r7, r4, r3 // ............*.................. + // add r11, r8 // .......*....................... + // smull r4, r9, r9, r12 // ...............*............... + // sub.w r14, r11, r8, lsl #1 // .........*..................... + // add r11, r10 // ..............*................ + // sub.w r8, r11, r10, lsl #1 // ...............*............... + // mul r10, r4, r2 // ...................*........... + // add r11, r5 // ................*.............. + // smull r1, r12, r1, r12 // .............*................. + // sub.w r5, r11, r5, lsl #1 // .................*............. + // smlal r4, r9, r10, r3 // .....................*......... + // vmov r10, s9 // ...................*........... + // mul r4, r1, r2 // ................*.............. + // str.w r11, [r0], #32 // .................*............. + // add r14, r9 // .......................*....... + // sub.w r11, r14, r9, lsl #1 // ........................*...... + // add r14, r6 // ........................*...... + // sub.w r9, r14, r6, lsl #1 // ..........................*.... + // smlal r1, r12, r4, r3 // ..................*............ + // add r11, r7 // .........................*..... + // str r9, [r0, #-12] // ...........................*... + // add r8, r12 // ....................*.......... + // str r5, [r0, #-16] // ....................*.......... + // sub.w r4, r8, r12, lsl #1 // ......................*........ + // str r4, [r0, #-8] // .......................*....... + // sub.w r7, r11, r7, lsl #1 // ...........................*... + // str r7, [r0, #-4] // ............................*.. + // cmp.w r0, r10 // .....................*......... + // str r14, [r0, #-28] // ..........................*.... + // str r8, [r0, #-24] // ......................*........ + // str r11, [r0, #-20] // .........................*..... + // bne.w layer123_loop // ............................*.. sub ptr_p, #32*strincr @@ -503,137 +512,139 @@ layer123_loop: // Expected cycles: 0 // Expected IPC: 0.00 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.00s + // User time: 0.00s // layer456_first_loop: - // Instructions: 55 - // Expected cycles: 29 - // Expected IPC: 1.90 - // - // Cycle bound: 28.0 - // IPC bound: 1.96 - // - // Wall time: 4.92s - // User time: 4.92s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - vmov r12, s8 // *............................. - ldr.w r9, [r0, #128] // *............................. - vmov r4, s6 // .*............................ - ldr.w r14, [r0, #160] // .*............................ - add r9, r14 // ..*........................... - ldr.w r7, [r0, #192] // ..*........................... - ldr.w r6, [r0, #224] // ...*.......................... - add r7, r6 // ....*......................... - sub.w r10, r9, r14, lsl #1 // ....*......................... - add.w r1, r9, r7 // .....*........................ - smull r5, r11, r10, r4 // .....*........................ - sub.w r6, r7, r6, lsl #1 // ......*....................... - smull r14, r10, r10, r12 // ......*....................... - ldr.w r9, [r0], #128 // .......*...................... - smlal r14, r10, r6, r4 // .......*...................... - sub.w r7, r1, r7, lsl #1 // ........*..................... - smlal r5, r11, r6, r12 // ........*..................... - ldr r12, [r0, #-64] // .........*.................... - mul r8, r14, r2 // .........*.................... - ldr r4, [r0, #-32] // ..........*................... - mul r6, r5, r2 // ..........*................... - add r12, r4 // ...........*.................. - smlal r14, r10, r8, r3 // ...........*.................. - vmov r8, s4 // ............*................. - smlal r5, r11, r6, r3 // ............*................. - ldr r5, [r0, #-96] // .............*................ - smull r7, r6, r7, r8 // .............*................ - add r9, r5 // ..............*............... - sub.w r14, r12, r4, lsl #1 // ...............*.............. - mul r4, r7, r2 // ...............*.............. - sub.w r5, r9, r5, lsl #1 // ................*............. - smull r14, r8, r14, r8 // ................*............. - add r9, r12 // .................*............ - smlal r7, r6, r4, r3 // .................*............ - sub.w r4, r9, r12, lsl #1 // ..................*........... - mul r12, r14, r2 // ..................*........... - add r4, r6 // ...................*.......... - str r4, [r0, #-64] // ...................*.......... - sub.w r4, r4, r6, lsl #1 // ....................*......... - smlal r14, r8, r12, r3 // ....................*......... - str.w r4, [r0, #64] // .....................*........ - add r9, r1 // .....................*........ - add r5, r8 // ......................*....... - str r9, [r0, #-128] // ......................*....... - sub.w r14, r5, r8, lsl #1 // .......................*...... - add r5, r11 // .......................*...... - sub.w r7, r5, r11, lsl #1 // ........................*..... - str.w r7, [r0, #32] // ........................*..... - str r5, [r0, #-96] // .........................*.... - add r14, r10 // .........................*.... - str r14, [r0, #-32] // ..........................*... - sub.w r14, r14, r10, lsl #1 // ..........................*... - str.w r14, [r0, #96] // ...........................*.. - sub.w r5, r9, r1, lsl #1 // ...........................*.. - str.w r5, [r0], #128 // ............................*. + // Instructions: 56 + // Expected cycles: 29 + // Expected IPC: 1.93 + // + // Cycle bound: 28.0 + // IPC bound: 2.00 + // + // Wall time: 48.24s + // User time: 48.24s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + vmov r12, s8 // *............................. + ldr.w r1, [r0, #128] // *............................. + vmov r8, s6 // .*............................ + ldr.w r5, [r0, #160] // .*............................ + add r1, r5 // ..*........................... + ldr.w r11, [r0, #192] // ..*........................... + ldr.w r7, [r0, #224] // ...*.......................... + add r11, r7 // ....*......................... + sub.w r14, r1, r5, lsl #1 // ....*......................... + add.w r1, r1, r11 // .....*........................ + smull r4, r6, r14, r8 // .....*........................ + sub.w r5, r11, r7, lsl #1 // ......*....................... + smull r14, r7, r14, r12 // ......*....................... + sub.w r10, r1, r11, lsl #1 // .......*...................... + smlal r4, r6, r5, r12 // .......*...................... + ldr r12, [r0, #96] // ........*..................... + smlal r14, r7, r5, r8 // ........*..................... + ldr r9, [r0, #64] // .........*.................... + mul r11, r4, r2 // .........*.................... + ldr r8, [r0, #32] // ..........*................... + mul r5, r14, r2 // ..........*................... + add r9, r12 // ...........*.................. + smlal r4, r6, r11, r3 // ...........*.................. + ldr.w r11, [r0] // ............*................. + smlal r14, r7, r5, r3 // ............*................. + sub.w r14, r9, r12, lsl #1 // .............*................ + vmov r12, s4 // .............*................ + add r11, r8 // ..............*............... + smull r14, r5, r14, r12 // ..............*............... + sub.w r4, r11, r8, lsl #1 // ...............*.............. + add r11, r9 // ...............*.............. + sub.w r8, r11, r9, lsl #1 // ................*............. + mul r9, r14, r2 // ................*............. + add r11, r1 // .................*............ + str r11, [r0] // .................*............ + smlal r14, r5, r9, r3 // ..................*........... + sub.w r9, r11, r1, lsl #1 // ...................*.......... + smull r12, r10, r10, r12 // ...................*.......... + str.w r9, [r0, #128] // ....................*......... + add r4, r5 // ....................*......... + sub.w r9, r4, r5, lsl #1 // .....................*........ + mul r5, r12, r2 // .....................*........ + add r4, r6 // ......................*....... + str r4, [r0, #32] // ......................*....... + sub.w r14, r4, r6, lsl #1 // .......................*...... + str.w r14, [r0, #160] // .......................*...... + add r9, r7 // ........................*..... + smlal r12, r10, r5, r3 // ........................*..... + sub.w r4, r9, r7, lsl #1 // .........................*.... + str.w r4, [r0, #224] // .........................*.... + add r8, r10 // ..........................*... + str r8, [r0, #64] // ..........................*... + sub.w r11, r8, r10, lsl #1 // ...........................*.. + str.w r11, [r0, #192] // ...........................*.. + str r9, [r0, #96] // ............................*. + add.w r0, r0, #256 // ............................*. - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w R4, [R0, #4*128/4] // *............................~. - // ldr.w R6, [R0, #5*128/4] // .*...........................'. - // ldr.w R12, [R0, #6*128/4] // ..*..........................'. - // ldr.w R8, [R0, #7*128/4] // ...*.........................'. - // add R4, R6 // ..*..........................'. - // add R12, R8 // ....*........................'. - // sub.w R6, R4, R6, lsl #1 // ....*........................'. - // sub.w R8, R12, R8, lsl #1 // ......*......................'. - // add.w R4, R4, R12 // .....*.......................'. - // sub.w R12, R4, R12, lsl #1 // ........*....................'. - // vmov R9, s6 // .*...........................'. - // vmov R10, s8 // *............................~. - // smull R5, R11, R6, R9 // .....*.......................'. - // smlal R5, R11, R8, R10 // ........*....................'. - // mul R1, R5, R2 // ..........*..................'. - // smlal R5, R11, R1, R3 // ............*................'. - // smull R7, R14, R6, R10 // ......*......................'. - // smlal R7, R14, R8, R9 // .......*.....................'. - // mul R1, R7, R2 // .........*...................'. - // smlal R7, R14, R1, R3 // ...........*.................'. - // ldr.w R5, [R0], #128 // .......*.....................'. - // ldr R6, [R0, #1*128/4-128] // .............*...............'. - // ldr R7, [R0, #2*128/4-128] // .........*...................'. - // ldr R8, [R0, #3*128/4-128] // ..........*..................'. - // add R5, R6 // ..............*..............'. - // add R7, R8 // ...........*.................'. - // sub.w R6, R5, R6, lsl #1 // ................*............'. - // sub.w R8, R7, R8, lsl #1 // ...............*.............'. - // vmov R1, s4 // ............*................'. - // smull R9, R8, R8, R1 // ................*............'. - // mul R10, R9, R2 // ..................*..........'. - // smlal R9, R8, R10, R3 // ....................*........'. - // add R5, R7 // .................*...........'. - // add R6, R8 // ......................*......'. - // sub.w R7, R5, R7, lsl #1 // ..................*..........'. - // sub.w R8, R6, R8, lsl #1 // .......................*.....'. - // smull R9, R12, R12, R1 // .............*...............'. - // mul R10, R9, R2 // ...............*.............'. - // smlal R9, R12, R10, R3 // .................*...........'. - // add R5, R4 // .....................*.......'. - // add R6, R11 // .......................*.....'. - // add R7, R12 // ...................*.........'. - // add R8, R14 // .........................*...'. - // sub.w R4, R5, R4, lsl #1 // ...........................*.'. - // sub.w R11, R6, R11, lsl #1 // ........................*....'. - // sub.w R12, R7, R12, lsl #1 // ....................*........'. - // sub.w R14, R8, R14, lsl #1 // ..........................*..'. - // str R6, [R0, #1*128/4-128] // .........................*...'. - // str R7, [R0, #2*128/4-128] // ...................*.........'. - // str R8, [R0, #3*128/4-128] // ..........................*..'. - // str.w R11, [R0, #5*128/4-128] // ........................*....'. - // str.w R12, [R0, #6*128/4-128] // .....................*.......'. - // str.w R14, [R0, #7*128/4-128] // ...........................*.'. - // str R5, [R0, #-128] // ......................*......'. - // str.w R4, [R0], #128 // ............................*'. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w R4, [R0, #4*128/4] // *............................~. + // ldr.w R6, [R0, #5*128/4] // .*...........................'. + // ldr.w R12, [R0, #6*128/4] // ..*..........................'. + // ldr.w R8, [R0, #7*128/4] // ...*.........................'. + // add R4, R6 // ..*..........................'. + // add R12, R8 // ....*........................'. + // sub.w R6, R4, R6, lsl #1 // ....*........................'. + // sub.w R8, R12, R8, lsl #1 // ......*......................'. + // add.w R4, R4, R12 // .....*.......................'. + // sub.w R12, R4, R12, lsl #1 // .......*.....................'. + // vmov R9, s6 // .*...........................'. + // vmov R10, s8 // *............................~. + // smull R5, R11, R6, R9 // .....*.......................'. + // smlal R5, R11, R8, R10 // .......*.....................'. + // mul R1, R5, R2 // .........*...................'. + // smlal R5, R11, R1, R3 // ...........*.................'. + // smull R7, R14, R6, R10 // ......*......................'. + // smlal R7, R14, R8, R9 // ........*....................'. + // mul R1, R7, R2 // ..........*..................'. + // smlal R7, R14, R1, R3 // ............*................'. + // ldr.w R5, [R0] // ............*................'. + // ldr R6, [R0, #1*128/4] // ..........*..................'. + // ldr R7, [R0, #2*128/4] // .........*...................'. + // ldr R8, [R0, #3*128/4] // ........*....................'. + // add R5, R6 // ..............*..............'. + // add R7, R8 // ...........*.................'. + // sub.w R6, R5, R6, lsl #1 // ...............*.............'. + // sub.w R8, R7, R8, lsl #1 // .............*...............'. + // vmov R1, s4 // .............*...............'. + // smull R9, R8, R8, R1 // ..............*..............'. + // mul R10, R9, R2 // ................*............'. + // smlal R9, R8, R10, R3 // ..................*..........'. + // add R5, R7 // ...............*.............'. + // add R6, R8 // ....................*........'. + // sub.w R7, R5, R7, lsl #1 // ................*............'. + // sub.w R8, R6, R8, lsl #1 // .....................*.......'. + // smull R9, R12, R12, R1 // ...................*.........'. + // mul R10, R9, R2 // .....................*.......'. + // smlal R9, R12, R10, R3 // ........................*....'. + // add R5, R4 // .................*...........'. + // add R6, R11 // ......................*......'. + // add R7, R12 // ..........................*..'. + // add R8, R14 // ........................*....'. + // sub.w R4, R5, R4, lsl #1 // ...................*.........'. + // sub.w R11, R6, R11, lsl #1 // .......................*.....'. + // sub.w R12, R7, R12, lsl #1 // ...........................*.'. + // sub.w R14, R8, R14, lsl #1 // .........................*...'. + // str R6, [R0, #1*128/4] // ......................*......'. + // str R7, [R0, #2*128/4] // ..........................*..'. + // str R8, [R0, #3*128/4] // ............................*'. + // str.w R4, [R0, #4*128/4] // ....................*........'. + // str.w R11, [R0, #5*128/4] // .......................*.....'. + // str.w R12, [R0, #6*128/4] // ...........................*.'. + // str.w R14, [R0, #7*128/4] // .........................*...'. + // str R5, [R0] // .................*...........'. + // add.w R0, R0, #256 // ............................*'. vmov temp_l, s10 cmp ptr_p, temp_l @@ -642,8 +653,8 @@ layer456_first_loop: // Expected cycles: 0 // Expected IPC: 0.00 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.00s + // User time: 0.00s // sub.w ptr_p, #4*256-4 @@ -662,193 +673,195 @@ layer456_first_loop: // Expected cycles: 0 // Expected IPC: 0.00 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.00s + // User time: 0.00s // layer456_loop: - // Instructions: 83 - // Expected cycles: 49 - // Expected IPC: 1.69 - // - // Cycle bound: 42.0 - // IPC bound: 1.98 - // - // Wall time: 6.79s - // User time: 6.79s - // - // --------------- cycle (expected) ---------------> - // 0 25 - // |------------------------|----------------------- - vmov r5, s2 // *................................................ - ldr.w r9, [r0], #128 // *................................................ - ldr r1, [r0, #-32] // ..*.............................................. - ldr.w r14, [r0, #96] // ...*............................................. - ldr.w r10, [r0, #64] // ....*............................................ - smull r7, r4, r1, r5 // ....*............................................ - ldr r11, [r0, #-96] // .....*........................................... - smull r6, r1, r14, r5 // .....*........................................... - mul r12, r7, r2 // ......*.......................................... - ldr r14, [r0, #-64] // .......*......................................... - mul r8, r6, r2 // .......*......................................... - smlal r7, r4, r12, r3 // ........*........................................ - ldr.w r7, [r0, #32] // .........*....................................... - smull r12, r11, r11, r5 // .........*....................................... - add r14, r4 // ..........*...................................... - smlal r6, r1, r8, r3 // ..........*...................................... - sub.w r6, r14, r4, lsl #1 // ...........*..................................... - mul r4, r12, r2 // ...........*..................................... - add r10, r1 // ............*.................................... - smull r5, r8, r7, r5 // ............*.................................... - sub.w r1, r10, r1, lsl #1 // .............*................................... - smlal r12, r11, r4, r3 // .............*................................... - vmov r4, s4 // ..............*.................................. - mul r12, r5, r2 // ..............*.................................. - add r9, r11 // ...............*................................. - smull r7, r1, r1, r4 // ...............*................................. - smlal r5, r8, r12, r3 // ................*................................ - ldr.w r5, [r0, #0] // .................*............................... - mul r12, r7, r2 // .................*............................... - smull r4, r6, r6, r4 // ..................*.............................. - add r5, r8 // ...................*............................. - smlal r7, r1, r12, r3 // ...................*............................. - vmov r12, s6 // ....................*............................ - sub.w r8, r5, r8, lsl #1 // ....................*............................ - add r8, r1 // .....................*........................... - mul r7, r4, r2 // .....................*........................... - sub.w r1, r8, r1, lsl #1 // ......................*.......................... - smull r12, r8, r8, r12 // ......................*.......................... - smlal r4, r6, r7, r3 // .......................*......................... - vmov r7, s8 // ........................*........................ - mul r4, r12, r2 // ........................*........................ - sub.w r11, r9, r11, lsl #1 // .........................*....................... - smull r7, r1, r1, r7 // .........................*....................... - add r11, r6 // ..........................*...................... - smlal r12, r8, r4, r3 // ..........................*...................... - sub.w r12, r11, r6, lsl #1 // ...........................*..................... - mul r6, r7, r2 // ...........................*..................... - add r11, r8 // ............................*.................... - str r11, [r0, #-96] // ............................*.................... - sub.w r11, r11, r8, lsl #1 // .............................*................... - smlal r7, r1, r6, r3 // .............................*................... - str.w r11, [r0, #32] // ..............................*.................. - vmov r7, s3 // ..............................*.................. - smull r6, r10, r10, r7 // ...............................*................. - smull r11, r14, r14, r7 // ................................*................ - mul r4, r6, r2 // .................................*............... - vmov r8, s7 // ..................................*.............. - mul r7, r11, r2 // ..................................*.............. - add r12, r1 // ...................................*............. - smlal r6, r10, r4, r3 // ...................................*............. - vmov r6, s5 // ....................................*............ - str r12, [r0, #-32] // ....................................*............ - add r5, r10 // .....................................*........... - smlal r11, r14, r7, r3 // .....................................*........... - sub.w r11, r5, r10, lsl #1 // ......................................*.......... - smull r10, r6, r5, r6 // ......................................*.......... - add r9, r14 // .......................................*......... - smull r7, r4, r11, r8 // .......................................*......... - mul r5, r10, r2 // ........................................*........ - sub.w r8, r9, r14, lsl #1 // .........................................*....... - mul r11, r7, r2 // .........................................*....... - smlal r10, r6, r5, r3 // ..........................................*...... - sub.w r14, r12, r1, lsl #1 // ...........................................*..... - smlal r7, r4, r11, r3 // ...........................................*..... - add r9, r6 // ............................................*.... - str r9, [r0, #-128] // ............................................*.... - add r8, r4 // .............................................*... - str r8, [r0, #-64] // .............................................*... - sub.w r11, r8, r4, lsl #1 // ..............................................*.. - str.w r11, [r0, #64] // ..............................................*.. - sub.w r11, r9, r6, lsl #1 // ...............................................*. - str.w r14, [r0, #96] // ...............................................*. - str.w r11, [r0], #128 // ................................................* + // Instructions: 84 + // Expected cycles: 47 + // Expected IPC: 1.79 + // + // Cycle bound: 42.0 + // IPC bound: 2.00 + // + // Wall time: 61.15s + // User time: 61.15s + // + // -------------- cycle (expected) --------------> + // 0 25 + // |------------------------|--------------------- + ldr.w r6, [r0, #224] // *.............................................. + vmov r14, s2 // *.............................................. + ldr r11, [r0, #96] // .*............................................. + ldr r4, [r0, #64] // ..*............................................ + smull r5, r7, r6, r14 // ..*............................................ + smull r1, r12, r11, r14 // ...*........................................... + mul r10, r5, r2 // ....*.......................................... + ldr.w r11, [r0, #160] // .....*......................................... + mul r8, r1, r2 // .....*......................................... + ldr.w r6, [r0, #192] // ......*........................................ + smlal r5, r7, r10, r3 // ......*........................................ + vmov r10, s3 // .......*....................................... + smull r11, r9, r11, r14 // .......*....................................... + add r6, r7 // ........*...................................... + smlal r1, r12, r8, r3 // ........*...................................... + sub.w r5, r6, r7, lsl #1 // .........*..................................... + mul r7, r11, r2 // .........*..................................... + add r4, r12 // ..........*.................................... + smull r8, r1, r6, r10 // ..........*.................................... + ldr r6, [r0, #32] // ...........*................................... + smlal r11, r9, r7, r3 // ...........*................................... + ldr.w r11, [r0, #128] // ............*.................................. + mul r7, r8, r2 // ............*.................................. + sub.w r12, r4, r12, lsl #1 // .............*................................. + smull r6, r14, r6, r14 // .............*................................. + add r11, r9 // ..............*................................ + smlal r8, r1, r7, r3 // ..............*................................ + sub.w r8, r11, r9, lsl #1 // ...............*............................... + mul r7, r6, r2 // ...............*............................... + add r11, r1 // ................*.............................. + smull r4, r10, r4, r10 // ................*.............................. + sub.w r1, r11, r1, lsl #1 // .................*............................. + smlal r6, r14, r7, r3 // .................*............................. + vmov r6, s5 // ..................*............................ + mul r7, r4, r2 // ..................*............................ + smull r6, r9, r11, r6 // ...................*........................... + ldr.w r11, [r0] // ....................*.......................... + smlal r4, r10, r7, r3 // ....................*.......................... + vmov r7, s4 // .....................*......................... + mul r4, r6, r2 // .....................*......................... + add r11, r14 // .......................*....................... + smlal r6, r9, r4, r3 // .......................*....................... + sub.w r6, r11, r14, lsl #1 // ........................*...................... + smull r14, r5, r5, r7 // ........................*...................... + add r11, r10 // .........................*..................... + smull r12, r7, r12, r7 // .........................*..................... + sub.w r10, r11, r10, lsl #1 // ..........................*.................... + mul r4, r14, r2 // ..........................*.................... + add r11, r9 // ...........................*................... + str r11, [r0] // ...........................*................... + sub.w r9, r11, r9, lsl #1 // ............................*.................. + smlal r14, r5, r4, r3 // ............................*.................. + vmov r11, s6 // .............................*................. + str.w r9, [r0, #128] // .............................*................. + add r8, r5 // ..............................*................ + mul r14, r12, r2 // ..............................*................ + sub.w r5, r8, r5, lsl #1 // ...............................*............... + smull r4, r11, r8, r11 // ...............................*............... + vmov r9, s8 // ................................*.............. + smlal r12, r7, r14, r3 // ................................*.............. + vmov r12, s7 // .................................*............. + smull r5, r9, r5, r9 // .................................*............. + add r6, r7 // ..................................*............ + mul r14, r4, r2 // ..................................*............ + sub.w r7, r6, r7, lsl #1 // ...................................*........... + mul r8, r5, r2 // ...................................*........... + smlal r4, r11, r14, r3 // ....................................*.......... + smull r4, r1, r1, r12 // .....................................*......... + add r6, r11 // ......................................*........ + smlal r5, r9, r8, r3 // ......................................*........ + sub.w r5, r6, r11, lsl #1 // .......................................*....... + mul r14, r4, r2 // .......................................*....... + add r7, r9 // ........................................*...... + str r7, [r0, #96] // ........................................*...... + sub.w r12, r7, r9, lsl #1 // .........................................*..... + smlal r4, r1, r14, r3 // .........................................*..... + str.w r12, [r0, #224] // ..........................................*.... + add r10, r1 // ...........................................*... + str r10, [r0, #64] // ...........................................*... + sub.w r4, r10, r1, lsl #1 // ............................................*.. + str.w r4, [r0, #192] // ............................................*.. + str r6, [r0, #32] // .............................................*. + str.w r5, [r0, #160] // ..............................................* + add.w r0, r0, #256 // ..............................................* - // --------------- cycle (expected) ---------------> - // 0 25 - // |------------------------|----------------------- - // ldr.w R5, [R0], #128 // *................................................ - // ldr R6, [R0, #1*128/4-128] // .....*........................................... - // ldr R7, [R0, #2*128/4-128] // .......*......................................... - // ldr R8, [R0, #3*128/4-128] // ..*.............................................. - // ldr.w R4, [R0, #4*128/4-128] // .................*............................... - // ldr.w R11, [R0, #5*128/4-128] // .........*....................................... - // ldr.w R12, [R0, #6*128/4-128] // ....*............................................ - // ldr.w R14, [R0, #7*128/4-128] // ...*............................................. - // vmov R1, s2 // *................................................ - // smull R9, R6, R6, R1 // .........*....................................... - // mul R10, R9, R2 // ...........*..................................... - // smlal R9, R6, R10, R3 // .............*................................... - // smull R9, R8, R8, R1 // ....*............................................ - // mul R10, R9, R2 // ......*.......................................... - // smlal R9, R8, R10, R3 // ........*........................................ - // smull R9, R11, R11, R1 // ............*.................................... - // mul R10, R9, R2 // ..............*.................................. - // smlal R9, R11, R10, R3 // ................*................................ - // smull R9, R14, R14, R1 // .....*........................................... - // mul R10, R9, R2 // .......*......................................... - // smlal R9, R14, R10, R3 // ..........*...................................... - // add R5, R6 // ...............*................................. - // add R7, R8 // ..........*...................................... - // add R4, R11 // ...................*............................. - // add R12, R14 // ............*.................................... - // sub.w R6, R5, R6, lsl #1 // .........................*....................... - // sub.w R8, R7, R8, lsl #1 // ...........*..................................... - // sub.w R11, R4, R11, lsl #1 // ....................*............................ - // sub.w R14, R12, R14, lsl #1 // .............*................................... - // vmov R1, s3 // ..............................*.................. - // smull R9, R7, R7, R1 // ................................*................ - // mul R10, R9, R2 // ..................................*.............. - // smlal R9, R7, R10, R3 // .....................................*........... - // smull R9, R12, R12, R1 // ...............................*................. - // mul R10, R9, R2 // .................................*............... - // smlal R9, R12, R10, R3 // ...................................*............. - // vmov R1, s4 // ..............*.................................. - // smull R9, R8, R8, R1 // ..................*.............................. - // mul R10, R9, R2 // .....................*........................... - // smlal R9, R8, R10, R3 // .......................*......................... - // smull R9, R14, R14, R1 // ...............*................................. - // mul R10, R9, R2 // .................*............................... - // smlal R9, R14, R10, R3 // ...................*............................. - // add R5, R7 // .......................................*......... - // add R6, R8 // ..........................*...................... - // add R4, R12 // .....................................*........... - // add R11, R14 // .....................*........................... - // sub.w R7, R5, R7, lsl #1 // .........................................*....... - // sub.w R8, R6, R8, lsl #1 // ...........................*..................... - // sub.w R12, R4, R12, lsl #1 // ......................................*.......... - // sub.w R14, R11, R14, lsl #1 // ......................*.......................... - // vmov R1, s5 // ....................................*............ - // smull R9, R4, R4, R1 // ......................................*.......... - // mul R10, R9, R2 // ........................................*........ - // smlal R9, R4, R10, R3 // ..........................................*...... - // vmov R1, s6 // ....................*............................ - // smull R9, R11, R11, R1 // ......................*.......................... - // mul R10, R9, R2 // ........................*........................ - // smlal R9, R11, R10, R3 // ..........................*...................... - // vmov R1, s7 // ..................................*.............. - // smull R9, R12, R12, R1 // .......................................*......... - // mul R10, R9, R2 // .........................................*....... - // smlal R9, R12, R10, R3 // ...........................................*..... - // vmov R1, s8 // ........................*........................ - // smull R9, R14, R14, R1 // .........................*....................... - // mul R10, R9, R2 // ...........................*..................... - // smlal R9, R14, R10, R3 // .............................*................... - // add R5, R4 // ............................................*.... - // add R6, R11 // ............................*.................... - // add R7, R12 // .............................................*... - // add R8, R14 // ...................................*............. - // sub.w R4, R5, R4, lsl #1 // ...............................................*. - // sub.w R11, R6, R11, lsl #1 // .............................*................... - // sub.w R12, R7, R12, lsl #1 // ..............................................*.. - // sub.w R14, R8, R14, lsl #1 // ...........................................*..... - // str R6, [R0, #1*128/4-128] // ............................*.................... - // str R7, [R0, #2*128/4-128] // .............................................*... - // str R8, [R0, #3*128/4-128] // ....................................*............ - // str.w R11, [R0, #5*128/4-128] // ..............................*.................. - // str.w R12, [R0, #6*128/4-128] // ..............................................*.. - // str.w R14, [R0, #7*128/4-128] // ...............................................*. - // str R5, [R0, #-128] // ............................................*.... - // str.w R4, [R0], #128 // ................................................* + // -------------- cycle (expected) --------------> + // 0 25 + // |------------------------|--------------------- + // ldr.w R5, [R0] // ....................*.......................... + // ldr R6, [R0, #1*128/4] // ...........*................................... + // ldr R7, [R0, #2*128/4] // ..*............................................ + // ldr R8, [R0, #3*128/4] // .*............................................. + // ldr.w R4, [R0, #4*128/4] // ............*.................................. + // ldr.w R11, [R0, #5*128/4] // .....*......................................... + // ldr.w R12, [R0, #6*128/4] // ......*........................................ + // ldr.w R14, [R0, #7*128/4] // *.............................................. + // vmov R1, s2 // *.............................................. + // smull R9, R6, R6, R1 // .............*................................. + // mul R10, R9, R2 // ...............*............................... + // smlal R9, R6, R10, R3 // .................*............................. + // smull R9, R8, R8, R1 // ...*........................................... + // mul R10, R9, R2 // .....*......................................... + // smlal R9, R8, R10, R3 // ........*...................................... + // smull R9, R11, R11, R1 // .......*....................................... + // mul R10, R9, R2 // .........*..................................... + // smlal R9, R11, R10, R3 // ...........*................................... + // smull R9, R14, R14, R1 // ..*............................................ + // mul R10, R9, R2 // ....*.......................................... + // smlal R9, R14, R10, R3 // ......*........................................ + // add R5, R6 // .......................*....................... + // add R7, R8 // ..........*.................................... + // add R4, R11 // ..............*................................ + // add R12, R14 // ........*...................................... + // sub.w R6, R5, R6, lsl #1 // ........................*...................... + // sub.w R8, R7, R8, lsl #1 // .............*................................. + // sub.w R11, R4, R11, lsl #1 // ...............*............................... + // sub.w R14, R12, R14, lsl #1 // .........*..................................... + // vmov R1, s3 // .......*....................................... + // smull R9, R7, R7, R1 // ................*.............................. + // mul R10, R9, R2 // ..................*............................ + // smlal R9, R7, R10, R3 // ....................*.......................... + // smull R9, R12, R12, R1 // ..........*.................................... + // mul R10, R9, R2 // ............*.................................. + // smlal R9, R12, R10, R3 // ..............*................................ + // vmov R1, s4 // .....................*......................... + // smull R9, R8, R8, R1 // .........................*..................... + // mul R10, R9, R2 // ..............................*................ + // smlal R9, R8, R10, R3 // ................................*.............. + // smull R9, R14, R14, R1 // ........................*...................... + // mul R10, R9, R2 // ..........................*.................... + // smlal R9, R14, R10, R3 // ............................*.................. + // add R5, R7 // .........................*..................... + // add R6, R8 // ..................................*............ + // add R4, R12 // ................*.............................. + // add R11, R14 // ..............................*................ + // sub.w R7, R5, R7, lsl #1 // ..........................*.................... + // sub.w R8, R6, R8, lsl #1 // ...................................*........... + // sub.w R12, R4, R12, lsl #1 // .................*............................. + // sub.w R14, R11, R14, lsl #1 // ...............................*............... + // vmov R1, s5 // ..................*............................ + // smull R9, R4, R4, R1 // ...................*........................... + // mul R10, R9, R2 // .....................*......................... + // smlal R9, R4, R10, R3 // .......................*....................... + // vmov R1, s6 // .............................*................. + // smull R9, R11, R11, R1 // ...............................*............... + // mul R10, R9, R2 // ..................................*............ + // smlal R9, R11, R10, R3 // ....................................*.......... + // vmov R1, s7 // .................................*............. + // smull R9, R12, R12, R1 // .....................................*......... + // mul R10, R9, R2 // .......................................*....... + // smlal R9, R12, R10, R3 // .........................................*..... + // vmov R1, s8 // ................................*.............. + // smull R9, R14, R14, R1 // .................................*............. + // mul R10, R9, R2 // ...................................*........... + // smlal R9, R14, R10, R3 // ......................................*........ + // add R5, R4 // ...........................*................... + // add R6, R11 // ......................................*........ + // add R7, R12 // ...........................................*... + // add R8, R14 // ........................................*...... + // sub.w R4, R5, R4, lsl #1 // ............................*.................. + // sub.w R11, R6, R11, lsl #1 // .......................................*....... + // sub.w R12, R7, R12, lsl #1 // ............................................*.. + // sub.w R14, R8, R14, lsl #1 // .........................................*..... + // str R6, [R0, #1*128/4] // .............................................*. + // str R7, [R0, #2*128/4] // ...........................................*... + // str R8, [R0, #3*128/4] // ........................................*...... + // str.w R4, [R0, #4*128/4] // .............................*................. + // str.w R11, [R0, #5*128/4] // ..............................................* + // str.w R12, [R0, #6*128/4] // ............................................*.. + // str.w R14, [R0, #7*128/4] // ..........................................*.... + // str R5, [R0] // ...........................*................... + // add.w R0, R0, #256 // ..............................................* vmov temp_l, s10 cmp ptr_p, temp_l @@ -857,8 +870,8 @@ layer456_loop: // Expected cycles: 0 // Expected IPC: 0.00 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.00s + // User time: 0.00s // sub.w ptr_p, #4*strincr2-4 @@ -874,256 +887,265 @@ layer456_loop: add.w cntr, ptr_p, #64*strincr3 // 64 iterations vmov s9, cntr - // Instructions: 2 - // Expected cycles: 1 - // Expected IPC: 2.00 - // - // Cycle bound: 1.0 - // IPC bound: 2.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r8, [r1, #8] // *............................. - ldr.w r6, [r1, #4] // *............................. + // Instructions: 9 + // Expected cycles: 7 + // Expected IPC: 1.29 + // + // Cycle bound: 7.0 + // IPC bound: 1.29 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r4, [r0, #768] // *............................. + ldr r9, [r1], #12 // *............................. + ldr r5, [r1, #-8] // .*............................ + ldr.w r8, [r1], #16 // ..*........................... + smull r6, r7, r4, r9 // ..*........................... + ldr.w r10, [r0, #256] // ...*.......................... + ldr.w r4, [r0, #512] // ....*......................... + mul r14, r6, r2 // ....*......................... + smlal r6, r7, r14, r3 // ......*....................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w r8, [r1, #8] // *.............................. - // ldr.w r6, [r1, #4] // *.............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r9, [r1], #12 // *.............................. + // ldr.w r4, [r0, #768] // *.............................. + // smull r4, r7, r4, r9 // ..*............................ + // mul r11, r4, r2 // ....*.......................... + // smlal r4, r7, r11, r3 // ......*........................ + // ldr r5, [r1, #-8] // .*............................. + // ldr.w r8, [r1], #16 // ..*............................ + // ldr.w r10, [r0, #256] // ...*........................... + // ldr.w r4, [r0, #512] // ....*.......................... - push {cntr} - vmov cntr, s9 - sub cntr, cntr, #4 - vmov s9, cntr - pop {cntr} + push {r11} + vmov r11, s9 + sub r11, r11, #4 + vmov s9, r11 + pop {r11} layer78_loop: - // Instructions: 47 - // Expected cycles: 32 - // Expected IPC: 1.47 + // Instructions: 50 + // Expected cycles: 28 + // Expected IPC: 1.79 + // + // Cycle bound: 34.0 + // IPC bound: 1.47 + // + // Wall time: 24.35s + // User time: 24.35s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add r4, r7 // *............................. + smull r11, r6, r10, r9 // *............................. + smull r10, r14, r4, r5 // .*............................ + sub.w r4, r4, r7, lsl #1 // ..*........................... + mul r5, r11, r2 // ..*........................... + ldr r12, [r1, #-20] // ...*.......................... + mul r7, r10, r2 // ...*.......................... + ldr r9, [r1], #12 // ....e......................... + smlal r11, r6, r5, r3 // ....*......................... + ldr.w r11, [r0] // .....*........................ + smull r5, r12, r4, r12 // .....*........................ + ldr.w r4, [r0, #772] // ......e....................... + smlal r10, r14, r7, r3 // ......*....................... + add r11, r6 // .......*...................... + mul r10, r5, r2 // .......*...................... + sub.w r6, r11, r6, lsl #1 // ........*..................... + smull r4, r7, r4, r9 // ........e..................... + add r11, r14 // .........*.................... + smlal r5, r12, r10, r3 // .........*.................... + sub.w r14, r11, r14, lsl #1 // ..........*................... + smull r8, r10, r11, r8 // ..........*................... + mul r11, r4, r2 // ...........e.................. + add r6, r12 // ............*................. + mul r5, r8, r2 // ............*................. + smlal r4, r7, r11, r3 // .............e................ + ldr r11, [r1, #-24] // ..............*............... + smlal r8, r10, r5, r3 // ..............*............... + ldr r8, [r1, #-16] // ...............*.............. + str r10, [r0], #4 // ...............*.............. // @slothy:core // @slothy:before=cmp + sub.w r10, r6, r12, lsl #1 // ................*............. + smull r5, r6, r6, r11 // ................*............. + ldr r12, [r1, #-20] // .................*............ + smull r10, r8, r10, r8 // .................*............ + mul r4, r5, r2 // ..................*........... + vmov r11, s9 // ...................*.......... + smull r12, r14, r14, r12 // ...................*.......... + cmp.w r0, r11 // ....................*......... // @slothy:id=cmp + mul r11, r10, r2 // ....................*......... + smlal r5, r6, r4, r3 // .....................*........ + ldr r5, [r1, #-8] // ......................e....... + smlal r10, r8, r11, r3 // ......................*....... + str.w r8, [r0, #764] // .......................*...... + ldr.w r8, [r1], #16 // .......................e...... + mul r11, r12, r2 // ........................*..... + ldr.w r10, [r0, #256] // .........................e.... + str.w r6, [r0, #252] // .........................*.... + ldr.w r4, [r0, #512] // ..........................e... + smlal r12, r14, r11, r3 // ..........................*... + str.w r14, [r0, #508] // ...........................*.. + bne.w layer78_loop // ...........................*.. // @slothy:branch + + // ---------------- cycle (expected) -----------------> + // 0 25 50 + // |------------------------|------------------------|- + // ldr.w R12, [R1, #4] // ..................e.....'.....................~..... + // ldr.w R14, [R1, #8] // ........................'..*........................ + // ldr R11, [R1], #12 // e.......................'...~....................... + // ldr.w R5, [R0] // .~......................'....*...................... + // ldr.w R6, [R0, #256] // .....................e..'........................~.. + // ldr.w R7, [R0, #512] // ......................e.'.........................~. + // ldr.w R8, [R0, #768] // ..e.....................'.....~..................... + // smull R9, R6, R6, R11 // ........................*........................... + // mul R10, R9, R2 // ........................'.*......................... + // smlal R9, R6, R10, R3 // ~.......................'...*....................... + // smull R9, R8, R8, R11 // ....e...................'.......~................... + // mul R10, R9, R2 // .......e................'..........~................ + // smlal R9, R8, R10, R3 // .........e..............'............~.............. + // add R5, R6 // ...~....................'......*.................... + // add R7, R8 // ........................*........................... + // sub.w R6, R5, R6, lsl #1 // ....~...................'.......*................... + // sub.w R8, R7, R8, lsl #1 // ........................'.*......................... + // smull R9, R7, R7, R12 // ........................'*.......................... + // mul R10, R9, R2 // ........................'..*........................ + // smlal R9, R7, R10, R3 // ..~.....................'.....*..................... + // smull R9, R8, R8, R14 // .~......................'....*...................... + // mul R10, R9, R2 // ...~....................'......*.................... + // smlal R9, R8, R10, R3 // .....~..................'........*.................. + // add R5, R7 // .....~..................'........*.................. + // add R6, R8 // ........~...............'...........*............... + // sub.w R7, R5, R7, lsl #1 // ......~.................'.........*................. + // sub.w R8, R6, R8, lsl #1 // ............~...........'...............*........... + // ldr.w R12, [R1, #4] // ..........~.............'.............*............. + // ldr.w R14, [R1, #8] // .............~..........'................*.......... + // ldr.w R11, [R1, #12] // ...........~............'..............*............ + // ldr.w R4, [R1], #16 // ...................e....'......................~.... + // smull R9, R5, R5, R4 // ......~.................'.........*................. + // mul R10, R9, R2 // ........~...............'...........*............... + // smlal R9, R5, R10, R3 // ..........~.............'.............*............. + // smull R9, R6, R6, R12 // ............~...........'...............*........... + // mul R10, R9, R2 // ..............~.........'.................*......... + // smlal R9, R6, R10, R3 // .................~......'....................*...... + // smull R9, R7, R7, R14 // ...............~........'..................*........ + // mul R10, R9, R2 // ....................~...'.......................*... + // smlal R9, R7, R10, R3 // ......................~.'.........................*. + // smull R9, R8, R8, R11 // .............~..........'................*.......... + // mul R10, R9, R2 // ................~.......'...................*....... + // smlal R9, R8, R10, R3 // ..................~.....'.....................*..... + // str.w R6, [R0, #256] // .....................~..'........................*.. + // str.w R7, [R0, #512] // .......................~'..........................* + // str.w R8, [R0, #768] // ...................~....'......................*.... + // str R5, [R0], #4 // ...........~............'..............*............ + // vmov R4, s9 // ...............~........'..................*........ + // cmp.w R0, R4 // ................~.......'...................*....... + // bne.w layer78_loop // .......................~'..........................* + + + // Instructions: 41 + // Expected cycles: 26 + // Expected IPC: 1.58 // // Cycle bound: 26.0 - // IPC bound: 1.81 + // IPC bound: 1.58 // - // Wall time: 6.05s - // User time: 6.05s + // Wall time: 1.42s + // User time: 1.42s // - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - ldr.w r7, [r0, #768] // *............................... - ldr r11, [r1], #12 // *............................... - ldr.w r5, [r0, #256] // .*.............................. - smull r9, r10, r7, r11 // ..*............................. - smull r11, r14, r5, r11 // ...*............................ - ldr.w r5, [r0, #512] // ....*........................... - mul r12, r9, r2 // ....*........................... - mul r4, r11, r2 // .....*.......................... - smlal r9, r10, r12, r3 // ......*......................... - ldr.w r9, [r0] // .......*........................ - smlal r11, r14, r4, r3 // .......*........................ - add r5, r10 // ........*....................... - sub.w r7, r5, r10, lsl #1 // .........*...................... - smull r12, r6, r5, r6 // .........*...................... - add r9, r14 // ..........*..................... - smull r5, r7, r7, r8 // ..........*..................... - ldr.w r8, [r1, #12] // ...........*.................... - mul r11, r12, r2 // ...........*.................... - sub.w r4, r9, r14, lsl #1 // ............*................... - mul r14, r5, r2 // ............*................... - ldr.w r10, [r1, #8] // .............*.................. - smlal r12, r6, r11, r3 // .............*.................. - ldr.w r12, [r1, #4] // ..............*................. - ldr.w r11, [r1], #16 // ..............*................. - add r9, r6 // ...............*................ - smlal r5, r7, r14, r3 // ...............*................ - sub.w r6, r9, r6, lsl #1 // ................*............... - smull r5, r9, r9, r11 // ................*............... - add r4, r7 // .................*.............. - smull r10, r6, r6, r10 // .................*.............. - sub.w r11, r4, r7, lsl #1 // ..................*............. - smull r14, r7, r4, r12 // ..................*............. - mul r4, r10, r2 // ...................*............ - smull r12, r11, r11, r8 // ....................*........... - ldr.w r8, [r1, #8] // .....................e.......... - smlal r10, r6, r4, r3 // .....................*.......... - mul r10, r12, r2 // ......................*......... - mul r4, r14, r2 // .......................*........ - smlal r12, r11, r10, r3 // ........................*....... - str.w r11, [r0, #768] // .........................*...... - smlal r14, r7, r4, r3 // ..........................*..... - str.w r7, [r0, #256] // ...........................*.... - mul r10, r5, r2 // ............................*... - str.w r6, [r0, #512] // .............................*.. - ldr.w r6, [r1, #4] // .............................e.. - smlal r5, r9, r10, r3 // ..............................*. - str r9, [r0], #4 // ...............................* // @slothy:core - - // ------------ cycle (expected) ------------> + // ----- cycle (expected) ------> // 0 25 - // |------------------------|----------------- - // ldr.w R12, [R1, #4] // ........e..'............................~.. - // ldr.w R14, [R1, #8] // e..........'....................~.......... - // ldr R11, [R1], #12 // ...........*............................... - // ldr.w R5, [R0] // ...........'......*........................ - // ldr.w R6, [R0, #256] // ...........'*.............................. - // ldr.w R7, [R0, #512] // ...........'...*........................... - // ldr.w R8, [R0, #768] // ...........*............................... - // smull R9, R6, R6, R11 // ...........'..*............................ - // mul R10, R9, R2 // ...........'....*.......................... - // smlal R9, R6, R10, R3 // ...........'......*........................ - // smull R9, R8, R8, R11 // ...........'.*............................. - // mul R10, R9, R2 // ...........'...*........................... - // smlal R9, R8, R10, R3 // ...........'.....*......................... - // add R5, R6 // ...........'.........*..................... - // add R7, R8 // ...........'.......*....................... - // sub.w R6, R5, R6, lsl #1 // ...........'...........*................... - // sub.w R8, R7, R8, lsl #1 // ...........'........*...................... - // smull R9, R7, R7, R12 // ...........'........*...................... - // mul R10, R9, R2 // ...........'..........*.................... - // smlal R9, R7, R10, R3 // ...........'............*.................. - // smull R9, R8, R8, R14 // ...........'.........*..................... - // mul R10, R9, R2 // ...........'...........*................... - // smlal R9, R8, R10, R3 // ...........'..............*................ - // add R5, R7 // ...........'..............*................ - // add R6, R8 // ...........'................*.............. - // sub.w R7, R5, R7, lsl #1 // ...........'...............*............... - // sub.w R8, R6, R8, lsl #1 // ...........'.................*............. - // ldr.w R12, [R1, #4] // ...........'.............*................. - // ldr.w R14, [R1, #8] // ...........'............*.................. - // ldr.w R11, [R1, #12] // ...........'..........*.................... - // ldr.w R4, [R1], #16 // ...........'.............*................. - // smull R9, R5, R5, R4 // ...........'...............*............... - // mul R10, R9, R2 // .......~...'...........................*... - // smlal R9, R5, R10, R3 // .........~.'.............................*. - // smull R9, R6, R6, R12 // ...........'.................*............. - // mul R10, R9, R2 // ..~........'......................*........ - // smlal R9, R6, R10, R3 // .....~.....'.........................*..... - // smull R9, R7, R7, R14 // ...........'................*.............. - // mul R10, R9, R2 // ...........'..................*............ - // smlal R9, R7, R10, R3 // ~..........'....................*.......... - // smull R9, R8, R8, R11 // ...........'...................*........... - // mul R10, R9, R2 // .~.........'.....................*......... - // smlal R9, R8, R10, R3 // ...~.......'.......................*....... - // str.w R6, [R0, #256] // ......~....'..........................*.... - // str.w R7, [R0, #512] // ........~..'............................*.. - // str.w R8, [R0, #768] // ....~......'........................*...... - // str R5, [R0], #4 // ..........~'..............................* + // |------------------------|---- + add r4, r7 // *............................. + smull r11, r14, r10, r9 // *............................. + sub.w r6, r4, r7, lsl #1 // .*............................ + smull r10, r7, r4, r5 // .*............................ + ldr r12, [r1, #-20] // ..*........................... + mul r5, r11, r2 // ..*........................... + ldr.w r9, [r0] // ...*.......................... + mul r4, r10, r2 // ...*.......................... + smlal r11, r14, r5, r3 // ....*......................... + ldr r11, [r1, #-8] // .....*........................ + smlal r10, r7, r4, r3 // .....*........................ + add r9, r14 // ......*....................... + smull r10, r5, r6, r12 // ......*....................... + sub.w r6, r9, r14, lsl #1 // .......*...................... + add r9, r7 // .......*...................... + sub.w r4, r9, r7, lsl #1 // ........*..................... + mul r7, r10, r2 // ........*..................... + ldr r12, [r1, #-12] // .........*.................... + smull r11, r4, r4, r11 // .........*.................... + smlal r10, r5, r7, r3 // ..........*................... + ldr r10, [r1, #-4] // ...........*.................. + mul r7, r11, r2 // ...........*.................. + add r6, r5 // ............*................. + smull r14, r8, r9, r8 // ............*................. + sub.w r9, r6, r5, lsl #1 // .............*................ + smlal r11, r4, r7, r3 // .............*................ + str.w r4, [r0, #512] // ..............*............... + smull r10, r7, r9, r10 // ...............*.............. + smull r11, r5, r6, r12 // ................*............. + mul r9, r10, r2 // .................*............ + mul r4, r11, r2 // ..................*........... + smlal r10, r7, r9, r3 // ...................*.......... + mul r12, r14, r2 // ....................*......... + smlal r11, r5, r4, r3 // .....................*........ + vmov r11, s9 // ......................*....... + smlal r14, r8, r12, r3 // ......................*....... + str r8, [r0], #4 // .......................*...... // @slothy:core // @slothy:before=cmp + cmp.w r0, r11 // .......................*...... // @slothy:id=cmp + str.w r7, [r0, #764] // ........................*..... + str.w r5, [r0, #252] // .........................*.... - vmov cntr, s9 - cmp ptr_p, cntr - bne layer78_loop - // Instructions: 45 - // Expected cycles: 32 - // Expected IPC: 1.41 - // - // Cycle bound: 32.0 - // IPC bound: 1.41 - // - // Wall time: 1.13s - // User time: 1.13s - // - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - ldr.w r9, [r0, #768] // *............................... - ldr r4, [r1], #12 // *............................... - ldr.w r10, [r0, #256] // .*.............................. - ldr.w r12, [r1, #8] // ..*............................. - smull r9, r11, r9, r4 // ..*............................. - smull r5, r14, r10, r4 // ...*............................ - mul r4, r9, r2 // ....*........................... - mul r7, r5, r2 // .....*.......................... - ldr.w r10, [r0, #512] // ......*......................... - smlal r9, r11, r4, r3 // ......*......................... - ldr.w r4, [r1, #12] // .......*........................ - smlal r5, r14, r7, r3 // .......*........................ - add r10, r11 // ........*....................... - sub.w r11, r10, r11, lsl #1 // .........*...................... - smull r10, r9, r10, r6 // .........*...................... - ldr.w r6, [r0] // ..........*..................... - smull r5, r8, r11, r8 // ..........*..................... - mul r7, r10, r2 // ...........*.................... - add r6, r14 // ............*................... - mul r11, r5, r2 // ............*................... - sub.w r14, r6, r14, lsl #1 // .............*.................. - smlal r10, r9, r7, r3 // .............*.................. - ldr.w r7, [r1, #4] // ..............*................. - ldr.w r10, [r1], #16 // ..............*................. - add r6, r9 // ...............*................ - smlal r5, r8, r11, r3 // ...............*................ - sub.w r9, r6, r9, lsl #1 // ................*............... - smull r6, r10, r6, r10 // ................*............... - add r14, r8 // .................*.............. - smull r12, r9, r9, r12 // .................*.............. - sub.w r11, r14, r8, lsl #1 // ..................*............. - mul r5, r6, r2 // ..................*............. - smull r8, r14, r14, r7 // ...................*............ - smlal r6, r10, r5, r3 // ....................*........... - smull r4, r11, r11, r4 // .....................*.......... - mul r6, r12, r2 // ......................*......... - mul r7, r4, r2 // .......................*........ - smlal r12, r9, r6, r3 // ........................*....... - smlal r4, r11, r7, r3 // .........................*...... - str.w r11, [r0, #768] // ..........................*..... - mul r12, r8, r2 // ...........................*.... - str.w r9, [r0, #512] // ............................*... - smlal r8, r14, r12, r3 // .............................*.. - str.w r14, [r0, #256] // ..............................*. - str r10, [r0], #4 // ...............................* // @slothy:core - - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - // ldr.w r7, [r0, #768] // *............................... - // ldr r11, [r1], #12 // *............................... - // ldr.w r5, [r0, #256] // .*.............................. - // smull r9, r10, r7, r11 // ..*............................. - // smull r11, r14, r5, r11 // ...*............................ - // ldr.w r5, [r0, #512] // ......*......................... - // mul r12, r9, r2 // ....*........................... - // mul r4, r11, r2 // .....*.......................... - // smlal r9, r10, r12, r3 // ......*......................... - // ldr.w r9, [r0] // ..........*..................... - // smlal r11, r14, r4, r3 // .......*........................ - // add r5, r10 // ........*....................... - // sub.w r7, r5, r10, lsl #1 // .........*...................... - // smull r12, r6, r5, r6 // .........*...................... - // add r9, r14 // ............*................... - // smull r5, r7, r7, r8 // ..........*..................... - // ldr.w r8, [r1, #12] // .......*........................ - // mul r11, r12, r2 // ...........*.................... - // sub.w r4, r9, r14, lsl #1 // .............*.................. - // mul r14, r5, r2 // ............*................... - // ldr.w r10, [r1, #8] // ..*............................. - // smlal r12, r6, r11, r3 // .............*.................. - // ldr.w r12, [r1, #4] // ..............*................. - // ldr.w r11, [r1], #16 // ..............*................. - // add r9, r6 // ...............*................ - // smlal r5, r7, r14, r3 // ...............*................ - // sub.w r6, r9, r6, lsl #1 // ................*............... - // smull r5, r9, r9, r11 // ................*............... - // add r4, r7 // .................*.............. - // smull r10, r6, r6, r10 // .................*.............. - // sub.w r11, r4, r7, lsl #1 // ..................*............. - // smull r14, r7, r4, r12 // ...................*............ - // mul r4, r10, r2 // ......................*......... - // smull r12, r11, r11, r8 // .....................*.......... - // smlal r10, r6, r4, r3 // ........................*....... - // mul r10, r12, r2 // .......................*........ - // mul r4, r14, r2 // ...........................*.... - // smlal r12, r11, r10, r3 // .........................*...... - // str.w r11, [r0, #768] // ..........................*..... - // smlal r14, r7, r4, r3 // .............................*.. - // str.w r7, [r0, #256] // ..............................*. - // mul r10, r5, r2 // ..................*............. - // str.w r6, [r0, #512] // ............................*... - // smlal r5, r9, r10, r3 // ....................*........... - // str r9, [r0], #4 // ...............................* + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // add r4, r7 // *.............................. + // smull r11, r6, r10, r9 // *.............................. + // smull r10, r14, r4, r5 // .*............................. + // sub.w r4, r4, r7, lsl #1 // .*............................. + // mul r5, r11, r2 // ..*............................ + // ldr r12, [r1, #-20] // ..*............................ + // mul r7, r10, r2 // ...*........................... + // smlal r11, r6, r5, r3 // ....*.......................... + // ldr.w r11, [r0] // ...*........................... + // smull r5, r12, r4, r12 // ......*........................ + // smlal r10, r14, r7, r3 // .....*......................... + // add r11, r6 // ......*........................ + // mul r10, r5, r2 // ........*...................... + // sub.w r6, r11, r6, lsl #1 // .......*....................... + // add r11, r14 // .......*....................... + // smlal r5, r12, r10, r3 // ..........*.................... + // sub.w r14, r11, r14, lsl #1 // ........*...................... + // smull r8, r10, r11, r8 // ............*.................. + // add r6, r12 // ............*.................. + // mul r5, r8, r2 // ....................*.......... + // ldr r11, [r1, #-12] // .........*..................... + // smlal r8, r10, r5, r3 // ......................*........ + // ldr r8, [r1, #-4] // ...........*................... + // str r10, [r0], #4 // .......................*....... + // sub.w r10, r6, r12, lsl #1 // .............*................. + // smull r5, r6, r6, r11 // ................*.............. + // ldr r12, [r1, #-8] // .....*......................... + // smull r10, r8, r10, r8 // ...............*............... + // mul r4, r5, r2 // ..................*............ + // vmov r11, s9 // ......................*........ + // smull r12, r14, r14, r12 // .........*..................... + // cmp.w r0, r11 // .......................*....... + // mul r11, r10, r2 // .................*............. + // smlal r5, r6, r4, r3 // .....................*......... + // smlal r10, r8, r11, r3 // ...................*........... + // str.w r8, [r0, #764] // ........................*...... + // mul r11, r12, r2 // ...........*................... + // str.w r6, [r0, #252] // .........................*..... + // smlal r12, r14, r11, r3 // .............*................. + // str.w r14, [r0, #508] // ..............*................ + // bne.w layer78_loop // .........................*..... // restore registers diff --git a/examples/opt/armv7m/keccakf1600_adomnicai_m7_opt_m7.s b/examples/opt/armv7m/keccakf1600_adomnicai_m7_opt_m7.s index 24fa9a88..121787c7 100644 --- a/examples/opt/armv7m/keccakf1600_adomnicai_m7_opt_m7.s +++ b/examples/opt/armv7m/keccakf1600_adomnicai_m7_opt_m7.s @@ -2826,4 +2826,4 @@ KeccakP1600_Permute_Round1Mod4: add sp, #mSize pop { r4 - r12, pc } -.size KeccakP1600_Permute, .-KeccakP1600_Permute +.size KeccakF1600_StatePermute_adomnicai_m7_opt_m7, .-KeccakF1600_StatePermute_adomnicai_m7_opt_m7 diff --git a/examples/opt/armv7m/loop_subs_opt_m7.s b/examples/opt/armv7m/loop_subs_opt_m7.s index 7603484d..e432f804 100644 --- a/examples/opt/armv7m/loop_subs_opt_m7.s +++ b/examples/opt/armv7m/loop_subs_opt_m7.s @@ -26,15 +26,15 @@ start2: // Cycle bound: 3.0 // IPC bound: 1.67 // - // Wall time: 0.02s - // User time: 0.02s + // Wall time: 0.01s + // User time: 0.01s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - eor.w r0, r0, r7 // *............................. + eor.w r6, r0, r7 // *............................. subs.w r5, r5, #1 // .*............................ - mul r1, r0, r8 // .*............................ + mul r1, r6, r8 // .*............................ eor.w r0, r1, r4 // ...*.......................... bne.w start2 // ...*.......................... // @slothy:branch diff --git a/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s b/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s index 3af47ba0..cfc7bfc2 100644 --- a/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s +++ b/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s @@ -146,628 +146,948 @@ small_ntt_asm_769_opt_m7: // s24: tmp // s25: twiddle_ptr vmov s24, tmp - vmov s25, twiddle_ptr layer1234_loop: - // Instructions: 299 - // Expected cycles: 153 - // Expected IPC: 1.95 + // Instructions: 302 + // Expected cycles: 159 + // Expected IPC: 1.90 // - // ------------------------------------------------------------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - vmov r6, s8 // *.......................................................................................................................................................................................................................................................................................................... - ldr.w r3, [r0, #352] // .*......................................................................................................................................................................................................................................................................................................... - ldr.w r11, [r0, #96] // ....*...................................................................................................................................................................................................................................................................................................... - vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................ - ldr.w r1, [r0, #288] // ........*.................................................................................................................................................................................................................................................................................................. - smulwb r10, r6, r3 // .....*..................................................................................................................................................................................................................................................................................................... - ldr.w r2, [r0, #480] // ..............*............................................................................................................................................................................................................................................................................................ - smulwt r7, r6, r3 // ...........*............................................................................................................................................................................................................................................................................................... - ldr.w r4, [r0, #32] // ............*.............................................................................................................................................................................................................................................................................................. - smulwb r8, r6, r1 // ...............*........................................................................................................................................................................................................................................................................................... - ldr.w r5, [r0, #416] // ...*....................................................................................................................................................................................................................................................................................................... - smulwb r14, r6, r2 // ...........................*............................................................................................................................................................................................................................................................................... - ldr.w r9, [r0, #160] // ......*.................................................................................................................................................................................................................................................................................................... - smulwt r2, r6, r2 // .............................*............................................................................................................................................................................................................................................................................. - ldr.w r3, [r0, #224] // ..........*................................................................................................................................................................................................................................................................................................ - smulwt r1, r6, r1 // .............*............................................................................................................................................................................................................................................................................................. - movw r0, #24608 // ................*.......................................................................................................................................................................................................................................................................................... - smlabt r2, r2, r12, r0 // .................................*......................................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ...............................*........................................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r8, r8, r12, r0 // .....................*..................................................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r1, r1, r12, r0 // .......................*................................................................................................................................................................................................................................................................................... - pkhtb r14, r2, r14, asr #16 // ....................................*...................................................................................................................................................................................................................................................................... - smlabt r2, r10, r12, r0 // .........................*................................................................................................................................................................................................................................................................................. - pkhtb r8, r1, r8, asr #16 // ..........................*................................................................................................................................................................................................................................................................................ - smulwb r10, r6, r5 // .......*................................................................................................................................................................................................................................................................................................... - usub16 r1, r3, r14 // ......................................*.................................................................................................................................................................................................................................................................... - smulwt r6, r6, r5 // .........*................................................................................................................................................................................................................................................................................................. - uadd16 r14, r3, r14 // ........................................*.................................................................................................................................................................................................................................................................. - smlabt r3, r7, r12, r0 // ...................*....................................................................................................................................................................................................................................................................................... - vmov r5, s9 // ................................*.......................................................................................................................................................................................................................................................................... - smlabt r7, r6, r12, r0 // ..................*........................................................................................................................................................................................................................................................................................ - pkhtb r3, r3, r2, asr #16 // ..................................*........................................................................................................................................................................................................................................................................ - smlabt r6, r10, r12, r0 // .................*......................................................................................................................................................................................................................................................................................... - usub16 r10, r11, r3 // ..................................................*........................................................................................................................................................................................................................................................ - smulwb r2, r5, r14 // ..........................................*................................................................................................................................................................................................................................................................ - uadd16 r11, r11, r3 // ................................................*.......................................................................................................................................................................................................................................................... - smulwt r3, r5, r14 // ............................................*.............................................................................................................................................................................................................................................................. - pkhtb r7, r7, r6, asr #16 // ....................*...................................................................................................................................................................................................................................................................................... - smlabt r6, r2, r12, r0 // ..............................................*............................................................................................................................................................................................................................................................ - uadd16 r2, r9, r7 // ........................*.................................................................................................................................................................................................................................................................................. - smlabt r14, r3, r12, r0 // ...................................................*....................................................................................................................................................................................................................................................... - usub16 r7, r9, r7 // ......................*.................................................................................................................................................................................................................................................................................... - smulwt r3, r5, r2 // .....................................*..................................................................................................................................................................................................................................................................... - uadd16 r9, r4, r8 // ..............................*............................................................................................................................................................................................................................................................................ - smulwb r5, r5, r2 // ...................................*....................................................................................................................................................................................................................................................................... - usub16 r2, r4, r8 // ............................*.............................................................................................................................................................................................................................................................................. - smlabt r4, r3, r12, r0 // .........................................*................................................................................................................................................................................................................................................................. - vmov r3, s10 // .............................................*............................................................................................................................................................................................................................................................. - smlabt r8, r5, r12, r0 // .......................................*................................................................................................................................................................................................................................................................... - pkhtb r5, r14, r6, asr #16 // .....................................................*..................................................................................................................................................................................................................................................... - smulwb r6, r3, r7 // ........................................................*.................................................................................................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................................................................................................... - smulwt r14, r3, r7 // ..........................................................*................................................................................................................................................................................................................................................ - pkhtb r8, r4, r8, asr #16 // ...........................................*............................................................................................................................................................................................................................................................... - smulwb r7, r3, r1 // ...............................................*........................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smulwt r4, r3, r1 // .................................................*......................................................................................................................................................................................................................................................... - usub16 r1, r11, r5 // .......................................................*................................................................................................................................................................................................................................................... - smlabt r3, r7, r12, r0 // ....................................................*...................................................................................................................................................................................................................................................... - vmov r7, s12 // ...........................................................*............................................................................................................................................................................................................................................... - smlabt r4, r4, r12, r0 // ......................................................*.................................................................................................................................................................................................................................................... - uadd16 r11, r11, r5 // ...............................................................*........................................................................................................................................................................................................................................... - smulwb r5, r7, r1 // ..................................................................*........................................................................................................................................................................................................................................ - pkhtb r4, r4, r3, asr #16 // .........................................................*................................................................................................................................................................................................................................................. - smulwt r3, r7, r1 // ................................................................*.......................................................................................................................................................................................................................................... - usub16 r7, r10, r4 // .....................................................................*..................................................................................................................................................................................................................................... - smlabt r1, r5, r12, r0 // ......................................................................*.................................................................................................................................................................................................................................... - uadd16 r4, r10, r4 // .............................................................*............................................................................................................................................................................................................................................. - smlabt r3, r3, r12, r0 // ....................................................................*...................................................................................................................................................................................................................................... - vmov r5, s11 // .............................................................................*............................................................................................................................................................................................................................. - smlabt r6, r6, r12, r0 // ............................................................*.............................................................................................................................................................................................................................................. - pkhtb r10, r3, r1, asr #16 // ...............................................................................*........................................................................................................................................................................................................................... - smulwt r3, r5, r11 // ..................................................................................*........................................................................................................................................................................................................................ - uadd16 r1, r9, r8 // ...........................................................................*............................................................................................................................................................................................................................... - smulwb r11, r5, r11 // ................................................................................*.......................................................................................................................................................................................................................... - usub16 r5, r9, r8 // ...................................................................................*....................................................................................................................................................................................................................... - smlabt r8, r14, r12, r0 // ..............................................................*............................................................................................................................................................................................................................................ - vmov r9, s13 // ...................................................................*....................................................................................................................................................................................................................................... - smlabt r14, r11, r12, r0 // ....................................................................................*...................................................................................................................................................................................................................... - pkhtb r6, r8, r6, asr #16 // .................................................................*......................................................................................................................................................................................................................................... - smlabt r8, r3, r12, r0 // ......................................................................................*.................................................................................................................................................................................................................... - uadd16 r11, r2, r6 // .........................................................................*................................................................................................................................................................................................................................. - smulwb r3, r9, r4 // ........................................................................*.................................................................................................................................................................................................................................. - pkhtb r8, r8, r14, asr #16 // .........................................................................................*................................................................................................................................................................................................................. - smulwt r14, r9, r4 // ..........................................................................*................................................................................................................................................................................................................................ - usub16 r2, r2, r6 // .......................................................................*................................................................................................................................................................................................................................... - smlabt r6, r3, r12, r0 // ............................................................................*.............................................................................................................................................................................................................................. - vmov r3, s14 // .....................................................................................*..................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................ - vmov r4, s20 // ..............................................................................................*............................................................................................................................................................................................................ - smulwb r9, r3, r7 // ........................................................................................*.................................................................................................................................................................................................................. - pkhtb r6, r14, r6, asr #16 // .................................................................................*......................................................................................................................................................................................................................... - smulwt r3, r3, r7 // ...........................................................................................*............................................................................................................................................................................................................... - uadd16 r7, r5, r10 // ............................................................................................................................*.............................................................................................................................................................................. - smlabt r9, r9, r12, r0 // .............................................................................................*............................................................................................................................................................................................................. - vmov r14, s17 // ..............................................................................................................................*............................................................................................................................................................................ - smlabt r3, r3, r12, r0 // ...............................................................................................*........................................................................................................................................................................................................... - usub16 r10, r5, r10 // ...........................................................................................................................................*............................................................................................................................................................... - smulwt r5, r14, r7 // ................................................................................................................................*.......................................................................................................................................................................... - pkhtb r3, r3, r9, asr #16 // .................................................................................................*......................................................................................................................................................................................................... - smulwb r14, r14, r7 // ....................................................................................................................................*...................................................................................................................................................................... - vmov r7, s19 // ..................................................................................................................*........................................................................................................................................................................................ - smlabt r5, r5, r12, r0 // ...................................................................................................................................*....................................................................................................................................................................... - usub16 r9, r11, r6 // .......................................................................................*................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ........................................................................................................................................*.................................................................................................................................................................. - uadd16 r6, r11, r6 // ............................................................................................*.............................................................................................................................................................................................................. - smulwt r11, r4, r9 // ................................................................................................*.......................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smulwb r4, r4, r9 // ..................................................................................................*........................................................................................................................................................................................................ - pkhtb r5, r5, r14, asr #16 // ...............................................................................................................................................*........................................................................................................................................................... - smulwt r9, r7, r6 // .........................................................................................................................*................................................................................................................................................................................. - uadd16 r14, r2, r3 // .....................................................................................................*..................................................................................................................................................................................................... - smlabt r4, r4, r12, r0 // .....................................................................................................................*..................................................................................................................................................................................... - usub16 r2, r2, r3 // ...................................................................................................*....................................................................................................................................................................................................... - smlabt r11, r11, r12, r0 // ....................................................................................................*...................................................................................................................................................................................................... - vmov r3, s18 // .............................................................................................................................................*............................................................................................................................................................. - smulwb r6, r7, r6 // .............................................................................................................................*............................................................................................................................................................................. - pkhtb r7, r11, r4, asr #16 // ..........................................................................................................................*................................................................................................................................................................................ - smulwt r4, r3, r10 // ................................................................................................................................................*.......................................................................................................................................................... - vmov r11, s21 // ..........................................................................................................*................................................................................................................................................................................................ - smulwb r3, r3, r10 // ......................................................................................................................................................*.................................................................................................................................................... - vmov s5, r7 // ..................................................................................................................................*........................................................................................................................................................................ - smlabt r7, r9, r12, r0 // ...............................................................................................................................*........................................................................................................................................................................... - vmov s2, r5 // .................................................................................................................................................*......................................................................................................................................................... - smlabt r5, r6, r12, r0 // .................................................................................................................................*......................................................................................................................................................................... - uadd16 r9, r1, r8 // ........................................................................................................*.................................................................................................................................................................................................. - smlabt r6, r3, r12, r0 // ..........................................................................................................................................................*................................................................................................................................................ - usub16 r3, r1, r8 // ............................................................................................................*.............................................................................................................................................................................................. - smulwb r1, r11, r14 // .............................................................................................................*............................................................................................................................................................................................. - pkhtb r8, r7, r5, asr #16 // .....................................................................................................................................*..................................................................................................................................................................... - smulwt r5, r11, r14 // ...............................................................................................................*........................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r10, r1, r12, r0 // .................................................................................................................*......................................................................................................................................................................................... - vmov r1, s15 // .......................................................................................................................................*................................................................................................................................................................... - smlabt r14, r5, r12, r0 // ...................................................................................................................*....................................................................................................................................................................................... - vmov s4, r8 // .................................................................................................................................................................*......................................................................................................................................... - smulwb r11, r1, r9 // ..........................................................................................................................................*................................................................................................................................................................ - vmov r5, s16 // ....................................................................................................................*...................................................................................................................................................................................... - smulwt r8, r1, r9 // ..................................................................................................................................................*........................................................................................................................................................ - vmov r1, s22 // ..........................................................................................*................................................................................................................................................................................................................ - smlabt r4, r4, r12, r0 // ....................................................................................................................................................*...................................................................................................................................................... - pkhtb r7, r14, r10, asr #16 // ......................................................................................................................*.................................................................................................................................................................................... - smlabt r8, r8, r12, r0 // ........................................................................................................................................................*.................................................................................................................................................. - pkhtb r4, r4, r6, asr #16 // ...................................................................................................................................................................*....................................................................................................................................... - smlabt r9, r11, r12, r0 // ..............................................................................................................................................*............................................................................................................................................................ - vmov r11, s23 // .........................................................................................................................................*................................................................................................................................................................. - smulwt r14, r1, r2 // .......................................................................................................*................................................................................................................................................................................................... - vmov s6, r7 // ........................................................................................................................*.................................................................................................................................................................................. - smulwb r1, r1, r2 // ......................................................................................................*.................................................................................................................................................................................................... - pkhtb r10, r8, r9, asr #16 // ...........................................................................................................................................................*............................................................................................................................................... - smlabt r9, r14, r12, r0 // ...........................................................................................................*............................................................................................................................................................................................... - vmov r8, s8 // .....................................................................................................................................................*..................................................................................................................................................... - smlabt r14, r1, r12, r0 // .........................................................................................................*................................................................................................................................................................................................. - ldr.w r7, [r11, #320] // .......................................................................................................................................................*................................................................................................................................................... - smulwt r6, r5, r3 // .......................................................................................................................*................................................................................................................................................................................... - ldr.w r2, [r11, #256] // .............................................................................................................................................................*............................................................................................................................................. - smulwb r1, r5, r3 // ......................................................................................................................................*.................................................................................................................................................................... - pkhtb r14, r9, r14, asr #16 // ..............................................................................................................*............................................................................................................................................................................................ - smlabt r5, r6, r12, r0 // ...........................................................................................................................*............................................................................................................................................................................... - vmov s7, r14 // ................................................................................................................*.......................................................................................................................................................................................... - smulwb r14, r8, r2 // ..................................................................................................................................................................*........................................................................................................................................ - ldr.w r3, [r11, #0] // .........................................................................................................................................................*................................................................................................................................................. - smlabt r1, r1, r12, r0 // ............................................................................................................................................*.............................................................................................................................................................. - vmov s3, r4 // .........................................................................................................................................................................*................................................................................................................................. - smulwb r4, r8, r7 // ............................................................................................................................................................*.............................................................................................................................................. - vmov s0, r10 // ...................................................................................................................................................................................*....................................................................................................................... - smulwt r6, r8, r2 // ....................................................................................................................................................................*...................................................................................................................................... - pkhtb r5, r5, r1, asr #16 // ...................................................................................................................................................*....................................................................................................................................................... - smulwt r1, r8, r7 // ..............................................................................................................................................................*............................................................................................................................................ - movw r0, #24608 // ...............................................................................................................................................................*........................................................................................................................................... - smlabt r7, r14, r12, r0 // ......................................................................................................................................................................*.................................................................................................................................... - ldr.w r9, [r11, #384] // .....................................................................................................................................................................*..................................................................................................................................... - smlabt r6, r6, r12, r0 // ........................................................................................................................................................................*.................................................................................................................................. - ldr.w r10, [r11, #448] // ....................................................................................................................................................................................*...................................................................................................................... - smlabt r14, r4, r12, r0 // ................................................................................................................................................................*.......................................................................................................................................... - pkhtb r7, r6, r7, asr #16 // ...........................................................................................................................................................................*............................................................................................................................... - smulwb r4, r8, r9 // ..........................................................................................................................................................................*................................................................................................................................ - usub16 r6, r3, r7 // .............................................................................................................................................................................*............................................................................................................................. - smulwb r2, r8, r10 // ........................................................................................................................................................................................*.................................................................................................................. - vmov s1, r5 // ......................................................................................................................................................................................*.................................................................................................................... - smlabt r1, r1, r12, r0 // ..............................................................................................................................................................................*............................................................................................................................ - uadd16 r3, r3, r7 // ...............................................................................................................................................................................*........................................................................................................................... - smulwt r7, r8, r9 // ............................................................................................................................................................................*.............................................................................................................................. - pkhtb r9, r1, r14, asr #16 // .................................................................................................................................................................................*......................................................................................................................... - smulwt r1, r8, r10 // ..........................................................................................................................................................................................*................................................................................................................ - ldr.w r8, [r11, #64] // .......................................................................................................................................................................*................................................................................................................................... - smlabt r5, r4, r12, r0 // ..................................................................................................................................................................................*........................................................................................................................ - ldr.w r4, [r11, #128] // .....................................................................................................................................................................................*..................................................................................................................... - smlabt r7, r7, r12, r0 // ................................................................................................................................................................................*.......................................................................................................................... - ldr.w r10, [r11, #192] // ...........................................................................................................................................................................................*............................................................................................................... - smlabt r14, r2, r12, r0 // ............................................................................................................................................................................................*.............................................................................................................. - pkhtb r11, r7, r5, asr #16 // .......................................................................................................................................................................................*................................................................................................................... - vmov r2, s9 // .............................................................................................................................................................................................*............................................................................................................. - uadd16 r7, r4, r11 // .........................................................................................................................................................................................*................................................................................................................. - smlabt r1, r1, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................ - usub16 r5, r4, r11 // ...............................................................................................................................................................................................*........................................................................................................... - smulwb r11, r2, r7 // ................................................................................................................................................................................................*.......................................................................................................... - pkhtb r14, r1, r14, asr #16 // .................................................................................................................................................................................................*......................................................................................................... - smulwt r4, r2, r7 // ..................................................................................................................................................................................................*........................................................................................................ - usub16 r7, r10, r14 // ...................................................................................................................................................................................................*....................................................................................................... - smlabt r1, r11, r12, r0 // ....................................................................................................................................................................................................*...................................................................................................... - uadd16 r10, r10, r14 // .....................................................................................................................................................................................................*..................................................................................................... - smlabt r4, r4, r12, r0 // ......................................................................................................................................................................................................*.................................................................................................... - vmov r11, s10 // .......................................................................................................................................................................................................*................................................................................................... - smulwb r14, r2, r10 // ........................................................................................................................................................................................................*.................................................................................................. - pkhtb r1, r4, r1, asr #16 // .........................................................................................................................................................................................................*................................................................................................. - smulwt r10, r2, r10 // ..........................................................................................................................................................................................................*................................................................................................ - usub16 r4, r3, r1 // ...........................................................................................................................................................................................................*............................................................................................... - smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................*.............................................................................................. - uadd16 r1, r3, r1 // .............................................................................................................................................................................................................*............................................................................................. - smlabt r3, r10, r12, r0 // ..............................................................................................................................................................................................................*............................................................................................ - usub16 r10, r8, r9 // .................................................................................................................................................................................................................*......................................................................................... - smulwt r2, r11, r7 // ..................................................................................................................................................................................................................*........................................................................................ - uadd16 r8, r8, r9 // ...............................................................................................................................................................................................................*........................................................................................... - smulwb r9, r11, r5 // ................................................................................................................................................................................................................*.......................................................................................... - pkhtb r3, r3, r14, asr #16 // ...................................................................................................................................................................................................................*....................................................................................... - smulwt r14, r11, r5 // ....................................................................................................................................................................................................................*...................................................................................... - usub16 r5, r8, r3 // .....................................................................................................................................................................................................................*..................................................................................... - smlabt r9, r9, r12, r0 // ......................................................................................................................................................................................................................*.................................................................................... - uadd16 r3, r8, r3 // .......................................................................................................................................................................................................................*................................................................................... - smlabt r8, r14, r12, r0 // ........................................................................................................................................................................................................................*.................................................................................. - vmov r14, s12 // .........................................................................................................................................................................................................................*................................................................................. - smulwb r7, r11, r7 // ..........................................................................................................................................................................................................................*................................................................................ - pkhtb r9, r8, r9, asr #16 // ...........................................................................................................................................................................................................................*............................................................................... - smulwb r11, r14, r5 // ............................................................................................................................................................................................................................*.............................................................................. - usub16 r8, r6, r9 // .............................................................................................................................................................................................................................*............................................................................. - smlabt r7, r7, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................ - uadd16 r9, r6, r9 // ...............................................................................................................................................................................................................................*........................................................................... - smlabt r6, r2, r12, r0 // .................................................................................................................................................................................................................................*......................................................................... - vmov r2, s13 // ................................................................................................................................................................................................................................*.......................................................................... - smulwt r14, r14, r5 // ..................................................................................................................................................................................................................................*........................................................................ - pkhtb r7, r6, r7, asr #16 // ...................................................................................................................................................................................................................................*....................................................................... - smlabt r5, r11, r12, r0 // ....................................................................................................................................................................................................................................*...................................................................... - uadd16 r6, r10, r7 // .....................................................................................................................................................................................................................................*..................................................................... - smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................................................*.................................................................... - usub16 r11, r10, r7 // .......................................................................................................................................................................................................................................*................................................................... - smulwb r7, r2, r6 // ........................................................................................................................................................................................................................................*.................................................................. - pkhtb r5, r14, r5, asr #16 // ...........................................................................................................................................................................................................................................*............................................................... - smulwt r6, r2, r6 // ..........................................................................................................................................................................................................................................*................................................................ - uadd16 r2, r4, r5 // .............................................................................................................................................................................................................................................*............................................................. - smlabt r10, r7, r12, r0 // ............................................................................................................................................................................................................................................*.............................................................. - vmov r7, s11 // .........................................................................................................................................................................................................................................*................................................................. - smlabt r14, r6, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................ - usub16 r5, r4, r5 // ...............................................................................................................................................................................................................................................*........................................................... - smulwb r4, r7, r3 // ................................................................................................................................................................................................................................................*.......................................................... - pkhtb r10, r14, r10, asr #16 // .................................................................................................................................................................................................................................................*......................................................... - smulwt r14, r7, r3 // ..................................................................................................................................................................................................................................................*........................................................ - usub16 r6, r9, r10 // ...................................................................................................................................................................................................................................................*....................................................... - smlabt r4, r4, r12, r0 // ....................................................................................................................................................................................................................................................*...................................................... - vmov r7, s14 // .....................................................................................................................................................................................................................................................*..................................................... - smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................................................................*.................................................... - uadd16 r9, r9, r10 // .......................................................................................................................................................................................................................................................*................................................... - smulwt r10, r7, r11 // ..........................................................................................................................................................................................................................................................*................................................ - pkhtb r3, r14, r4, asr #16 // .........................................................................................................................................................................................................................................................*................................................. - smulwb r7, r7, r11 // ........................................................................................................................................................................................................................................................*.................................................. - usub16 r14, r1, r3 // .............................................................................................................................................................................................................................................................*............................................. - smlabt r4, r10, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................ - vmov r11, s1 // ...........................................................................................................................................................................................................................................................*............................................... - smlabt r7, r7, r12, r0 // ............................................................................................................................................................................................................................................................*.............................................. - usub16 r10, r14, r11 // ...............................................................................................................................................................................................................................................................*........................................... - vmov r0, s23 // ................................................................................................................................................................................................................................................................*.......................................... - str.w r10, [r0, #96] // ..................................................................................................................................................................................................................................................................*........................................ - vmov r10, s2 // ....................................................................................................................................................................................................................................................................*...................................... - pkhtb r4, r4, r7, asr #16 // .......................................................................................................................................................................................................................................................................*................................... - vmov r7, s5 // ........................................................................................................................................................................................................................................................................*.................................. - uadd16 r14, r14, r11 // .................................................................................................................................................................................................................................................................*......................................... - str.w r14, [r0, #64] // ...................................................................................................................................................................................................................................................................*....................................... - usub16 r14, r2, r10 // .....................................................................................................................................................................................................................................................................*..................................... - str.w r14, [r0, #160] // ......................................................................................................................................................................................................................................................................*.................................... - uadd16 r14, r6, r7 // .........................................................................................................................................................................................................................................................................*................................. - str.w r14, [r0, #320] // ..........................................................................................................................................................................................................................................................................*................................ - usub16 r6, r6, r7 // ...........................................................................................................................................................................................................................................................................*............................... - vmov r14, s7 // ..............................................................................................................................................................................................................................................................................*............................ - usub16 r11, r8, r4 // .............................................................................................................................................................................................................................................................................*............................. - str.w r6, [r0, #352] // ............................................................................................................................................................................................................................................................................*.............................. - usub16 r6, r11, r14 // ...............................................................................................................................................................................................................................................................................*........................... - str.w r6, [r0, #480] // ..................................................................................................................................................................................................................................................................................*........................ - uadd16 r11, r11, r14 // .................................................................................................................................................................................................................................................................................*......................... - vmov r6, s3 // ....................................................................................................................................................................................................................................................................................*...................... - uadd16 r14, r1, r3 // ...................................................................................................................................................................................................................................................................................*....................... - vmov r3, s0 // ................................................................................................................................................................................................................................................................................*.......................... - usub16 r1, r5, r6 // ...........................................................................................................................................................................................................................................................................................*............... - str.w r1, [r0, #224] // ............................................................................................................................................................................................................................................................................................*.............. - uadd16 r7, r2, r10 // .....................................................................................................................................................................................................................................................................................*..................... - str.w r7, [r0, #128] // ......................................................................................................................................................................................................................................................................................*.................... - uadd16 r1, r5, r6 // .......................................................................................................................................................................................................................................................................................*................... - str.w r1, [r0, #192] // ........................................................................................................................................................................................................................................................................................*.................. - usub16 r1, r14, r3 // .........................................................................................................................................................................................................................................................................................*................. - str.w r1, [r0, #32] // ..........................................................................................................................................................................................................................................................................................*................ - uadd16 r14, r14, r3 // .............................................................................................................................................................................................................................................................................................*............. - str.w r14, [r0], #4 // ................................................................................................................................................................................................................................................................................................*.......... // @slothy:core - uadd16 r6, r8, r4 // ...............................................................................................................................................................................................................................................................................................*........... - vmov r5, s6 // ..............................................................................................................................................................................................................................................................................................*............ - usub16 r1, r6, r5 // .................................................................................................................................................................................................................................................................................................*......... - str.w r1, [r0, #412] // ..................................................................................................................................................................................................................................................................................................*........ - uadd16 r10, r6, r5 // ...................................................................................................................................................................................................................................................................................................*....... - str.w r11, [r0, #444] // .....................................................................................................................................................................................................................................................................................................*..... - str.w r10, [r0, #380] // ....................................................................................................................................................................................................................................................................................................*...... - vmov r7, s4 // ......................................................................................................................................................................................................................................................................................................*.... - usub16 r5, r9, r7 // .......................................................................................................................................................................................................................................................................................................*... - str.w r5, [r0, #284] // ........................................................................................................................................................................................................................................................................................................*.. - uadd16 r6, r9, r7 // .........................................................................................................................................................................................................................................................................................................*. - str.w r6, [r0, #252] // ..........................................................................................................................................................................................................................................................................................................* + // --------------------------------------------------------------------------------------------------------------------------------------------- original position ---------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + vmov r14, s8 // ...*.......................................................................................................................................................................................................................................................................................................... + ldr.w r11, [r0, #416] // ......*....................................................................................................................................................................................................................................................................................................... + vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... + ldr.w r5, [r0, #352] // ....*......................................................................................................................................................................................................................................................................................................... + ldr.w r9, [r0, #96] // .*............................................................................................................................................................................................................................................................................................................ + // gap // .............................................................................................................................................................................................................................................................................................................. + ldr.w r2, [r0, #288] // ........*..................................................................................................................................................................................................................................................................................................... + smulwb r7, r14, r5 // .............*................................................................................................................................................................................................................................................................................................ + ldr.w r10, [r0, #32] // ..........*................................................................................................................................................................................................................................................................................................... + smulwt r6, r14, r5 // .......*...................................................................................................................................................................................................................................................................................................... + ldr.w r5, [r0, #480] // .....*........................................................................................................................................................................................................................................................................................................ + smulwb r4, r14, r2 // ....................*......................................................................................................................................................................................................................................................................................... + ldr.w r8, [r0, #160] // ............*................................................................................................................................................................................................................................................................................................. + smulwt r3, r14, r2 // ......................*....................................................................................................................................................................................................................................................................................... + ldr.w r2, [r0, #224] // *............................................................................................................................................................................................................................................................................................................. + movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r4, r4, r12, r0 // ........................*..................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r3, r3, r12, r0 // ..........................*................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r6, r6, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... + pkhtb r4, r3, r4, asr #16 // ............................*................................................................................................................................................................................................................................................................................. + smlabt r7, r7, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r3, r14, r5 // .........*.................................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r5, r14, r5 // ...........*.................................................................................................................................................................................................................................................................................................. + pkhtb r7, r6, r7, asr #16 // .....................*........................................................................................................................................................................................................................................................................................ + smlabt r3, r3, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + usub16 r6, r10, r4 // ..............................*............................................................................................................................................................................................................................................................................... + smlabt r5, r5, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + uadd16 r4, r10, r4 // ................................*............................................................................................................................................................................................................................................................................. + smulwb r10, r14, r11 // ...........................*.................................................................................................................................................................................................................................................................................. + pkhtb r3, r5, r3, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... + smulwt r14, r14, r11 // .............................*................................................................................................................................................................................................................................................................................ + usub16 r5, r2, r3 // ......................................*....................................................................................................................................................................................................................................................................... + vmov r11, s9 // ...................................*.......................................................................................................................................................................................................................................................................... + uadd16 r2, r2, r3 // ..................................*........................................................................................................................................................................................................................................................................... + smlabt r3, r10, r12, r0 // ...............................*.............................................................................................................................................................................................................................................................................. + uadd16 r10, r9, r7 // .........................*.................................................................................................................................................................................................................................................................................... + smlabt r14, r14, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + usub16 r7, r9, r7 // .......................*...................................................................................................................................................................................................................................................................................... + smulwb r9, r11, r2 // .....................................*........................................................................................................................................................................................................................................................................ + pkhtb r3, r14, r3, asr #16 // ....................................*......................................................................................................................................................................................................................................................................... + smulwt r2, r11, r2 // .......................................*...................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r14, r9, r12, r0 // .........................................*.................................................................................................................................................................................................................................................................... + uadd16 r9, r8, r3 // ..........................................*................................................................................................................................................................................................................................................................... + smlabt r2, r2, r12, r0 // ...........................................*.................................................................................................................................................................................................................................................................. + usub16 r8, r8, r3 // ........................................*..................................................................................................................................................................................................................................................................... + smulwb r3, r11, r9 // ..............................................*............................................................................................................................................................................................................................................................... + pkhtb r2, r2, r14, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + smulwt r9, r11, r9 // ............................................*................................................................................................................................................................................................................................................................. + uadd16 r11, r10, r2 // ...............................................*.............................................................................................................................................................................................................................................................. + smlabt r3, r3, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + vmov r14, s11 // ................................................................*............................................................................................................................................................................................................................................. + smlabt r9, r9, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + usub16 r10, r10, r2 // ...................................................*.......................................................................................................................................................................................................................................................... + smulwb r2, r14, r11 // ...................................................................*.......................................................................................................................................................................................................................................... + pkhtb r9, r9, r3, asr #16 // ......................................................*....................................................................................................................................................................................................................................................... + smulwt r3, r14, r11 // .....................................................................*........................................................................................................................................................................................................................................ + uadd16 r14, r4, r9 // ........................................................*..................................................................................................................................................................................................................................................... + smlabt r11, r2, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... + vmov r2, s12 // ......................................................................*....................................................................................................................................................................................................................................... + smlabt r3, r3, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + usub16 r4, r4, r9 // ..........................................................*................................................................................................................................................................................................................................................... + smulwt r9, r2, r10 // .............................................................................*................................................................................................................................................................................................................................ + pkhtb r11, r3, r11, asr #16 // ............................................................................*................................................................................................................................................................................................................................. + smulwb r2, r2, r10 // ...........................................................................*.................................................................................................................................................................................................................................. + usub16 r3, r14, r11 // ..................................................................................*........................................................................................................................................................................................................................... + smlabt r10, r9, r12, r0 // .................................................................................*............................................................................................................................................................................................................................ + vmov r9, s16 // .............................................................................................*................................................................................................................................................................................................................ + smlabt r2, r2, r12, r0 // ...............................................................................*.............................................................................................................................................................................................................................. + uadd16 r11, r14, r11 // ................................................................................*............................................................................................................................................................................................................................. + smulwb r14, r9, r3 // ................................................................................................*............................................................................................................................................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r9, r9, r3 // ...............................................................................................*.............................................................................................................................................................................................................. + pkhtb r3, r10, r2, asr #16 // ....................................................................................*......................................................................................................................................................................................................................... + smlabt r14, r14, r12, r0 // ....................................................................................................*......................................................................................................................................................................................................... + vmov r2, s10 // .................................................*............................................................................................................................................................................................................................................................ + smlabt r9, r9, r12, r0 // ..................................................................................................*........................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r10, r2, r5 // ...........................................................*.................................................................................................................................................................................................................................................. + pkhtb r14, r9, r14, asr #16 // ........................................................................................................*..................................................................................................................................................................................................... + smulwt r9, r2, r5 // .............................................................*................................................................................................................................................................................................................................................ + vmov s1, r14 // ..........................................................................................................*................................................................................................................................................................................................... + smlabt r10, r10, r12, r0 // ...............................................................*.............................................................................................................................................................................................................................................. + usub16 r5, r4, r3 // ........................................................................................*..................................................................................................................................................................................................................... + smlabt r14, r9, r12, r0 // .................................................................*............................................................................................................................................................................................................................................ + uadd16 r4, r4, r3 // ..........................................................................................*................................................................................................................................................................................................................... + smulwb r3, r2, r8 // ....................................................*......................................................................................................................................................................................................................................................... + pkhtb r10, r14, r10, asr #16 // ....................................................................*......................................................................................................................................................................................................................................... + smulwt r8, r2, r8 // .....................................................*........................................................................................................................................................................................................................................................ + usub16 r9, r7, r10 // ..........................................................................*................................................................................................................................................................................................................................... + smlabt r2, r3, r12, r0 // .......................................................*...................................................................................................................................................................................................................................................... + vmov r14, s14 // ..............................................................................*............................................................................................................................................................................................................................... + smlabt r3, r8, r12, r0 // .........................................................*.................................................................................................................................................................................................................................................... + uadd16 r10, r7, r10 // ........................................................................*..................................................................................................................................................................................................................................... + smulwb r8, r14, r9 // ...................................................................................*.......................................................................................................................................................................................................................... + vmov r7, s18 // ............................................................................................................*................................................................................................................................................................................................. + smulwt r14, r14, r9 // .....................................................................................*........................................................................................................................................................................................................................ + pkhtb r9, r3, r2, asr #16 // ............................................................*................................................................................................................................................................................................................................................. + smlabt r8, r8, r12, r0 // .......................................................................................*...................................................................................................................................................................................................................... + usub16 r3, r6, r9 // ..............................................................*............................................................................................................................................................................................................................................... + smlabt r2, r14, r12, r0 // .........................................................................................*.................................................................................................................................................................................................................... + uadd16 r6, r6, r9 // ..................................................................*........................................................................................................................................................................................................................................... + smulwb r14, r7, r5 // ...............................................................................................................*.............................................................................................................................................................................................. + pkhtb r9, r2, r8, asr #16 // ...........................................................................................*.................................................................................................................................................................................................................. + smulwt r8, r7, r5 // .................................................................................................................*............................................................................................................................................................................................ + uadd16 r7, r3, r9 // .................................................................................................*............................................................................................................................................................................................................ + smlabt r5, r14, r12, r0 // .....................................................................................................................*........................................................................................................................................................................................ + vmov r14, s21 // ....................................................................................................................*......................................................................................................................................................................................... + smlabt r8, r8, r12, r0 // ......................................................................................................................................*....................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r2, r14, r7 // ......................................................................................................................*....................................................................................................................................................................................... + pkhtb r8, r8, r5, asr #16 // ........................................................................................................................................*..................................................................................................................................................................... + smulwt r7, r14, r7 // .......................................................................................................................*...................................................................................................................................................................................... + vmov r14, s17 // .......................................................................................................................................*...................................................................................................................................................................... + smlabt r5, r2, r12, r0 // .........................................................................................................................*.................................................................................................................................................................................... + vmov s3, r8 // ..........................................................................................................................................*................................................................................................................................................................... + smulwb r8, r14, r4 // ...........................................................................................................................................*.................................................................................................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r2, r14, r4 // ...............................................................................................................................................*.............................................................................................................................................................. + usub16 r4, r3, r9 // ...................................................................................................*.......................................................................................................................................................................................................... + smlabt r14, r7, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + vmov r7, s13 // ......................................................................................*....................................................................................................................................................................................................................... + smlabt r9, r8, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + pkhtb r5, r14, r5, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... + smulwb r3, r7, r10 // ............................................................................................*................................................................................................................................................................................................................. + vmov s6, r5 // ..................................................................................................................................................*........................................................................................................................................................... + smulwt r7, r7, r10 // ..............................................................................................*............................................................................................................................................................................................................... + vmov r10, s23 // .........................................................................................................................................*.................................................................................................................................................................... + smlabt r3, r3, r12, r0 // .......................................................................................................*...................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r8, r7, r12, r0 // .....................................................................................................*........................................................................................................................................................................................................ + vmov r7, s22 // ......................................................................................................*....................................................................................................................................................................................................... + smlabt r5, r2, r12, r0 // ...................................................................................................................................................*.......................................................................................................................................................... + pkhtb r2, r8, r3, asr #16 // ..............................................................................................................*............................................................................................................................................................................................... + smulwb r8, r7, r4 // .........................................................................................................*.................................................................................................................................................................................................... + pkhtb r9, r5, r9, asr #16 // ......................................................................................................................................................*....................................................................................................................................................... + smulwt r7, r7, r4 // ...........................................................................................................*.................................................................................................................................................................................................. + usub16 r3, r6, r2 // ................................................................................................................*............................................................................................................................................................................................. + smlabt r8, r8, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + vmov r4, s15 // ..........................................................................................................................*................................................................................................................................................................................... + smlabt r7, r7, r12, r0 // ...................................................................................................................*.......................................................................................................................................................................................... + uadd16 r2, r6, r2 // ..................................................................................................................*........................................................................................................................................................................................... + smulwt r14, r4, r11 // .............................................................................................................................*................................................................................................................................................................................ + vmov r5, s19 // ................................................................................................................................*............................................................................................................................................................................. + smulwb r6, r4, r11 // ............................................................................................................................*................................................................................................................................................................................. + pkhtb r7, r7, r8, asr #16 // ........................................................................................................................*..................................................................................................................................................................................... + smulwb r8, r5, r2 // ..................................................................................................................................*........................................................................................................................................................................... + vmov r4, s20 // ........................................................................................................................................................*..................................................................................................................................................... + smulwt r11, r5, r2 // ....................................................................................................................................*......................................................................................................................................................................... + vmov s2, r9 // ............................................................................................................................................................*................................................................................................................................................. + smlabt r9, r8, r12, r0 // .............................................................................................................................................*................................................................................................................................................................ + vmov s7, r7 // ............................................................................................................................................*................................................................................................................................................................. + smlabt r11, r11, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + vmov r5, s8 // ....................................................................................................................................................*......................................................................................................................................................... + smulwt r8, r4, r3 // .............................................................................................................................................................*................................................................................................................................................ + pkhtb r9, r11, r9, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + smlabt r11, r6, r12, r0 // ...............................................................................................................................*.............................................................................................................................................................................. + ldr.w r6, [r10, #256] // ..............................................................................................................................................................*............................................................................................................................................... + smlabt r14, r14, r12, r0 // .................................................................................................................................*............................................................................................................................................................................ + vmov s4, r9 // .......................................................................................................................................................................................*...................................................................................................................... + smulwb r7, r4, r3 // ...........................................................................................................................................................*.................................................................................................................................................. + pkhtb r11, r14, r11, asr #16 // ...................................................................................................................................*.......................................................................................................................................................................... + smulwt r2, r5, r6 // ...........................................................................................................................................................................*.................................................................................................................................. + ldr.w r14, [r10, #448] // ................................................................................................................................................................................*............................................................................................................................. + smlabt r7, r7, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + vmov s0, r11 // .....................................................................................................................................*........................................................................................................................................................................ + smlabt r11, r8, r12, r0 // .................................................................................................................................................................*............................................................................................................................................ + ldr.w r8, [r10, #320] // ................................................................................................................................................*............................................................................................................................................................. + smulwt r3, r5, r14 // ......................................................................................................................................................................................*....................................................................................................................... + pkhtb r7, r11, r7, asr #16 // ......................................................................................................................................................................*....................................................................................................................................... + smulwb r6, r5, r6 // .........................................................................................................................................................................*.................................................................................................................................... + vmov s5, r7 // ........................................................................................................................................................................*..................................................................................................................................... + smulwb r7, r5, r8 // .......................................................................................................................................................*...................................................................................................................................................... + movw r0, #24608 // ..................................................................................................................................................................*........................................................................................................................................... + smlabt r6, r6, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + ldr.w r9, [r10, #0] // ....................................................................................................................................................................*......................................................................................................................................... + smlabt r2, r2, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + ldr.w r4, [r10, #384] // ................................................................................................................................................................*............................................................................................................................................. + smulwt r8, r5, r8 // .........................................................................................................................................................*.................................................................................................................................................... + pkhtb r11, r2, r6, asr #16 // ..................................................................................................................................................................................*........................................................................................................................... + smlabt r7, r7, r12, r0 // .....................................................................................................................................................................*........................................................................................................................................ + uadd16 r2, r9, r11 // ....................................................................................................................................................................................*......................................................................................................................... + smlabt r8, r8, r12, r0 // ...................................................................................................................................................................*.......................................................................................................................................... + usub16 r6, r9, r11 // .....................................................................................................................................................................................*........................................................................................................................ + smulwb r9, r5, r4 // .......................................................................................................................................................................*...................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r11, r5, r14 // ........................................................................................................................................................................................*..................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r4, r5, r4 // .................................................................................................................................................................................*............................................................................................................................ + pkhtb r7, r8, r7, asr #16 // ..........................................................................................................................................................................*................................................................................................................................... + smlabt r9, r9, r12, r0 // ...................................................................................................................................................................................*.......................................................................................................................... + ldr.w r5, [r10, #128] // .........................................................................................................................................................................................*.................................................................................................................... + smlabt r8, r4, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................... + ldr.w r14, [r10, #192] // ...........................................................................................................................................................................................*.................................................................................................................. + smlabt r4, r11, r12, r0 // ............................................................................................................................................................................................*................................................................................................................. + pkhtb r11, r8, r9, asr #16 // .............................................................................................................................................................................................*................................................................................................................ + smlabt r8, r3, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................... + usub16 r9, r5, r11 // ...................................................................................................................................................................................................*.......................................................................................................... + vmov r3, s10 // ..............................................................................................................................................................................................................*............................................................................................... + pkhtb r8, r8, r4, asr #16 // .................................................................................................................................................................................................*............................................................................................................ + smulwb r4, r3, r9 // .................................................................................................................................................................................................................*............................................................................................ + uadd16 r11, r5, r11 // ...............................................................................................................................................................................................*.............................................................................................................. + smulwt r5, r3, r9 // ...................................................................................................................................................................................................................*.......................................................................................... + ldr.w r9, [r10, #64] // ..............................................................................................................................................*............................................................................................................................................................... + smlabt r4, r4, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ + usub16 r10, r14, r8 // .....................................................................................................................................................................................................*........................................................................................................ + smlabt r5, r5, r12, r0 // .......................................................................................................................................................................................................................*...................................................................................... + uadd16 r8, r14, r8 // .......................................................................................................................................................................................................*...................................................................................................... + smulwb r14, r3, r10 // ........................................................................................................................................................................................................................*..................................................................................... + pkhtb r4, r5, r4, asr #16 // .........................................................................................................................................................................................................................*.................................................................................... + smulwt r3, r3, r10 // ..........................................................................................................................................................................................................................*................................................................................... + uadd16 r10, r9, r7 // ............................................................................................................................................................................*................................................................................................................................. + smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + vmov r5, s9 // ................................................................................................................................................................................................*............................................................................................................. + smlabt r3, r3, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................... + usub16 r9, r9, r7 // ..............................................................................................................................................................................*............................................................................................................................... + smulwb r7, r5, r8 // .........................................................................................................................................................................................................*.................................................................................................... + pkhtb r3, r3, r14, asr #16 // .................................................................................................................................................................................................................................*............................................................................ + smulwt r14, r5, r8 // ...........................................................................................................................................................................................................*.................................................................................................. + usub16 r8, r6, r4 // ...........................................................................................................................................................................................................................*.................................................................................. + smlabt r7, r7, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + uadd16 r6, r6, r4 // ...............................................................................................................................................................................................................................*.............................................................................. + smlabt r14, r14, r12, r0 // ...............................................................................................................................................................................................................*.............................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r4, r5, r11 // ..................................................................................................................................................................................................*........................................................................................................... + pkhtb r14, r14, r7, asr #16 // ..................................................................................................................................................................................................................*........................................................................................... + smulwt r7, r5, r11 // ....................................................................................................................................................................................................*......................................................................................................... + usub16 r11, r10, r14 // ....................................................................................................................................................................................................................*......................................................................................... + smlabt r4, r4, r12, r0 // ......................................................................................................................................................................................................*....................................................................................................... + vmov r5, s12 // .............................................................................................................................................................................................................................*................................................................................ + smlabt r7, r7, r12, r0 // ........................................................................................................................................................................................................*..................................................................................................... + uadd16 r10, r10, r14 // ......................................................................................................................................................................................................................*....................................................................................... + smulwt r14, r5, r11 // ...................................................................................................................................................................................................................................*.......................................................................... + pkhtb r7, r7, r4, asr #16 // ..........................................................................................................................................................................................................*................................................................................................... + smulwb r4, r5, r11 // ................................................................................................................................................................................................................................*............................................................................. + usub16 r11, r2, r7 // ............................................................................................................................................................................................................*................................................................................................. + smlabt r5, r14, r12, r0 // .......................................................................................................................................................................................................................................*...................................................................... + uadd16 r2, r2, r7 // ................................................................................................................................................................................................................*............................................................................................. + smlabt r4, r4, r12, r0 // .....................................................................................................................................................................................................................................*........................................................................ + usub16 r7, r9, r3 // ..................................................................................................................................................................................................................................*........................................................................... + vmov r14, s14 // ......................................................................................................................................................................................................................................*....................................................................... + uadd16 r9, r9, r3 // ....................................................................................................................................................................................................................................*......................................................................... + smulwb r3, r14, r7 // ........................................................................................................................................................................................................................................*..................................................................... + pkhtb r5, r5, r4, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... + smulwt r4, r14, r7 // ..........................................................................................................................................................................................................................................*................................................................... + uadd16 r14, r11, r5 // ...............................................................................................................................................................................................................................................*.............................................................. + smlabt r7, r3, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + vmov r3, s11 // .............................................................................................................................................................................................................................................*................................................................ + smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + usub16 r5, r11, r5 // ...........................................................................................................................................................................................................................................*.................................................................. + smulwb r11, r3, r10 // ................................................................................................................................................................................................................................................*............................................................. + pkhtb r4, r4, r7, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + smulwt r10, r3, r10 // ..................................................................................................................................................................................................................................................*........................................................... + usub16 r3, r8, r4 // ...................................................................................................................................................................................................................................................*.......................................................... + smlabt r11, r11, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + vmov r7, s13 // .....................................................................................................................................................................................................................................................*........................................................ + smlabt r10, r10, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + uadd16 r8, r8, r4 // .......................................................................................................................................................................................................................................................*...................................................... + smulwb r4, r7, r9 // ........................................................................................................................................................................................................................................................*..................................................... + pkhtb r11, r10, r11, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... + smulwt r10, r7, r9 // ..........................................................................................................................................................................................................................................................*................................................... + vmov r7, s2 // ...........................................................................................................................................................................................................................................................*.................................................. + smlabt r4, r4, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. + usub16 r9, r14, r7 // .............................................................................................................................................................................................................................................................*................................................ + smlabt r0, r10, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + uadd16 r14, r14, r7 // ...............................................................................................................................................................................................................................................................*.............................................. + vmov r10, s5 // ................................................................................................................................................................................................................................................................*............................................. + pkhtb r7, r0, r4, asr #16 // .................................................................................................................................................................................................................................................................*............................................ + vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... + usub16 r4, r2, r11 // ...................................................................................................................................................................................................................................................................*.......................................... + str.w r9, [r0, #160] // ....................................................................................................................................................................................................................................................................*......................................... + usub16 r9, r6, r7 // .......................................................................................................................................................................................................................................................................*...................................... + str.w r14, [r0, #128] // ........................................................................................................................................................................................................................................................................*..................................... + uadd16 r7, r6, r7 // ...........................................................................................................................................................................................................................................................................*.................................. + vmov r6, s7 // ............................................................................................................................................................................................................................................................................*................................. + usub16 r14, r9, r10 // .........................................................................................................................................................................................................................................................................*.................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + uadd16 r11, r2, r11 // .....................................................................................................................................................................................................................................................................*........................................ + str.w r14, [r0, #352] // ..........................................................................................................................................................................................................................................................................*................................... + uadd16 r14, r9, r10 // .............................................................................................................................................................................................................................................................................*................................ + str.w r14, [r0, #320] // ..............................................................................................................................................................................................................................................................................*............................... + uadd16 r10, r3, r6 // ...............................................................................................................................................................................................................................................................................*.............................. + vmov r14, s24 // ................................................................................................................................................................................................................................................................................*............................. + usub16 r3, r3, r6 // ..................................................................................................................................................................................................................................................................................*........................... + str.w r3, [r0, #480] // ...........................................................................................................................................................................................................................................................................................*.................. + str.w r10, [r0, #448] // .................................................................................................................................................................................................................................................................................*............................ + vmov r3, s3 // ...................................................................................................................................................................................................................................................................................*.......................... + uadd16 r9, r5, r3 // ....................................................................................................................................................................................................................................................................................*......................... + str.w r9, [r0, #192] // .....................................................................................................................................................................................................................................................................................*........................ + usub16 r5, r5, r3 // ......................................................................................................................................................................................................................................................................................*....................... + vmov r3, s4 // ......................................................................................................................................................................................................................................................................*....................................... + uadd16 r9, r7, r3 // ..............................................................................................................................................................................................................................................................................................*............... + vmov r10, s0 // ...................................................................................................................................................................................................................................................................................................*.......... + usub16 r7, r7, r3 // ........................................................................................................................................................................................................................................................................................*..................... + str.w r7, [r0, #288] // .........................................................................................................................................................................................................................................................................................*.................... + uadd16 r7, r11, r10 // ....................................................................................................................................................................................................................................................................................................*......... + vmov r6, s1 // ...............................................................................................................................................................................................................................................................................................*.............. + uadd16 r2, r4, r6 // ................................................................................................................................................................................................................................................................................................*............. + str.w r2, [r0, #64] // .................................................................................................................................................................................................................................................................................................*............ + usub16 r4, r4, r6 // ..................................................................................................................................................................................................................................................................................................*........... + str.w r9, [r0, #256] // .........................................................................................................................................................................................................................................................................................................*.... + str.w r7, [r0], #4 // .......................................................................................................................................................................................................................................................................................................*...... // @slothy:core // @slothy:before=cmp + cmp.w r0, r14 // ........................................................................................................................................................................................................................................................................................................*..... // @slothy:id=cmp + usub16 r9, r11, r10 // ......................................................................................................................................................................................................................................................................................................*....... + str.w r4, [r0, #92] // .....................................................................................................................................................................................................................................................................................................*........ + str.w r9, [r0, #28] // ............................................................................................................................................................................................................................................................................................................*. + vmov r11, s6 // ..........................................................................................................................................................................................................................................................................................*................... + uadd16 r9, r8, r11 // ..........................................................................................................................................................................................................................................................................................................*... + str.w r9, [r0, #380] // ...........................................................................................................................................................................................................................................................................................................*.. + usub16 r9, r8, r11 // ............................................................................................................................................................................................................................................................................................*................. + str.w r5, [r0, #220] // .......................................................................................................................................................................................................................................................................................*...................... + str.w r9, [r0, #412] // .............................................................................................................................................................................................................................................................................................*................ + bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* // @slothy:branch + + // ----------------------------------------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr.w r3, [r0, #224] // .............*................................................................................................................................................................................................................................................................................................ + // ldr.w r10, [r0, #96] // ....*......................................................................................................................................................................................................................................................................................................... + // vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... + // vmov r5, s8 // *............................................................................................................................................................................................................................................................................................................. + // ldr.w r6, [r0, #352] // ...*.......................................................................................................................................................................................................................................................................................................... + // ldr.w r8, [r0, #480] // .........*.................................................................................................................................................................................................................................................................................................... + // ldr.w r11, [r0, #416] // .*............................................................................................................................................................................................................................................................................................................ + // smulwt r4, r5, r6 // ........*..................................................................................................................................................................................................................................................................................................... + // ldr.w r2, [r0, #288] // .....*........................................................................................................................................................................................................................................................................................................ + // smulwb r7, r5, r8 // ....................*......................................................................................................................................................................................................................................................................................... + // ldr.w r14, [r0, #32] // .......*...................................................................................................................................................................................................................................................................................................... + // smulwt r9, r5, r8 // .....................*........................................................................................................................................................................................................................................................................................ + // ldr.w r8, [r0, #160] // ...........*.................................................................................................................................................................................................................................................................................................. + // smulwb r6, r5, r6 // ......*....................................................................................................................................................................................................................................................................................................... + // movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... + // smlabt r7, r7, r12, r0 // .......................*...................................................................................................................................................................................................................................................................................... + // smlabt r9, r9, r12, r0 // .........................*.................................................................................................................................................................................................................................................................................... + // smlabt r6, r6, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... + // pkhtb r9, r9, r7, asr #16 // ............................*................................................................................................................................................................................................................................................................................. + // smlabt r7, r4, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // smulwb r4, r5, r2 // ..........*................................................................................................................................................................................................................................................................................................... + // pkhtb r6, r7, r6, asr #16 // ......................*....................................................................................................................................................................................................................................................................................... + // smulwt r2, r5, r2 // ............*................................................................................................................................................................................................................................................................................................. + // usub16 r7, r10, r6 // ....................................*......................................................................................................................................................................................................................................................................... + // smlabt r4, r4, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + // uadd16 r10, r10, r6 // ..................................*........................................................................................................................................................................................................................................................................... + // smlabt r6, r2, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + // smulwb r2, r5, r11 // ...........................*.................................................................................................................................................................................................................................................................................. + // pkhtb r4, r6, r4, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... + // smulwt r5, r5, r11 // .............................*................................................................................................................................................................................................................................................................................ + // usub16 r6, r14, r4 // ........................*..................................................................................................................................................................................................................................................................................... + // smlabt r2, r2, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + // uadd16 r4, r14, r4 // ..........................*................................................................................................................................................................................................................................................................................... + // smlabt r11, r5, r12, r0 // ...................................*.......................................................................................................................................................................................................................................................................... + // uadd16 r14, r3, r9 // ................................*............................................................................................................................................................................................................................................................................. + // vmov r5, s9 // ...............................*.............................................................................................................................................................................................................................................................................. + // pkhtb r11, r11, r2, asr #16 // ......................................*....................................................................................................................................................................................................................................................................... + // smulwb r2, r5, r14 // .....................................*........................................................................................................................................................................................................................................................................ + // usub16 r9, r3, r9 // ..............................*............................................................................................................................................................................................................................................................................... + // smulwt r3, r5, r14 // .......................................*...................................................................................................................................................................................................................................................................... + // usub16 r14, r8, r11 // ...........................................*.................................................................................................................................................................................................................................................................. + // smlabt r2, r2, r12, r0 // ........................................*..................................................................................................................................................................................................................................................................... + // uadd16 r8, r8, r11 // .........................................*.................................................................................................................................................................................................................................................................... + // smlabt r3, r3, r12, r0 // ..........................................*................................................................................................................................................................................................................................................................... + // smulwt r11, r5, r8 // ..............................................*............................................................................................................................................................................................................................................................... + // pkhtb r2, r3, r2, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + // smulwb r8, r5, r8 // ............................................*................................................................................................................................................................................................................................................................. + // uadd16 r3, r10, r2 // ...............................................*.............................................................................................................................................................................................................................................................. + // smlabt r11, r11, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + // vmov r5, s10 // ........................................................................*..................................................................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + // usub16 r10, r10, r2 // ...................................................*.......................................................................................................................................................................................................................................................... + // smulwb r2, r5, r14 // ..................................................................................*........................................................................................................................................................................................................................... + // smulwt r14, r5, r14 // ....................................................................................*......................................................................................................................................................................................................................... + // pkhtb r11, r11, r8, asr #16 // .....................................................*........................................................................................................................................................................................................................................................ + // smlabt r2, r2, r12, r0 // ......................................................................................*....................................................................................................................................................................................................................... + // uadd16 r8, r4, r11 // .......................................................*...................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // ........................................................................................*..................................................................................................................................................................................................................... + // usub16 r4, r4, r11 // ...........................................................*.................................................................................................................................................................................................................................................. + // smulwb r11, r5, r9 // ..........................................................................*................................................................................................................................................................................................................................... + // pkhtb r2, r14, r2, asr #16 // .............................................................................................*................................................................................................................................................................................................................ + // smulwt r5, r5, r9 // ............................................................................*................................................................................................................................................................................................................................. + // usub16 r9, r6, r2 // ...............................................................................................*.............................................................................................................................................................................................................. + // smlabt r11, r11, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................... + // vmov r14, s11 // .................................................*............................................................................................................................................................................................................................................................ + // smlabt r5, r5, r12, r0 // ................................................................................*............................................................................................................................................................................................................................. + // uadd16 r6, r6, r2 // .................................................................................................*............................................................................................................................................................................................................ + // smulwb r2, r14, r3 // ....................................................*......................................................................................................................................................................................................................................................... + // pkhtb r11, r5, r11, asr #16 // ...................................................................................*.......................................................................................................................................................................................................................... + // smulwt r5, r14, r3 // ......................................................*....................................................................................................................................................................................................................................................... + // vmov r14, s12 // .........................................................*.................................................................................................................................................................................................................................................... + // smlabt r3, r2, r12, r0 // ........................................................*..................................................................................................................................................................................................................................................... + // uadd16 r2, r7, r11 // .........................................................................................*.................................................................................................................................................................................................................... + // smlabt r5, r5, r12, r0 // ..........................................................*................................................................................................................................................................................................................................................... + // usub16 r7, r7, r11 // .....................................................................................*........................................................................................................................................................................................................................ + // smulwb r11, r14, r10 // ..............................................................*............................................................................................................................................................................................................................................... + // pkhtb r3, r5, r3, asr #16 // .............................................................*................................................................................................................................................................................................................................................ + // smulwt r10, r14, r10 // ............................................................*................................................................................................................................................................................................................................................. + // vmov r5, s14 // .......................................................................................*...................................................................................................................................................................................................................... + // smlabt r14, r11, r12, r0 // ..................................................................*........................................................................................................................................................................................................................................... + // uadd16 r11, r8, r3 // ...................................................................*.......................................................................................................................................................................................................................................... + // smlabt r10, r10, r12, r0 // ................................................................*............................................................................................................................................................................................................................................. + // usub16 r3, r8, r3 // ...............................................................*.............................................................................................................................................................................................................................................. + // smulwb r8, r5, r7 // ..........................................................................................*................................................................................................................................................................................................................... + // pkhtb r10, r10, r14, asr #16 // ......................................................................*....................................................................................................................................................................................................................................... + // smulwt r14, r5, r7 // ............................................................................................*................................................................................................................................................................................................................. + // vmov r7, s13 // ...................................................................................................................*.......................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ..............................................................................................*............................................................................................................................................................................................................... + // usub16 r5, r4, r10 // ...............................................................................*.............................................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ................................................................................................*............................................................................................................................................................................................................. + // uadd16 r4, r4, r10 // .................................................................................*............................................................................................................................................................................................................................ + // pkhtb r14, r14, r8, asr #16 // ...................................................................................................*.......................................................................................................................................................................................................... + // smulwb r10, r7, r2 // ......................................................................................................................*....................................................................................................................................................................................... + // vmov r8, s16 // .................................................................*............................................................................................................................................................................................................................................ + // smulwt r2, r7, r2 // ........................................................................................................................*..................................................................................................................................................................................... + // smulwt r7, r8, r3 // .....................................................................*........................................................................................................................................................................................................................................ + // smulwb r3, r8, r3 // ....................................................................*......................................................................................................................................................................................................................................... + // uadd16 r8, r9, r14 // .....................................................................................................*........................................................................................................................................................................................................ + // smlabt r7, r7, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + // usub16 r14, r9, r14 // .................................................................................................................*............................................................................................................................................................................................ + // smlabt r9, r3, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... + // smlabt r3, r2, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + // vmov r2, s22 // ............................................................................................................................*................................................................................................................................................................................. + // smlabt r10, r10, r12, r0 // ..........................................................................................................................*................................................................................................................................................................................... + // pkhtb r9, r7, r9, asr #16 // ...........................................................................*.................................................................................................................................................................................................................................. + // smulwb r7, r2, r14 // ...............................................................................................................................*.............................................................................................................................................................................. + // vmov s1, r9 // .............................................................................*................................................................................................................................................................................................................................ + // smulwt r14, r2, r14 // .................................................................................................................................*............................................................................................................................................................................ + // vmov r2, s18 // ...........................................................................................*.................................................................................................................................................................................................................. + // smlabt r7, r7, r12, r0 // ...................................................................................................................................*.......................................................................................................................................................................... + // pkhtb r9, r3, r10, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... + // smulwb r10, r2, r5 // ..................................................................................................*........................................................................................................................................................................................................... + // usub16 r3, r6, r9 // ..................................................................................................................................*........................................................................................................................................................................... + // smulwt r2, r2, r5 // ....................................................................................................*......................................................................................................................................................................................................... + // uadd16 r5, r6, r9 // ......................................................................................................................................*....................................................................................................................................................................... + // smlabt r9, r14, r12, r0 // .....................................................................................................................................*........................................................................................................................................................................ + // vmov r14, s21 // .......................................................................................................*...................................................................................................................................................................................................... + // smlabt r10, r10, r12, r0 // ......................................................................................................*....................................................................................................................................................................................................... + // smulwb r6, r14, r8 // .........................................................................................................*.................................................................................................................................................................................................... + // smulwt r8, r14, r8 // ...........................................................................................................*.................................................................................................................................................................................................. + // pkhtb r9, r9, r7, asr #16 // ..........................................................................................................................................*................................................................................................................................................................... + // smlabt r7, r6, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + // vmov r6, s15 // ....................................................................................................................................*......................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ..................................................................................................................*........................................................................................................................................................................................... + // smulwb r14, r6, r11 // .........................................................................................................................................*.................................................................................................................................................................... + // smulwt r11, r6, r11 // .......................................................................................................................................*...................................................................................................................................................................... + // pkhtb r8, r8, r7, asr #16 // .....................................................................................................................*........................................................................................................................................................................................ + // smlabt r7, r14, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + // vmov r14, s19 // ........................................................................................................................................*..................................................................................................................................................................... + // smlabt r11, r11, r12, r0 // .......................................................................................................................................................*...................................................................................................................................................... + // smulwb r6, r14, r5 // ...........................................................................................................................................*.................................................................................................................................................................. + // pkhtb r11, r11, r7, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + // smulwt r14, r14, r5 // .............................................................................................................................................*................................................................................................................................................................ + // vmov s0, r11 // ..............................................................................................................................................................*............................................................................................................................................... + // smlabt r11, r2, r12, r0 // ........................................................................................................*..................................................................................................................................................................................................... + // vmov r2, s17 // ............................................................................................................*................................................................................................................................................................................................. + // pkhtb r5, r11, r10, asr #16 // ..........................................................................................................*................................................................................................................................................................................................... + // vmov r10, s23 // .........................................................................................................................*.................................................................................................................................................................................... + // vmov s3, r5 // ..............................................................................................................*............................................................................................................................................................................................... + // smulwb r5, r2, r4 // ...............................................................................................................*.............................................................................................................................................................................................. + // vmov s7, r9 // ................................................................................................................................................*............................................................................................................................................................. + // smlabt r6, r6, r12, r0 // ...............................................................................................................................................*.............................................................................................................................................................. + // ldr.w r9, [r10, #64] // ..................................................................................................................................................................................................*........................................................................................................... + // smulwt r2, r2, r4 // ................................................................................................................*............................................................................................................................................................................................. + // ldr.w r4, [r10, #320] // ................................................................................................................................................................*............................................................................................................................................. + // smlabt r7, r5, r12, r0 // ....................................................................................................................*......................................................................................................................................................................................... + // vmov s6, r8 // .......................................................................................................................*...................................................................................................................................................................................... + // smlabt r2, r2, r12, r0 // .............................................................................................................................*................................................................................................................................................................................ + // vmov r11, s8 // ..................................................................................................................................................*........................................................................................................................................................... + // smlabt r5, r14, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + // pkhtb r8, r2, r7, asr #16 // ................................................................................................................................*............................................................................................................................................................................. + // smulwb r7, r11, r4 // .....................................................................................................................................................................*........................................................................................................................................ + // vmov r2, s20 // ............................................................................................................................................*................................................................................................................................................................. + // smulwt r4, r11, r4 // ...........................................................................................................................................................................*.................................................................................................................................. + // pkhtb r14, r5, r6, asr #16 // ....................................................................................................................................................*......................................................................................................................................................... + // smulwb r5, r2, r3 // .........................................................................................................................................................*.................................................................................................................................................... + // vmov s2, r8 // ..............................................................................................................................................*............................................................................................................................................................... + // smulwt r3, r2, r3 // ...................................................................................................................................................*.......................................................................................................................................................... + // ldr.w r2, [r10, #256] // ......................................................................................................................................................*....................................................................................................................................................... + // smlabt r8, r5, r12, r0 // .............................................................................................................................................................*................................................................................................................................................ + // ldr.w r5, [r10, #384] // ..........................................................................................................................................................................*................................................................................................................................... + // smlabt r6, r3, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + // movw r0, #24608 // ......................................................................................................................................................................*....................................................................................................................................... + // smlabt r3, r4, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + // ldr.w r4, [r10, #0] // ........................................................................................................................................................................*..................................................................................................................................... + // smlabt r7, r7, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + // pkhtb r6, r6, r8, asr #16 // ..................................................................................................................................................................*........................................................................................................................................... + // smulwb r8, r11, r5 // .................................................................................................................................................................................*............................................................................................................................ + // vmov s5, r6 // ....................................................................................................................................................................*......................................................................................................................................... + // smulwb r6, r11, r2 // ...................................................................................................................................................................*.......................................................................................................................................... + // pkhtb r7, r3, r7, asr #16 // ....................................................................................................................................................................................*......................................................................................................................... + // smulwt r2, r11, r2 // ...........................................................................................................................................................*.................................................................................................................................................. + // uadd16 r3, r9, r7 // ..........................................................................................................................................................................................................*................................................................................................... + // smlabt r6, r6, r12, r0 // .......................................................................................................................................................................*...................................................................................................................................... + // usub16 r7, r9, r7 // ..............................................................................................................................................................................................................*............................................................................................... + // smlabt r2, r2, r12, r0 // .........................................................................................................................................................................*.................................................................................................................................... + // ldr.w r9, [r10, #448] // ............................................................................................................................................................*................................................................................................................................................. + // smulwt r5, r11, r5 // ...................................................................................................................................................................................*.......................................................................................................................... + // pkhtb r6, r2, r6, asr #16 // ............................................................................................................................................................................*................................................................................................................................. + // smlabt r8, r8, r12, r0 // .....................................................................................................................................................................................*........................................................................................................................ + // uadd16 r2, r4, r6 // ..............................................................................................................................................................................*............................................................................................................................... + // usub16 r6, r4, r6 // ................................................................................................................................................................................*............................................................................................................................. + // smulwt r4, r11, r9 // .................................................................................................................................................................*............................................................................................................................................ + // vmov s4, r14 // ........................................................................................................................................................*..................................................................................................................................................... + // smulwb r11, r11, r9 // ..................................................................................................................................................................................*........................................................................................................................... + // ldr.w r14, [r10, #128] // ......................................................................................................................................................................................*....................................................................................................................... + // smlabt r9, r5, r12, r0 // .......................................................................................................................................................................................*...................................................................................................................... + // ldr.w r5, [r10, #192] // ........................................................................................................................................................................................*..................................................................................................................... + // smlabt r10, r11, r12, r0 // .........................................................................................................................................................................................*.................................................................................................................... + // pkhtb r8, r9, r8, asr #16 // ..........................................................................................................................................................................................*................................................................................................................... + // smlabt r9, r4, r12, r0 // ...........................................................................................................................................................................................*.................................................................................................................. + // uadd16 r4, r14, r8 // ................................................................................................................................................................................................*............................................................................................................. + // vmov r11, s9 // ............................................................................................................................................................................................................*................................................................................................. + // pkhtb r9, r9, r10, asr #16 // ..............................................................................................................................................................................................*............................................................................................................... + // smulwb r10, r11, r4 // ......................................................................................................................................................................................................................*....................................................................................... + // usub16 r14, r14, r8 // ............................................................................................................................................................................................*................................................................................................................. + // smulwt r4, r11, r4 // ........................................................................................................................................................................................................................*..................................................................................... + // usub16 r8, r5, r9 // ....................................................................................................................................................................................................*......................................................................................................... + // smlabt r10, r10, r12, r0 // ..........................................................................................................................................................................................................................*................................................................................... + // uadd16 r5, r5, r9 // ......................................................................................................................................................................................................*....................................................................................................... + // smlabt r4, r4, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + // smulwb r9, r11, r5 // ...............................................................................................................................................................................................................*.............................................................................................. + // pkhtb r10, r4, r10, asr #16 // ...............................................................................................................................................................................................................................*.............................................................................. + // smulwt r5, r11, r5 // .................................................................................................................................................................................................................*............................................................................................ + // usub16 r11, r2, r10 // .................................................................................................................................................................................................................................*............................................................................ + // smlabt r9, r9, r12, r0 // ...................................................................................................................................................................................................................*.......................................................................................... + // vmov r4, s10 // .............................................................................................................................................................................................*................................................................................................................ + // smlabt r5, r5, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ + // uadd16 r2, r2, r10 // ...................................................................................................................................................................................................................................*.......................................................................... + // smulwb r10, r4, r14 // ...............................................................................................................................................................................................*.............................................................................................................. + // pkhtb r5, r5, r9, asr #16 // .......................................................................................................................................................................................................................*...................................................................................... + // smulwt r14, r4, r14 // .................................................................................................................................................................................................*............................................................................................................ + // usub16 r9, r3, r5 // .........................................................................................................................................................................................................................*.................................................................................... + // smlabt r10, r10, r12, r0 // ...................................................................................................................................................................................................*.......................................................................................................... + // uadd16 r3, r3, r5 // .............................................................................................................................................................................................................................*................................................................................ + // smlabt r5, r14, r12, r0 // .....................................................................................................................................................................................................*........................................................................................................ + // smulwb r14, r4, r8 // .......................................................................................................................................................................................................*...................................................................................................... + // pkhtb r5, r5, r10, asr #16 // ........................................................................................................................................................................................................*..................................................................................................... + // smulwt r4, r4, r8 // .........................................................................................................................................................................................................*.................................................................................................... + // usub16 r8, r6, r5 // ..................................................................................................................................................................................................................*........................................................................................... + // smlabt r14, r14, r12, r0 // ...........................................................................................................................................................................................................*.................................................................................................. + // vmov r10, s12 // ...........................................................................................................................................................................................................................*.................................................................................. + // smlabt r4, r4, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + // uadd16 r6, r6, r5 // ....................................................................................................................................................................................................................*......................................................................................... + // smulwb r5, r10, r9 // ................................................................................................................................................................................................................................*............................................................................. + // pkhtb r14, r4, r14, asr #16 // ................................................................................................................................................................................................................*............................................................................................. + // usub16 r4, r7, r14 // .....................................................................................................................................................................................................................................*........................................................................ + // smulwt r10, r10, r9 // ..............................................................................................................................................................................................................................*............................................................................... + // uadd16 r9, r7, r14 // .......................................................................................................................................................................................................................................*...................................................................... + // smlabt r5, r5, r12, r0 // ....................................................................................................................................................................................................................................*......................................................................... + // vmov r7, s14 // ......................................................................................................................................................................................................................................*....................................................................... + // smlabt r14, r10, r12, r0 // ..................................................................................................................................................................................................................................*........................................................................... + // smulwb r10, r7, r4 // ........................................................................................................................................................................................................................................*..................................................................... + // pkhtb r14, r14, r5, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... + // smulwt r4, r7, r4 // ..........................................................................................................................................................................................................................................*................................................................... + // usub16 r5, r11, r14 // ...............................................................................................................................................................................................................................................*.............................................................. + // smlabt r7, r10, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + // vmov r10, s11 // .............................................................................................................................................................................................................................................*................................................................ + // smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + // uadd16 r14, r11, r14 // ...........................................................................................................................................................................................................................................*.................................................................. + // smulwb r11, r10, r3 // ................................................................................................................................................................................................................................................*............................................................. + // pkhtb r7, r4, r7, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + // smulwt r3, r10, r3 // ..................................................................................................................................................................................................................................................*........................................................... + // usub16 r10, r8, r7 // ...................................................................................................................................................................................................................................................*.......................................................... + // smlabt r4, r11, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + // vmov r11, s13 // .....................................................................................................................................................................................................................................................*........................................................ + // smlabt r3, r3, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + // uadd16 r8, r8, r7 // .......................................................................................................................................................................................................................................................*...................................................... + // smulwb r7, r11, r9 // ........................................................................................................................................................................................................................................................*..................................................... + // pkhtb r4, r3, r4, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... + // smulwt r3, r11, r9 // ..........................................................................................................................................................................................................................................................*................................................... + // vmov r9, s2 // ...........................................................................................................................................................................................................................................................*.................................................. + // smlabt r11, r7, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. + // usub16 r7, r14, r9 // .............................................................................................................................................................................................................................................................*................................................ + // smlabt r0, r3, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + // uadd16 r3, r14, r9 // ...............................................................................................................................................................................................................................................................*.............................................. + // vmov r14, s5 // ................................................................................................................................................................................................................................................................*............................................. + // pkhtb r11, r0, r11, asr #16 // .................................................................................................................................................................................................................................................................*............................................ + // vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... + // usub16 r9, r2, r4 // ...................................................................................................................................................................................................................................................................*.......................................... + // str.w r7, [r0, #160] // ....................................................................................................................................................................................................................................................................*......................................... + // uadd16 r2, r2, r4 // ..........................................................................................................................................................................................................................................................................*................................... + // vmov r7, s4 // .......................................................................................................................................................................................................................................................................................*...................... + // usub16 r4, r6, r11 // .....................................................................................................................................................................................................................................................................*........................................ + // str.w r3, [r0, #128] // ......................................................................................................................................................................................................................................................................*....................................... + // usub16 r3, r4, r14 // .........................................................................................................................................................................................................................................................................*.................................... + // str.w r3, [r0, #352] // ...........................................................................................................................................................................................................................................................................*.................................. + // uadd16 r3, r6, r11 // .......................................................................................................................................................................................................................................................................*...................................... + // vmov r11, s7 // ........................................................................................................................................................................................................................................................................*..................................... + // uadd16 r6, r4, r14 // ............................................................................................................................................................................................................................................................................*................................. + // str.w r6, [r0, #320] // .............................................................................................................................................................................................................................................................................*................................ + // uadd16 r4, r10, r11 // ..............................................................................................................................................................................................................................................................................*............................... + // vmov r14, s24 // ...............................................................................................................................................................................................................................................................................*.............................. + // str.w r4, [r0, #448] // ..................................................................................................................................................................................................................................................................................*........................... + // usub16 r4, r10, r11 // ................................................................................................................................................................................................................................................................................*............................. + // vmov r6, s3 // ...................................................................................................................................................................................................................................................................................*.......................... + // uadd16 r10, r5, r6 // ....................................................................................................................................................................................................................................................................................*......................... + // str.w r10, [r0, #192] // .....................................................................................................................................................................................................................................................................................*........................ + // usub16 r11, r5, r6 // ......................................................................................................................................................................................................................................................................................*....................... + // str.w r11, [r0, #224] // ...........................................................................................................................................................................................................................................................................................................*.. + // usub16 r6, r3, r7 // ..........................................................................................................................................................................................................................................................................................*................... + // str.w r6, [r0, #288] // ...........................................................................................................................................................................................................................................................................................*.................. + // vmov r10, s6 // .......................................................................................................................................................................................................................................................................................................*...... + // str.w r4, [r0, #480] // .................................................................................................................................................................................................................................................................................*............................ + // usub16 r4, r8, r10 // ..........................................................................................................................................................................................................................................................................................................*... + // str.w r4, [r0, #416] // ............................................................................................................................................................................................................................................................................................................*. + // uadd16 r5, r3, r7 // ........................................................................................................................................................................................................................................................................................*..................... + // vmov r3, s1 // .............................................................................................................................................................................................................................................................................................*................ + // uadd16 r7, r9, r3 // ..............................................................................................................................................................................................................................................................................................*............... + // str.w r7, [r0, #64] // ...............................................................................................................................................................................................................................................................................................*.............. + // usub16 r9, r9, r3 // ................................................................................................................................................................................................................................................................................................*............. + // vmov r3, s0 // .........................................................................................................................................................................................................................................................................................*.................... + // uadd16 r4, r2, r3 // ............................................................................................................................................................................................................................................................................................*................. + // str.w r9, [r0, #96] // .....................................................................................................................................................................................................................................................................................................*........ + // usub16 r7, r2, r3 // ....................................................................................................................................................................................................................................................................................................*......... + // str.w r4, [r0], #4 // ..................................................................................................................................................................................................................................................................................................*........... + // cmp.w r0, r14 // ...................................................................................................................................................................................................................................................................................................*.......... + // str.w r5, [r0, #252] // .................................................................................................................................................................................................................................................................................................*............ + // uadd16 r2, r8, r10 // ........................................................................................................................................................................................................................................................................................................*..... + // str.w r2, [r0, #380] // .........................................................................................................................................................................................................................................................................................................*.... + // str.w r7, [r0, #28] // ......................................................................................................................................................................................................................................................................................................*....... + // bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* + + + + // ----------------------------------------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... + // ldr.w r2, [r0, #32] // ..........*................................................................................................................................................................................................................................................................................................... + // ldr.w r3, [r0, #256/4+32] // .*............................................................................................................................................................................................................................................................................................................ + // ldr.w r4, [r0, #2*256/4+32] // ............*................................................................................................................................................................................................................................................................................................. + // ldr.w r5, [r0, #3*256/4+32] // *............................................................................................................................................................................................................................................................................................................. + // ldr.w r6, [r0, #256+32] // ........*..................................................................................................................................................................................................................................................................................................... + // ldr.w r7, [r0, #5*256/4+32] // ....*......................................................................................................................................................................................................................................................................................................... + // ldr.w r8, [r0, #6*256/4+32] // ......*....................................................................................................................................................................................................................................................................................................... + // ldr.w r9, [r0, #7*256/4+32] // .....*........................................................................................................................................................................................................................................................................................................ + // movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... + // vmov r10, s8 // ...*.......................................................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r6 // ....................*......................................................................................................................................................................................................................................................................................... + // smulwt r6, r10, r6 // ......................*....................................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // ........................*..................................................................................................................................................................................................................................................................................... + // smlabt r6, r6, r12, r0 // ..........................*................................................................................................................................................................................................................................................................................... + // pkhtb r14, r6, r14, asr #16 // ............................*................................................................................................................................................................................................................................................................................. + // usub16 r6, r2, r14 // ..............................*............................................................................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ................................*............................................................................................................................................................................................................................................................................. + // smulwb r14, r10, r7 // .............*................................................................................................................................................................................................................................................................................................ + // smulwt r7, r10, r7 // .......*...................................................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // smlabt r7, r7, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... + // pkhtb r14, r7, r14, asr #16 // .....................*........................................................................................................................................................................................................................................................................................ + // usub16 r7, r3, r14 // .......................*...................................................................................................................................................................................................................................................................................... + // uadd16 r3, r3, r14 // .........................*.................................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r8 // ...........................*.................................................................................................................................................................................................................................................................................. + // smulwt r8, r10, r8 // .............................*................................................................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................*.............................................................................................................................................................................................................................................................................. + // smlabt r8, r8, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + // pkhtb r14, r8, r14, asr #16 // ....................................*......................................................................................................................................................................................................................................................................... + // usub16 r8, r4, r14 // ........................................*..................................................................................................................................................................................................................................................................... + // uadd16 r4, r4, r14 // ..........................................*................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r9 // .........*.................................................................................................................................................................................................................................................................................................... + // smulwt r9, r10, r9 // ...........*.................................................................................................................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + // smlabt r9, r9, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + // pkhtb r14, r9, r14, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... + // usub16 r9, r5, r14 // ......................................*....................................................................................................................................................................................................................................................................... + // uadd16 r5, r5, r14 // ..................................*........................................................................................................................................................................................................................................................................... + // vmov r10, s9 // ...................................*.......................................................................................................................................................................................................................................................................... + // vmov r11, s10 // .................................................*............................................................................................................................................................................................................................................................ + // smulwb r14, r10, r4 // ..............................................*............................................................................................................................................................................................................................................................... + // smulwt r4, r10, r4 // ............................................*................................................................................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + // smlabt r4, r4, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + // pkhtb r14, r4, r14, asr #16 // ......................................................*....................................................................................................................................................................................................................................................... + // usub16 r4, r2, r14 // ..........................................................*................................................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ........................................................*..................................................................................................................................................................................................................................................... + // smulwb r14, r10, r5 // .....................................*........................................................................................................................................................................................................................................................................ + // smulwt r5, r10, r5 // .......................................*...................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .........................................*.................................................................................................................................................................................................................................................................... + // smlabt r5, r5, r12, r0 // ...........................................*.................................................................................................................................................................................................................................................................. + // pkhtb r14, r5, r14, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + // usub16 r5, r3, r14 // ...................................................*.......................................................................................................................................................................................................................................................... + // uadd16 r3, r3, r14 // ...............................................*.............................................................................................................................................................................................................................................................. + // smulwb r14, r11, r8 // ....................................................*......................................................................................................................................................................................................................................................... + // smulwt r8, r11, r8 // .....................................................*........................................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................*...................................................................................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // .........................................................*.................................................................................................................................................................................................................................................... + // pkhtb r14, r8, r14, asr #16 // ............................................................*................................................................................................................................................................................................................................................. + // usub16 r8, r6, r14 // ..............................................................*............................................................................................................................................................................................................................................... + // uadd16 r6, r6, r14 // ..................................................................*........................................................................................................................................................................................................................................... + // smulwb r14, r11, r9 // ...........................................................*.................................................................................................................................................................................................................................................. + // smulwt r9, r11, r9 // .............................................................*................................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................*.............................................................................................................................................................................................................................................. + // smlabt r9, r9, r12, r0 // .................................................................*............................................................................................................................................................................................................................................ + // pkhtb r14, r9, r14, asr #16 // ....................................................................*......................................................................................................................................................................................................................................... + // usub16 r9, r7, r14 // ..........................................................................*................................................................................................................................................................................................................................... + // uadd16 r7, r7, r14 // ........................................................................*..................................................................................................................................................................................................................................... + // vmov r10, s11 // ................................................................*............................................................................................................................................................................................................................................. + // vmov r11, s12 // ......................................................................*....................................................................................................................................................................................................................................... + // smulwb r14, r10, r3 // ...................................................................*.......................................................................................................................................................................................................................................... + // smulwt r3, r10, r3 // .....................................................................*........................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... + // smlabt r3, r3, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + // pkhtb r14, r3, r14, asr #16 // ............................................................................*................................................................................................................................................................................................................................. + // usub16 r3, r2, r14 // ..................................................................................*........................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ................................................................................*............................................................................................................................................................................................................................. + // smulwb r14, r11, r5 // ...........................................................................*.................................................................................................................................................................................................................................. + // smulwt r5, r11, r5 // .............................................................................*................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................................*.............................................................................................................................................................................................................................. + // smlabt r5, r5, r12, r0 // .................................................................................*............................................................................................................................................................................................................................ + // pkhtb r14, r5, r14, asr #16 // ....................................................................................*......................................................................................................................................................................................................................... + // usub16 r5, r4, r14 // ........................................................................................*..................................................................................................................................................................................................................... + // uadd16 r4, r4, r14 // ..........................................................................................*................................................................................................................................................................................................................... + // vmov r10, s13 // ......................................................................................*....................................................................................................................................................................................................................... + // vmov r11, s14 // ..............................................................................*............................................................................................................................................................................................................................... + // smulwb r14, r10, r7 // ............................................................................................*................................................................................................................................................................................................................. + // smulwt r7, r10, r7 // ..............................................................................................*............................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .......................................................................................................*...................................................................................................................................................................................................... + // smlabt r7, r7, r12, r0 // .....................................................................................................*........................................................................................................................................................................................................ + // pkhtb r14, r7, r14, asr #16 // ..............................................................................................................*............................................................................................................................................................................................... + // usub16 r7, r6, r14 // ................................................................................................................*............................................................................................................................................................................................. + // uadd16 r6, r6, r14 // ..................................................................................................................*........................................................................................................................................................................................... + // smulwb r14, r11, r9 // ...................................................................................*.......................................................................................................................................................................................................................... + // smulwt r9, r11, r9 // .....................................................................................*........................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................................................*...................................................................................................................................................................................................................... + // smlabt r9, r9, r12, r0 // .........................................................................................*.................................................................................................................................................................................................................... + // pkhtb r14, r9, r14, asr #16 // ...........................................................................................*.................................................................................................................................................................................................................. + // usub16 r9, r8, r14 // ...................................................................................................*.......................................................................................................................................................................................................... + // uadd16 r8, r8, r14 // .................................................................................................*............................................................................................................................................................................................................ + // vmov r10, s15 // ..........................................................................................................................*................................................................................................................................................................................... + // vmov r11, s16 // .............................................................................................*................................................................................................................................................................................................................ + // smulwb r14, r10, r2 // ............................................................................................................................*................................................................................................................................................................................. + // smulwt r2, r10, r2 // .............................................................................................................................*................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................................................................................*.............................................................................................................................................................................. + // smlabt r2, r2, r12, r0 // .................................................................................................................................*............................................................................................................................................................................ + // pkhtb r2, r2, r14, asr #16 // ...................................................................................................................................*.......................................................................................................................................................................... + // smulwb r14, r11, r3 // ................................................................................................*............................................................................................................................................................................................................. + // smulwt r3, r11, r3 // ...............................................................................................*.............................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ....................................................................................................*......................................................................................................................................................................................................... + // smlabt r3, r3, r12, r0 // ..................................................................................................*........................................................................................................................................................................................................... + // pkhtb r3, r3, r14, asr #16 // ........................................................................................................*..................................................................................................................................................................................................... + // vmov r10, s17 // .......................................................................................................................................*...................................................................................................................................................................... + // vmov r11, s18 // ............................................................................................................*................................................................................................................................................................................................. + // smulwb r14, r10, r4 // ...........................................................................................................................................*.................................................................................................................................................................. + // smulwt r4, r10, r4 // ...............................................................................................................................................*.............................................................................................................................................................. + // smlabt r14, r14, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + // smlabt r4, r4, r12, r0 // ...................................................................................................................................................*.......................................................................................................................................................... + // pkhtb r4, r4, r14, asr #16 // ......................................................................................................................................................*....................................................................................................................................................... + // smulwb r14, r11, r5 // ...............................................................................................................*.............................................................................................................................................................................................. + // smulwt r5, r11, r5 // .................................................................................................................*............................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .....................................................................................................................*........................................................................................................................................................................................ + // smlabt r5, r5, r12, r0 // ......................................................................................................................................*....................................................................................................................................................................... + // pkhtb r5, r5, r14, asr #16 // ........................................................................................................................................*..................................................................................................................................................................... + // vmov r10, s19 // ................................................................................................................................*............................................................................................................................................................................. + // vmov r11, s20 // ........................................................................................................................................................*..................................................................................................................................................... + // smulwb r14, r10, r6 // ..................................................................................................................................*........................................................................................................................................................................... + // smulwt r6, r10, r6 // ....................................................................................................................................*......................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .............................................................................................................................................*................................................................................................................................................................ + // smlabt r6, r6, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + // pkhtb r6, r6, r14, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + // smulwb r14, r11, r7 // ...........................................................................................................................................................*.................................................................................................................................................. + // smulwt r7, r11, r7 // .............................................................................................................................................................*................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + // smlabt r7, r7, r12, r0 // .................................................................................................................................................................*............................................................................................................................................ + // pkhtb r7, r7, r14, asr #16 // ......................................................................................................................................................................*....................................................................................................................................... + // vmov r10, s21 // ....................................................................................................................*......................................................................................................................................................................................... + // vmov r11, s22 // ......................................................................................................*....................................................................................................................................................................................................... + // smulwb r14, r10, r8 // ......................................................................................................................*....................................................................................................................................................................................... + // smulwt r8, r10, r8 // .......................................................................................................................*...................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .........................................................................................................................*.................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + // pkhtb r8, r8, r14, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... + // smulwb r14, r11, r9 // .........................................................................................................*.................................................................................................................................................................................................... + // smulwt r9, r11, r9 // ...........................................................................................................*.................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + // smlabt r9, r9, r12, r0 // ...................................................................................................................*.......................................................................................................................................................................................... + // pkhtb r9, r9, r14, asr #16 // ........................................................................................................................*..................................................................................................................................................................................... + // vmov s0, r2 // .....................................................................................................................................*........................................................................................................................................................................ + // vmov s1, r3 // ..........................................................................................................*................................................................................................................................................................................................... + // vmov s2, r4 // ............................................................................................................................................................*................................................................................................................................................. + // vmov s3, r5 // ..........................................................................................................................................*................................................................................................................................................................... + // vmov s4, r6 // .......................................................................................................................................................................................*...................................................................................................................... + // vmov s5, r7 // ........................................................................................................................................................................*..................................................................................................................................... + // vmov s6, r8 // ..................................................................................................................................................*........................................................................................................................................................... + // vmov s7, r9 // ............................................................................................................................................*................................................................................................................................................................. + // vmov r0, s23 // .........................................................................................................................................*.................................................................................................................................................................... + // ldr.w r2, [r0, #0] // ....................................................................................................................................................................*......................................................................................................................................... + // ldr.w r3, [r0, #256/4] // ..............................................................................................................................................*............................................................................................................................................................... + // ldr.w r4, [r0, #2*256/4] // .........................................................................................................................................................................................*.................................................................................................................... + // ldr.w r5, [r0, #3*256/4] // ...........................................................................................................................................................................................*.................................................................................................................. + // ldr.w r6, [r0, #256] // ..............................................................................................................................................................*............................................................................................................................................... + // ldr.w r7, [r0, #5*256/4] // ................................................................................................................................................*............................................................................................................................................................. + // ldr.w r8, [r0, #6*256/4] // ................................................................................................................................................................*............................................................................................................................................. + // ldr.w r9, [r0, #7*256/4] // ................................................................................................................................................................................*............................................................................................................................. + // movw r0, #24608 // ..................................................................................................................................................................*........................................................................................................................................... + // vmov r10, s8 // ....................................................................................................................................................*......................................................................................................................................................... + // smulwb r14, r10, r6 // .........................................................................................................................................................................*.................................................................................................................................... + // smulwt r6, r10, r6 // ...........................................................................................................................................................................*.................................................................................................................................. + // smlabt r14, r14, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + // smlabt r6, r6, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + // pkhtb r14, r6, r14, asr #16 // ..................................................................................................................................................................................*........................................................................................................................... + // usub16 r6, r2, r14 // .....................................................................................................................................................................................*........................................................................................................................ + // uadd16 r2, r2, r14 // ....................................................................................................................................................................................*......................................................................................................................... + // smulwb r14, r10, r7 // .......................................................................................................................................................*...................................................................................................................................................... + // smulwt r7, r10, r7 // .........................................................................................................................................................*.................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................*........................................................................................................................................ + // smlabt r7, r7, r12, r0 // ...................................................................................................................................................................*.......................................................................................................................................... + // pkhtb r14, r7, r14, asr #16 // ..........................................................................................................................................................................*................................................................................................................................... + // usub16 r7, r3, r14 // ..............................................................................................................................................................................*............................................................................................................................... + // uadd16 r3, r3, r14 // ............................................................................................................................................................................*................................................................................................................................. + // smulwb r14, r10, r8 // .......................................................................................................................................................................*...................................................................................................................................... + // smulwt r8, r10, r8 // .................................................................................................................................................................................*............................................................................................................................ + // smlabt r14, r14, r12, r0 // ...................................................................................................................................................................................*.......................................................................................................................... + // smlabt r8, r8, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................... + // pkhtb r14, r8, r14, asr #16 // .............................................................................................................................................................................................*................................................................................................................ + // usub16 r8, r4, r14 // ...................................................................................................................................................................................................*.......................................................................................................... + // uadd16 r4, r4, r14 // ...............................................................................................................................................................................................*.............................................................................................................. + // smulwb r14, r10, r9 // ........................................................................................................................................................................................*..................................................................................................................... + // smulwt r9, r10, r9 // ......................................................................................................................................................................................*....................................................................................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................*................................................................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................... + // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................*............................................................................................................ + // usub16 r9, r5, r14 // .....................................................................................................................................................................................................*........................................................................................................ + // uadd16 r5, r5, r14 // .......................................................................................................................................................................................................*...................................................................................................... + // vmov r10, s9 // ................................................................................................................................................................................................*............................................................................................................. + // vmov r11, s10 // ..............................................................................................................................................................................................................*............................................................................................... + // smulwb r14, r10, r4 // ..................................................................................................................................................................................................*........................................................................................................... + // smulwt r4, r10, r4 // ....................................................................................................................................................................................................*......................................................................................................... + // smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................*....................................................................................................... + // smlabt r4, r4, r12, r0 // ........................................................................................................................................................................................................*..................................................................................................... + // pkhtb r14, r4, r14, asr #16 // ..........................................................................................................................................................................................................*................................................................................................... + // usub16 r4, r2, r14 // ............................................................................................................................................................................................................*................................................................................................. + // uadd16 r2, r2, r14 // ................................................................................................................................................................................................................*............................................................................................. + // smulwb r14, r10, r5 // .........................................................................................................................................................................................................*.................................................................................................... + // smulwt r5, r10, r5 // ...........................................................................................................................................................................................................*.................................................................................................. + // smlabt r14, r14, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + // smlabt r5, r5, r12, r0 // ...............................................................................................................................................................................................................*.............................................................................................. + // pkhtb r14, r5, r14, asr #16 // ..................................................................................................................................................................................................................*........................................................................................... + // usub16 r5, r3, r14 // ....................................................................................................................................................................................................................*......................................................................................... + // uadd16 r3, r3, r14 // ......................................................................................................................................................................................................................*....................................................................................... + // smulwb r14, r11, r8 // .................................................................................................................................................................................................................*............................................................................................ + // smulwt r8, r11, r8 // ...................................................................................................................................................................................................................*.......................................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ + // smlabt r8, r8, r12, r0 // .......................................................................................................................................................................................................................*...................................................................................... + // pkhtb r14, r8, r14, asr #16 // .........................................................................................................................................................................................................................*.................................................................................... + // usub16 r8, r6, r14 // ...........................................................................................................................................................................................................................*.................................................................................. + // uadd16 r6, r6, r14 // ...............................................................................................................................................................................................................................*.............................................................................. + // smulwb r14, r11, r9 // ........................................................................................................................................................................................................................*..................................................................................... + // smulwt r9, r11, r9 // ..........................................................................................................................................................................................................................*................................................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................... + // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................................................*............................................................................ + // usub16 r9, r7, r14 // ..................................................................................................................................................................................................................................*........................................................................... + // uadd16 r7, r7, r14 // ....................................................................................................................................................................................................................................*......................................................................... + // vmov r10, s11 // .............................................................................................................................................................................................................................................*................................................................ + // vmov r11, s12 // .............................................................................................................................................................................................................................*................................................................................ + // smulwb r14, r10, r3 // ................................................................................................................................................................................................................................................*............................................................. + // smulwt r3, r10, r3 // ..................................................................................................................................................................................................................................................*........................................................... + // smlabt r14, r14, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + // smlabt r3, r3, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + // pkhtb r14, r3, r14, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... + // usub16 r3, r2, r14 // ...................................................................................................................................................................................................................................................................*.......................................... + // uadd16 r2, r2, r14 // .....................................................................................................................................................................................................................................................................*........................................ + // smulwb r14, r11, r5 // ................................................................................................................................................................................................................................*............................................................................. + // smulwt r5, r11, r5 // ...................................................................................................................................................................................................................................*.......................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................................................................................*........................................................................ + // smlabt r5, r5, r12, r0 // .......................................................................................................................................................................................................................................*...................................................................... + // pkhtb r14, r5, r14, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... + // usub16 r5, r4, r14 // ...........................................................................................................................................................................................................................................*.................................................................. + // uadd16 r4, r4, r14 // ...............................................................................................................................................................................................................................................*.............................................................. + // vmov r10, s13 // .....................................................................................................................................................................................................................................................*........................................................ + // vmov r11, s14 // ......................................................................................................................................................................................................................................*....................................................................... + // smulwb r14, r10, r7 // ........................................................................................................................................................................................................................................................*..................................................... + // smulwt r7, r10, r7 // ..........................................................................................................................................................................................................................................................*................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. + // smlabt r7, r7, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + // pkhtb r14, r7, r14, asr #16 // .................................................................................................................................................................................................................................................................*............................................ + // usub16 r7, r6, r14 // .......................................................................................................................................................................................................................................................................*...................................... + // uadd16 r6, r6, r14 // ...........................................................................................................................................................................................................................................................................*.................................. + // smulwb r14, r11, r9 // ........................................................................................................................................................................................................................................*..................................................................... + // smulwt r9, r11, r9 // ..........................................................................................................................................................................................................................................*................................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + // usub16 r9, r8, r14 // ...................................................................................................................................................................................................................................................*.......................................................... + // uadd16 r8, r8, r14 // .......................................................................................................................................................................................................................................................*...................................................... + // vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... + // vmov r10, s1 // ...............................................................................................................................................................................................................................................................................................*.............. + // uadd16 r14, r3, r10 // ................................................................................................................................................................................................................................................................................................*............. + // usub16 r3, r3, r10 // ..................................................................................................................................................................................................................................................................................................*........... + // str.w r14, [r0, #1*256/4] // .................................................................................................................................................................................................................................................................................................*............ + // str.w r3, [r0, #1*256/4+32] // .....................................................................................................................................................................................................................................................................................................*........ + // vmov r10, s3 // ...................................................................................................................................................................................................................................................................................*.......................... + // uadd16 r14, r5, r10 // ....................................................................................................................................................................................................................................................................................*......................... + // usub16 r5, r5, r10 // ......................................................................................................................................................................................................................................................................................*....................... + // str.w r14, [r0, #3*256/4] // .....................................................................................................................................................................................................................................................................................*........................ + // str.w r5, [r0, #3*256/4+32] // .......................................................................................................................................................................................................................................................................................*...................... + // vmov r10, s5 // ................................................................................................................................................................................................................................................................*............................................. + // uadd16 r14, r7, r10 // .............................................................................................................................................................................................................................................................................*................................ + // usub16 r7, r7, r10 // .........................................................................................................................................................................................................................................................................*.................................... + // str.w r14, [r0, #5*256/4] // ..............................................................................................................................................................................................................................................................................*............................... + // str.w r7, [r0, #5*256/4+32] // ..........................................................................................................................................................................................................................................................................*................................... + // vmov r10, s7 // ............................................................................................................................................................................................................................................................................*................................. + // uadd16 r14, r9, r10 // ...............................................................................................................................................................................................................................................................................*.............................. + // usub16 r9, r9, r10 // ..................................................................................................................................................................................................................................................................................*........................... + // str.w r14, [r0, #7*256/4] // .................................................................................................................................................................................................................................................................................*............................ + // str.w r9, [r0, #7*256/4+32] // ...........................................................................................................................................................................................................................................................................................*.................. + // vmov r5, s2 // ...........................................................................................................................................................................................................................................................*.................................................. + // uadd16 r14, r4, r5 // ...............................................................................................................................................................................................................................................................*.............................................. + // usub16 r10, r4, r5 // .............................................................................................................................................................................................................................................................*................................................ + // str.w r14, [r0, #2*256/4] // ........................................................................................................................................................................................................................................................................*..................................... + // str.w r10, [r0, #2*256/4+32] // ....................................................................................................................................................................................................................................................................*......................................... + // vmov r7, s4 // ......................................................................................................................................................................................................................................................................*....................................... + // uadd16 r14, r6, r7 // ..............................................................................................................................................................................................................................................................................................*............... + // usub16 r10, r6, r7 // ........................................................................................................................................................................................................................................................................................*..................... + // str.w r14, [r0, #4*256/4] // .........................................................................................................................................................................................................................................................................................................*.... + // str.w r10, [r0, #4*256/4+32] // .........................................................................................................................................................................................................................................................................................*.................... + // vmov r9, s6 // ..........................................................................................................................................................................................................................................................................................*................... + // uadd16 r14, r8, r9 // ..........................................................................................................................................................................................................................................................................................................*... + // usub16 r10, r8, r9 // ............................................................................................................................................................................................................................................................................................*................. + // str.w r14, [r0, #6*256/4] // ...........................................................................................................................................................................................................................................................................................................*.. + // str.w r10, [r0, #6*256/4+32] // .............................................................................................................................................................................................................................................................................................*................ + // vmov r3, s0 // ...................................................................................................................................................................................................................................................................................................*.......... + // uadd16 r14, r2, r3 // ....................................................................................................................................................................................................................................................................................................*......... + // usub16 r10, r2, r3 // ......................................................................................................................................................................................................................................................................................................*....... + // str.w r10, [r0, #32] // ............................................................................................................................................................................................................................................................................................................*. + // str.w r14, [r0], #4 // .......................................................................................................................................................................................................................................................................................................*...... + // vmov r14, s24 // ................................................................................................................................................................................................................................................................................*............................. + // cmp.w r0, r14 // ........................................................................................................................................................................................................................................................................................................*..... + // bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* - // ---------------------------------------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - // vmov r14, s8 // *.......................................................................................................................................................................................................................................................................................................... - // ldr.w r6, [r0, #352] // .*......................................................................................................................................................................................................................................................................................................... - // vmov s23, r0 // ...*....................................................................................................................................................................................................................................................................................................... - // ldr.w r7, [r0, #416] // ..........*................................................................................................................................................................................................................................................................................................ - // ldr.w r5, [r0, #96] // ..*........................................................................................................................................................................................................................................................................................................ - // smulwb r2, r14, r6 // .....*..................................................................................................................................................................................................................................................................................................... - // ldr.w r1, [r0, #160] // ............*.............................................................................................................................................................................................................................................................................................. - // smulwb r11, r14, r7 // ........................*.................................................................................................................................................................................................................................................................................. - // ldr.w r10, [r0, #288] // ....*...................................................................................................................................................................................................................................................................................................... - // smulwt r7, r14, r7 // ..........................*................................................................................................................................................................................................................................................................................ - // ldr.w r3, [r0, #224] // ..............*............................................................................................................................................................................................................................................................................................ - // smulwt r4, r14, r6 // .......*................................................................................................................................................................................................................................................................................................... - // ldr.w r8, [r0, #32] // ........*.................................................................................................................................................................................................................................................................................................. - // smulwt r6, r14, r10 // ...............*........................................................................................................................................................................................................................................................................................... - // ldr.w r9, [r0, #480] // ......*.................................................................................................................................................................................................................................................................................................... - // smulwb r10, r14, r10 // .........*................................................................................................................................................................................................................................................................................................. - // movw r0, #24608 // ................*.......................................................................................................................................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ................................*.......................................................................................................................................................................................................................................................................... - // smlabt r7, r7, r12, r0 // ..............................*............................................................................................................................................................................................................................................................................ - // smlabt r4, r4, r12, r0 // ............................*.............................................................................................................................................................................................................................................................................. - // pkhtb r11, r7, r11, asr #16 // .....................................*..................................................................................................................................................................................................................................................................... - // smlabt r7, r10, r12, r0 // ...................*....................................................................................................................................................................................................................................................................................... - // usub16 r10, r1, r11 // .........................................*................................................................................................................................................................................................................................................................. - // smlabt r6, r6, r12, r0 // ....................*...................................................................................................................................................................................................................................................................................... - // uadd16 r11, r1, r11 // .......................................*................................................................................................................................................................................................................................................................... - // smlabt r2, r2, r12, r0 // ......................*.................................................................................................................................................................................................................................................................................... - // pkhtb r7, r6, r7, asr #16 // .......................*................................................................................................................................................................................................................................................................................... - // smulwb r1, r14, r9 // ...........*............................................................................................................................................................................................................................................................................................... - // usub16 r6, r8, r7 // .............................................*............................................................................................................................................................................................................................................................. - // smulwt r14, r14, r9 // .............*............................................................................................................................................................................................................................................................................................. - // uadd16 r9, r8, r7 // ...........................................*............................................................................................................................................................................................................................................................... - // smlabt r8, r1, r12, r0 // ..................*........................................................................................................................................................................................................................................................................................ - // vmov r1, s9 // .............................*............................................................................................................................................................................................................................................................................. - // smlabt r14, r14, r12, r0 // .................*......................................................................................................................................................................................................................................................................................... - // pkhtb r4, r4, r2, asr #16 // ...............................*........................................................................................................................................................................................................................................................................... - // smulwb r7, r1, r11 // ............................................*.............................................................................................................................................................................................................................................................. - // pkhtb r14, r14, r8, asr #16 // .....................*..................................................................................................................................................................................................................................................................................... - // smulwt r2, r1, r11 // ..........................................*................................................................................................................................................................................................................................................................ - // usub16 r8, r3, r14 // .........................*................................................................................................................................................................................................................................................................................. - // smlabt r7, r7, r12, r0 // ................................................*.......................................................................................................................................................................................................................................................... - // uadd16 r14, r3, r14 // ...........................*............................................................................................................................................................................................................................................................................... - // smlabt r2, r2, r12, r0 // ..............................................*............................................................................................................................................................................................................................................................ - // smulwb r3, r1, r14 // ..................................*........................................................................................................................................................................................................................................................................ - // pkhtb r2, r2, r7, asr #16 // ....................................................*...................................................................................................................................................................................................................................................... - // smulwt r11, r1, r14 // ....................................*...................................................................................................................................................................................................................................................................... - // vmov r7, s10 // ...............................................*........................................................................................................................................................................................................................................................... - // smlabt r1, r3, r12, r0 // ......................................*.................................................................................................................................................................................................................................................................... - // smulwb r3, r7, r8 // .....................................................*..................................................................................................................................................................................................................................................... - // uadd16 r14, r5, r4 // ...................................*....................................................................................................................................................................................................................................................................... - // smulwt r8, r7, r8 // ......................................................*.................................................................................................................................................................................................................................................... - // usub16 r4, r5, r4 // .................................*......................................................................................................................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ........................................*.................................................................................................................................................................................................................................................................. - // smlabt r5, r3, r12, r0 // ........................................................*.................................................................................................................................................................................................................................................. - // pkhtb r11, r11, r1, asr #16 // .................................................*......................................................................................................................................................................................................................................................... - // smlabt r8, r8, r12, r0 // ..........................................................*................................................................................................................................................................................................................................................ - // usub16 r3, r14, r11 // .......................................................*................................................................................................................................................................................................................................................... - // smulwb r1, r7, r10 // ..................................................*........................................................................................................................................................................................................................................................ - // pkhtb r8, r8, r5, asr #16 // .............................................................*............................................................................................................................................................................................................................................. - // smulwt r7, r7, r10 // ...................................................*....................................................................................................................................................................................................................................................... - // vmov r10, s12 // .........................................................*................................................................................................................................................................................................................................................. - // smlabt r1, r1, r12, r0 // ....................................................................*...................................................................................................................................................................................................................................... - // uadd16 r5, r4, r8 // .................................................................*......................................................................................................................................................................................................................................... - // smlabt r7, r7, r12, r0 // ..........................................................................*................................................................................................................................................................................................................................ - // uadd16 r14, r14, r11 // ...........................................................*............................................................................................................................................................................................................................................... - // smulwt r11, r10, r3 // ..............................................................*............................................................................................................................................................................................................................................ - // pkhtb r7, r7, r1, asr #16 // .............................................................................*............................................................................................................................................................................................................................. - // smulwb r1, r10, r3 // ............................................................*.............................................................................................................................................................................................................................................. - // vmov r3, s13 // ...........................................................................*............................................................................................................................................................................................................................... - // smlabt r10, r11, r12, r0 // ..................................................................*........................................................................................................................................................................................................................................ - // usub16 r11, r4, r8 // ...............................................................*........................................................................................................................................................................................................................................... - // smlabt r4, r1, r12, r0 // ................................................................*.......................................................................................................................................................................................................................................... - // usub16 r1, r6, r7 // ...................................................................................*....................................................................................................................................................................................................................... - // smulwb r8, r3, r5 // ................................................................................*.......................................................................................................................................................................................................................... - // uadd16 r6, r6, r7 // ...............................................................................*........................................................................................................................................................................................................................... - // smulwt r5, r3, r5 // ..................................................................................*........................................................................................................................................................................................................................ - // uadd16 r3, r9, r2 // .......................................................................*................................................................................................................................................................................................................................... - // smlabt r8, r8, r12, r0 // ....................................................................................*...................................................................................................................................................................................................................... - // vmov r7, s11 // ...................................................................*....................................................................................................................................................................................................................................... - // smlabt r5, r5, r12, r0 // ......................................................................................*.................................................................................................................................................................................................................... - // pkhtb r10, r10, r4, asr #16 // .....................................................................*..................................................................................................................................................................................................................................... - // smulwb r4, r7, r14 // ........................................................................*.................................................................................................................................................................................................................................. - // pkhtb r8, r5, r8, asr #16 // .........................................................................................*................................................................................................................................................................................................................. - // smulwt r14, r7, r14 // ......................................................................*.................................................................................................................................................................................................................................... - // usub16 r5, r9, r2 // .........................................................................*................................................................................................................................................................................................................................. - // smlabt r4, r4, r12, r0 // ............................................................................*.............................................................................................................................................................................................................................. - // vmov r9, s14 // .....................................................................................*..................................................................................................................................................................................................................... - // smlabt r2, r14, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................ - // usub16 r7, r6, r8 // .....................................................................................................*..................................................................................................................................................................................................... - // smulwb r14, r9, r11 // ........................................................................................*.................................................................................................................................................................................................................. - // pkhtb r4, r2, r4, asr #16 // .................................................................................*......................................................................................................................................................................................................................... - // vmov r2, s22 // .......................................................................................................................................*................................................................................................................................................................... - // smulwt r11, r9, r11 // ..........................................................................................*................................................................................................................................................................................................................ - // uadd16 r6, r6, r8 // .......................................................................................................*................................................................................................................................................................................................... - // smlabt r9, r14, r12, r0 // ............................................................................................*.............................................................................................................................................................................................................. - // vmov r14, s20 // .......................................................................................*................................................................................................................................................................................................................... - // smlabt r8, r11, r12, r0 // ..............................................................................................*............................................................................................................................................................................................................ - // smulwt r11, r14, r7 // ........................................................................................................*.................................................................................................................................................................................................. - // pkhtb r9, r8, r9, asr #16 // .................................................................................................*......................................................................................................................................................................................................... - // smulwb r7, r14, r7 // .........................................................................................................*................................................................................................................................................................................................. - // usub16 r8, r1, r9 // ..............................................................................................................*............................................................................................................................................................................................ - // smlabt r14, r11, r12, r0 // ...............................................................................................................*........................................................................................................................................................................................... - // uadd16 r1, r1, r9 // ............................................................................................................*.............................................................................................................................................................................................. - // smulwb r9, r2, r8 // ................................................................................................................................................*.......................................................................................................................................................... - // smulwt r11, r2, r8 // ..............................................................................................................................................*............................................................................................................................................................ - // uadd16 r8, r3, r4 // ..........................................................................................................................*................................................................................................................................................................................ - // smlabt r2, r9, r12, r0 // ....................................................................................................................................................*...................................................................................................................................................... - // vmov r9, s21 // ....................................................................................................................*...................................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ..................................................................................................................................................*........................................................................................................................................................ - // usub16 r4, r3, r4 // ............................................................................................................................*.............................................................................................................................................................................. - // smulwb r3, r9, r1 // .............................................................................................................................*............................................................................................................................................................................. - // pkhtb r2, r11, r2, asr #16 // .........................................................................................................................................................*................................................................................................................................................. - // smulwt r1, r9, r1 // ...............................................................................................................................*........................................................................................................................................................................... - // vmov s7, r2 // ...........................................................................................................................................................*............................................................................................................................................... - // smlabt r11, r3, r12, r0 // ................................................................................................................................*.......................................................................................................................................................................... - // vmov r2, s19 // ...................................................................................................*....................................................................................................................................................................................................... - // smlabt r1, r1, r12, r0 // ..................................................................................................................................*........................................................................................................................................................................ - // vmov r9, s16 // .....................................................................................................................................*..................................................................................................................................................................... - // smlabt r7, r7, r12, r0 // .............................................................................................................*............................................................................................................................................................................................. - // pkhtb r1, r1, r11, asr #16 // .........................................................................................................................................*................................................................................................................................................................. - // smulwt r11, r9, r4 // ......................................................................................................................................................*.................................................................................................................................................... - // vmov s6, r1 // ...............................................................................................................................................*........................................................................................................................................................... - // smulwt r1, r2, r6 // ...........................................................................................................*............................................................................................................................................................................................... - // pkhtb r3, r14, r7, asr #16 // ..................................................................................................................*........................................................................................................................................................................................ - // smlabt r7, r11, r12, r0 // ..........................................................................................................................................................*................................................................................................................................................ - // uadd16 r11, r5, r10 // ...........................................................................................*............................................................................................................................................................................................................... - // smulwb r14, r2, r6 // .................................................................................................................*......................................................................................................................................................................................... - // vmov r2, s17 // .............................................................................................*............................................................................................................................................................................................................. - // smlabt r1, r1, r12, r0 // .......................................................................................................................*................................................................................................................................................................................... - // smulwt r6, r2, r11 // ................................................................................................*.......................................................................................................................................................................................................... - // smlabt r14, r14, r12, r0 // .........................................................................................................................*................................................................................................................................................................................. - // vmov s5, r3 // ......................................................................................................................*.................................................................................................................................................................................... - // smlabt r6, r6, r12, r0 // ....................................................................................................*...................................................................................................................................................................................................... - // smulwb r11, r2, r11 // ..................................................................................................*........................................................................................................................................................................................................ - // pkhtb r14, r1, r14, asr #16 // ..............................................................................................................................*............................................................................................................................................................................ - // smulwb r1, r9, r4 // ........................................................................................................................................................*.................................................................................................................................................. - // vmov r9, s15 // .................................................................................................................................*......................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ......................................................................................................*.................................................................................................................................................................................................... - // vmov r4, s23 // .............................................................................................................................................*............................................................................................................................................................. - // smulwb r2, r9, r8 // ....................................................................................................................................*...................................................................................................................................................................... - // usub16 r5, r5, r10 // ...............................................................................................*........................................................................................................................................................................................................... - // smlabt r1, r1, r12, r0 // ..............................................................................................................................................................*............................................................................................................................................ - // vmov r3, s18 // ................................................................................................................*.......................................................................................................................................................................................... - // smlabt r10, r2, r12, r0 // ............................................................................................................................................*.............................................................................................................................................................. - // pkhtb r11, r6, r11, asr #16 // ..........................................................................................................*................................................................................................................................................................................................ - // smulwt r2, r3, r5 // ...................................................................................................................*....................................................................................................................................................................................... - // vmov s2, r11 // ........................................................................................................................*.................................................................................................................................................................................. - // smulwt r9, r9, r8 // ......................................................................................................................................*.................................................................................................................................................................... - // pkhtb r11, r7, r1, asr #16 // ...................................................................................................................................................................*....................................................................................................................................... - // smlabt r1, r2, r12, r0 // ........................................................................................................................................*.................................................................................................................................................................. - // vmov r2, s8 // ...................................................................................................................................................*....................................................................................................................................................... - // smulwb r3, r3, r5 // .....................................................................................................................*..................................................................................................................................................................................... - // ldr.w r7, [r4, #320] // .....................................................................................................................................................*..................................................................................................................................................... - // smlabt r5, r9, r12, r0 // ..........................................................................................................................................*................................................................................................................................................................ - // ldr.w r8, [r4, #0] // .............................................................................................................................................................*............................................................................................................................................. - // smlabt r6, r3, r12, r0 // ...........................................................................................................................*............................................................................................................................................................................... - // pkhtb r10, r5, r10, asr #16 // .................................................................................................................................................*......................................................................................................................................................... - // smulwb r5, r2, r7 // ................................................................................................................................................................*.......................................................................................................................................... - // ldr.w r3, [r4, #256] // .......................................................................................................................................................*................................................................................................................................................... - // smulwt r9, r2, r7 // ....................................................................................................................................................................*...................................................................................................................................... - // movw r0, #24608 // .....................................................................................................................................................................*..................................................................................................................................... - // smlabt r7, r5, r12, r0 // ..........................................................................................................................................................................*................................................................................................................................ - // vmov s4, r14 // ...................................................................................................................................*....................................................................................................................................................................... - // smulwb r5, r2, r3 // ............................................................................................................................................................*.............................................................................................................................................. - // pkhtb r14, r1, r6, asr #16 // ...........................................................................................................................................*............................................................................................................................................................... - // smulwt r3, r2, r3 // ..................................................................................................................................................................*........................................................................................................................................ - // ldr.w r6, [r4, #384] // .......................................................................................................................................................................*................................................................................................................................... - // smlabt r1, r5, r12, r0 // ......................................................................................................................................................................*.................................................................................................................................... - // ldr.w r5, [r4, #64] // .....................................................................................................................................................................................*..................................................................................................................... - // smlabt r3, r3, r12, r0 // ........................................................................................................................................................................*.................................................................................................................................. - // vmov s3, r14 // ...............................................................................................................................................................*........................................................................................................................................... - // smulwb r14, r2, r6 // ............................................................................................................................................................................*.............................................................................................................................. - // pkhtb r1, r3, r1, asr #16 // ...........................................................................................................................................................................*............................................................................................................................... - // smulwt r3, r2, r6 // ..................................................................................................................................................................................*........................................................................................................................ - // usub16 r6, r8, r1 // .............................................................................................................................................................................*............................................................................................................................. - // smlabt r9, r9, r12, r0 // ................................................................................................................................................................................*.......................................................................................................................... - // uadd16 r1, r8, r1 // .................................................................................................................................................................................*......................................................................................................................... - // smlabt r8, r3, r12, r0 // ........................................................................................................................................................................................*.................................................................................................................. - // pkhtb r3, r9, r7, asr #16 // ...................................................................................................................................................................................*....................................................................................................................... - // smlabt r9, r14, r12, r0 // ......................................................................................................................................................................................*.................................................................................................................... - // vmov s0, r10 // .................................................................................................................................................................*......................................................................................................................................... - // ldr.w r10, [r4, #448] // .........................................................................................................................................................................*................................................................................................................................. - // ldr.w r7, [r4, #128] // .......................................................................................................................................................................................*................................................................................................................... - // vmov s1, r11 // ...............................................................................................................................................................................*........................................................................................................................... - // pkhtb r8, r8, r9, asr #16 // ...........................................................................................................................................................................................*............................................................................................................... - // smulwb r14, r2, r10 // ..............................................................................................................................................................................*............................................................................................................................ - // uadd16 r11, r7, r8 // .............................................................................................................................................................................................*............................................................................................................. - // smulwt r10, r2, r10 // ....................................................................................................................................................................................*...................................................................................................................... - // ldr.w r9, [r4, #192] // .........................................................................................................................................................................................*................................................................................................................. - // smlabt r4, r14, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................ - // vmov r2, s9 // ............................................................................................................................................................................................*.............................................................................................................. - // smlabt r14, r10, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................ - // usub16 r10, r7, r8 // ...............................................................................................................................................................................................*........................................................................................................... - // smulwb r7, r2, r11 // ................................................................................................................................................................................................*.......................................................................................................... - // pkhtb r8, r14, r4, asr #16 // .................................................................................................................................................................................................*......................................................................................................... - // smulwt r14, r2, r11 // ..................................................................................................................................................................................................*........................................................................................................ - // usub16 r4, r9, r8 // ...................................................................................................................................................................................................*....................................................................................................... - // smlabt r11, r7, r12, r0 // ....................................................................................................................................................................................................*...................................................................................................... - // uadd16 r7, r9, r8 // .....................................................................................................................................................................................................*..................................................................................................... - // smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................*.................................................................................................... - // vmov r8, s10 // .......................................................................................................................................................................................................*................................................................................................... - // smulwb r9, r2, r7 // ........................................................................................................................................................................................................*.................................................................................................. - // pkhtb r11, r14, r11, asr #16 // .........................................................................................................................................................................................................*................................................................................................. - // smulwt r2, r2, r7 // ..........................................................................................................................................................................................................*................................................................................................ - // usub16 r14, r1, r11 // ...........................................................................................................................................................................................................*............................................................................................... - // smlabt r7, r9, r12, r0 // ............................................................................................................................................................................................................*.............................................................................................. - // uadd16 r1, r1, r11 // .............................................................................................................................................................................................................*............................................................................................. - // smlabt r11, r2, r12, r0 // ..............................................................................................................................................................................................................*............................................................................................ - // uadd16 r9, r5, r3 // .................................................................................................................................................................................................................*......................................................................................... - // smulwb r2, r8, r10 // ..................................................................................................................................................................................................................*........................................................................................ - // usub16 r3, r5, r3 // ...............................................................................................................................................................................................................*........................................................................................... - // smulwt r5, r8, r4 // ................................................................................................................................................................................................................*.......................................................................................... - // pkhtb r7, r11, r7, asr #16 // ...................................................................................................................................................................................................................*....................................................................................... - // smulwt r11, r8, r10 // ....................................................................................................................................................................................................................*...................................................................................... - // usub16 r10, r9, r7 // .....................................................................................................................................................................................................................*..................................................................................... - // smlabt r2, r2, r12, r0 // ......................................................................................................................................................................................................................*.................................................................................... - // uadd16 r7, r9, r7 // .......................................................................................................................................................................................................................*................................................................................... - // smlabt r9, r11, r12, r0 // ........................................................................................................................................................................................................................*.................................................................................. - // vmov r11, s12 // .........................................................................................................................................................................................................................*................................................................................. - // smulwb r4, r8, r4 // ..........................................................................................................................................................................................................................*................................................................................ - // pkhtb r9, r9, r2, asr #16 // ...........................................................................................................................................................................................................................*............................................................................... - // smulwb r2, r11, r10 // ............................................................................................................................................................................................................................*.............................................................................. - // usub16 r8, r6, r9 // .............................................................................................................................................................................................................................*............................................................................. - // smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................ - // uadd16 r9, r6, r9 // ...............................................................................................................................................................................................................................*........................................................................... - // vmov r6, s13 // .................................................................................................................................................................................................................................*......................................................................... - // smlabt r5, r5, r12, r0 // ................................................................................................................................................................................................................................*.......................................................................... - // smulwt r11, r11, r10 // ..................................................................................................................................................................................................................................*........................................................................ - // pkhtb r4, r5, r4, asr #16 // ...................................................................................................................................................................................................................................*....................................................................... - // smlabt r5, r2, r12, r0 // ....................................................................................................................................................................................................................................*...................................................................... - // uadd16 r2, r3, r4 // .....................................................................................................................................................................................................................................*..................................................................... - // smlabt r11, r11, r12, r0 // ......................................................................................................................................................................................................................................*.................................................................... - // usub16 r4, r3, r4 // .......................................................................................................................................................................................................................................*................................................................... - // smulwb r10, r6, r2 // ........................................................................................................................................................................................................................................*.................................................................. - // vmov r3, s11 // .............................................................................................................................................................................................................................................*............................................................. - // smulwt r2, r6, r2 // ..........................................................................................................................................................................................................................................*................................................................ - // pkhtb r5, r11, r5, asr #16 // .........................................................................................................................................................................................................................................*................................................................. - // smlabt r11, r10, r12, r0 // ............................................................................................................................................................................................................................................*.............................................................. - // uadd16 r10, r14, r5 // ...........................................................................................................................................................................................................................................*............................................................... - // smlabt r2, r2, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................ - // usub16 r14, r14, r5 // ...............................................................................................................................................................................................................................................*........................................................... - // smulwb r5, r3, r7 // ................................................................................................................................................................................................................................................*.......................................................... - // pkhtb r11, r2, r11, asr #16 // .................................................................................................................................................................................................................................................*......................................................... - // smulwt r7, r3, r7 // ..................................................................................................................................................................................................................................................*........................................................ - // usub16 r2, r9, r11 // ...................................................................................................................................................................................................................................................*....................................................... - // smlabt r3, r5, r12, r0 // ....................................................................................................................................................................................................................................................*...................................................... - // vmov r5, s14 // .....................................................................................................................................................................................................................................................*..................................................... - // smlabt r7, r7, r12, r0 // ......................................................................................................................................................................................................................................................*.................................................... - // uadd16 r9, r9, r11 // .......................................................................................................................................................................................................................................................*................................................... - // smulwb r11, r5, r4 // ..........................................................................................................................................................................................................................................................*................................................ - // pkhtb r3, r7, r3, asr #16 // .........................................................................................................................................................................................................................................................*................................................. - // smulwt r7, r5, r4 // ........................................................................................................................................................................................................................................................*.................................................. - // vmov r6, s1 // .............................................................................................................................................................................................................................................................*............................................. - // smlabt r5, r11, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................ - // usub16 r4, r1, r3 // ...........................................................................................................................................................................................................................................................*............................................... - // smlabt r7, r7, r12, r0 // ............................................................................................................................................................................................................................................................*.............................................. - // usub16 r11, r4, r6 // ...............................................................................................................................................................................................................................................................*........................................... - // vmov r0, s23 // ................................................................................................................................................................................................................................................................*.......................................... - // uadd16 r6, r4, r6 // .....................................................................................................................................................................................................................................................................*..................................... - // str.w r11, [r0, #96] // .................................................................................................................................................................................................................................................................*......................................... - // str.w r6, [r0, #64] // ......................................................................................................................................................................................................................................................................*.................................... - // vmov r4, s2 // ..................................................................................................................................................................................................................................................................*........................................ - // usub16 r11, r10, r4 // .......................................................................................................................................................................................................................................................................*................................... - // str.w r11, [r0, #160] // ........................................................................................................................................................................................................................................................................*.................................. - // pkhtb r6, r7, r5, asr #16 // ...................................................................................................................................................................................................................................................................*....................................... - // vmov r11, s5 // ....................................................................................................................................................................................................................................................................*...................................... - // uadd16 r5, r2, r11 // .........................................................................................................................................................................................................................................................................*................................. - // str.w r5, [r0, #320] // ..........................................................................................................................................................................................................................................................................*................................ - // usub16 r7, r2, r11 // ...........................................................................................................................................................................................................................................................................*............................... - // str.w r7, [r0, #352] // ..............................................................................................................................................................................................................................................................................*............................ - // usub16 r5, r8, r6 // .............................................................................................................................................................................................................................................................................*............................. - // vmov r7, s7 // ............................................................................................................................................................................................................................................................................*.............................. - // usub16 r11, r5, r7 // ...............................................................................................................................................................................................................................................................................*........................... - // vmov r2, s0 // ....................................................................................................................................................................................................................................................................................*...................... - // uadd16 r7, r5, r7 // .................................................................................................................................................................................................................................................................................*......................... - // str.w r11, [r0, #480] // ................................................................................................................................................................................................................................................................................*.......................... - // uadd16 r1, r1, r3 // ...................................................................................................................................................................................................................................................................................*....................... - // vmov r3, s3 // ..................................................................................................................................................................................................................................................................................*........................ - // uadd16 r10, r10, r4 // .......................................................................................................................................................................................................................................................................................*................... - // str.w r10, [r0, #128] // ........................................................................................................................................................................................................................................................................................*.................. - // uadd16 r5, r14, r3 // .........................................................................................................................................................................................................................................................................................*................. - // str.w r5, [r0, #192] // ..........................................................................................................................................................................................................................................................................................*................ - // usub16 r10, r1, r2 // ...........................................................................................................................................................................................................................................................................................*............... - // str.w r10, [r0, #32] // ............................................................................................................................................................................................................................................................................................*.............. - // usub16 r3, r14, r3 // .....................................................................................................................................................................................................................................................................................*..................... - // str.w r3, [r0, #224] // ......................................................................................................................................................................................................................................................................................*.................... - // uadd16 r14, r1, r2 // .............................................................................................................................................................................................................................................................................................*............. - // vmov r1, s6 // ................................................................................................................................................................................................................................................................................................*.......... - // uadd16 r3, r8, r6 // ...............................................................................................................................................................................................................................................................................................*........... - // str.w r14, [r0], #4 // ..............................................................................................................................................................................................................................................................................................*............ - // usub16 r2, r3, r1 // .................................................................................................................................................................................................................................................................................................*......... - // str.w r2, [r0, #412] // ..................................................................................................................................................................................................................................................................................................*........ - // uadd16 r1, r3, r1 // ...................................................................................................................................................................................................................................................................................................*....... - // str.w r1, [r0, #380] // .....................................................................................................................................................................................................................................................................................................*..... - // str.w r7, [r0, #444] // ....................................................................................................................................................................................................................................................................................................*...... - // vmov r7, s4 // ......................................................................................................................................................................................................................................................................................................*.... - // usub16 r11, r9, r7 // .......................................................................................................................................................................................................................................................................................................*... - // str.w r11, [r0, #284] // ........................................................................................................................................................................................................................................................................................................*.. - // uadd16 r7, r9, r7 // .........................................................................................................................................................................................................................................................................................................*. - // str.w r7, [r0, #252] // ..........................................................................................................................................................................................................................................................................................................* - vmov tmp, s24 - cmp poly, tmp - bne layer1234_loop sub.w poly, #8*strincr @@ -778,249 +1098,246 @@ layer1234_loop: add.w tmp, poly, #strincr2*16 vmov s13, tmp - vmov twiddle_ptr, s25 layer567_loop: // Instructions: 110 - // Expected cycles: 60 - // Expected IPC: 1.83 + // Expected cycles: 59 + // Expected IPC: 1.86 // // --------------------------------------------- original position ---------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|--------- - ldr.w r6, [r0, #24] // *............................................................................................................. - ldr.w r2, [r1], #28 // .....*........................................................................................................ - ldr.w r3, [r0, #16] // ..........*................................................................................................... - ldr.w r8, [r0, #20] // ......*....................................................................................................... - ldr.w r11, [r0, #28] // ........*..................................................................................................... - smulwt r10, r2, r6 // ...........*.................................................................................................. - ldr.w r5, [r0, #4] // ...*.......................................................................................................... - smulwb r14, r2, r8 // ......................*....................................................................................... - ldr.w r4, [r0, #0] // ....*......................................................................................................... - smulwt r7, r2, r8 // ........................*..................................................................................... - ldr.w r8, [r0, #12] // .*............................................................................................................ - smulwb r6, r2, r6 // .........*.................................................................................................... - ldr.w r9, [r0, #8] // ..*........................................................................................................... - vmov s23, r0 // .......*...................................................................................................... - movw r0, #24608 // ............*................................................................................................. - smlabt r14, r14, r12, r0 // ..........................*................................................................................... + vmov s23, r0 // .*............................................................................................................ + ldr.w r9, [r1], #28 // *............................................................................................................. + ldr.w r14, [r0, #24] // ..*........................................................................................................... + ldr.w r2, [r0, #20] // ...*.......................................................................................................... + ldr.w r10, [r0, #12] // ....*......................................................................................................... + ldr.w r8, [r0, #16] // .....*........................................................................................................ + ldr.w r6, [r0, #28] // ......*....................................................................................................... + smulwb r11, r9, r2 // .........*.................................................................................................... + ldr.w r4, [r0, #4] // ........*..................................................................................................... + smulwt r3, r9, r2 // .......*...................................................................................................... + ldr.w r2, [r0, #8] // ..........*................................................................................................... + smulwb r7, r9, r6 // ...........*.................................................................................................. + ldr.w r5, [r0, #0] // ............*................................................................................................. + smulwt r6, r9, r6 // .............*................................................................................................ + movw r0, #24608 // ..............*............................................................................................... + smlabt r7, r7, r12, r0 // ...............*.............................................................................................. // gap // .............................................................................................................. - smlabt r6, r6, r12, r0 // .............*................................................................................................ + smlabt r11, r11, r12, r0 // ................*............................................................................................. // gap // .............................................................................................................. - smlabt r10, r10, r12, r0 // ..............*............................................................................................... + smlabt r6, r6, r12, r0 // .................*............................................................................................ // gap // .............................................................................................................. - smlabt r7, r7, r12, r0 // ............................*................................................................................. - pkhtb r10, r10, r6, asr #16 // ................*............................................................................................. - smulwb r6, r2, r3 // .............................*................................................................................ - pkhtb r14, r7, r14, asr #16 // ..............................*............................................................................... - smulwt r3, r2, r3 // ...............................*.............................................................................. - usub16 r7, r5, r14 // ..................................*........................................................................... - smlabt r6, r6, r12, r0 // .................................*............................................................................ - uadd16 r14, r5, r14 // ....................................*......................................................................... - smlabt r3, r3, r12, r0 // ...................................*.......................................................................... + smlabt r3, r3, r12, r0 // ..................*........................................................................................... + pkhtb r6, r6, r7, asr #16 // ...................*.......................................................................................... + smulwb r7, r9, r14 // ....................*......................................................................................... // gap // .............................................................................................................. - smulwb r5, r2, r11 // ...............*.............................................................................................. - pkhtb r3, r3, r6, asr #16 // ......................................*....................................................................... - smulwt r11, r2, r11 // .................*............................................................................................ - ldr r2, [r1, #-20] // ................................*............................................................................. - smlabt r5, r5, r12, r0 // ...................*.......................................................................................... - usub16 r6, r9, r10 // ..................*........................................................................................... - smlabt r11, r11, r12, r0 // .....................*........................................................................................ - uadd16 r10, r9, r10 // ....................*......................................................................................... - smulwb r9, r2, r6 // .....................................*........................................................................ - pkhtb r11, r11, r5, asr #16 // .......................*...................................................................................... - smulwt r6, r2, r6 // .......................................*...................................................................... - uadd16 r5, r8, r11 // ...........................*.................................................................................. - smlabt r9, r9, r12, r0 // .........................................*.................................................................... - usub16 r11, r8, r11 // .........................*.................................................................................... - smlabt r6, r6, r12, r0 // ...........................................*.................................................................. - usub16 r8, r4, r3 // ..........................................*................................................................... + smulwt r14, r9, r14 // ......................*....................................................................................... + pkhtb r3, r3, r11, asr #16 // .....................*........................................................................................ + smlabt r11, r7, r12, r0 // .......................*...................................................................................... + usub16 r7, r4, r3 // ..........................*................................................................................... + smlabt r14, r14, r12, r0 // .........................*.................................................................................... + uadd16 r3, r4, r3 // ........................*..................................................................................... + smulwb r4, r9, r8 // ...........................*.................................................................................. + pkhtb r11, r14, r11, asr #16 // ............................*................................................................................. + smulwt r8, r9, r8 // .............................*................................................................................ + ldr r9, [r1, #-20] // ...............................................*.............................................................. + smlabt r4, r4, r12, r0 // ...............................*.............................................................................. + usub16 r14, r2, r11 // .....................................*........................................................................ + smlabt r8, r8, r12, r0 // .................................*............................................................................ + uadd16 r2, r2, r11 // .......................................*...................................................................... + smulwb r11, r9, r14 // ....................................................*......................................................... + pkhtb r8, r8, r4, asr #16 // ...................................*.......................................................................... + smulwt r14, r9, r14 // .....................................................*........................................................ + uadd16 r4, r10, r6 // ..............................*............................................................................... + smlabt r11, r11, r12, r0 // ........................................................*..................................................... + usub16 r6, r10, r6 // ..................................*........................................................................... + smlabt r10, r14, r12, r0 // .........................................................*.................................................... // gap // .............................................................................................................. - pkhtb r9, r6, r9, asr #16 // .............................................*................................................................ - smulwb r6, r2, r11 // ..............................................*............................................................... - uadd16 r3, r4, r3 // ........................................*..................................................................... - smulwt r2, r2, r11 // ............................................*................................................................. - ldr r11, [r1, #-24] // ...............................................*.............................................................. - smlabt r4, r6, r12, r0 // ..................................................*........................................................... - uadd16 r6, r8, r9 // ...................................................*.......................................................... - smlabt r2, r2, r12, r0 // ................................................*............................................................. - usub16 r8, r8, r9 // .................................................*............................................................ - smulwb r9, r11, r5 // ....................................................*......................................................... - pkhtb r2, r2, r4, asr #16 // ......................................................*....................................................... - smulwt r4, r11, r5 // .....................................................*........................................................ - usub16 r5, r7, r2 // .......................................................*...................................................... - smlabt r9, r9, r12, r0 // ........................................................*..................................................... - uadd16 r7, r7, r2 // .........................................................*.................................................... - smlabt r2, r4, r12, r0 // ..........................................................*................................................... + smulwb r14, r9, r6 // ...........................................................*.................................................. + pkhtb r10, r10, r11, asr #16 // ............................................................*................................................. + smulwt r9, r9, r6 // .............................................................*................................................ + ldr r6, [r1, #-24] // ................................*............................................................................. + smlabt r14, r14, r12, r0 // ...............................................................*.............................................. + uadd16 r11, r5, r8 // .........................................*.................................................................... + smlabt r9, r9, r12, r0 // .................................................................*............................................ + usub16 r5, r5, r8 // ...........................................*.................................................................. + smulwb r8, r6, r4 // ....................................*......................................................................... + pkhtb r14, r9, r14, asr #16 // ....................................................................*......................................... + smulwt r4, r6, r4 // ......................................*....................................................................... + usub16 r9, r7, r14 // ........................................................................*..................................... + smlabt r8, r8, r12, r0 // ........................................*..................................................................... + uadd16 r7, r7, r14 // ..........................................................................*................................... + smlabt r14, r4, r12, r0 // ..........................................*................................................................... // gap // .............................................................................................................. - smulwb r4, r11, r10 // ...........................................................*.................................................. - pkhtb r9, r2, r9, asr #16 // ............................................................*................................................. - smulwt r11, r11, r10 // .............................................................*................................................ - ldr r2, [r1, #-16] // ......................................................................................*....................... - smlabt r10, r4, r12, r0 // ...............................................................*.............................................. - uadd16 r4, r14, r9 // ..................................................................*........................................... - smlabt r11, r11, r12, r0 // .................................................................*............................................ - usub16 r9, r14, r9 // ................................................................*............................................. - smulwb r14, r2, r4 // ...........................................................................................*.................. - pkhtb r11, r11, r10, asr #16 // ....................................................................*......................................... - smulwt r2, r2, r4 // ............................................................................................*................. - ldr r4, [r1, #-12] // ..............................................................*............................................... - smlabt r14, r14, r12, r0 // ..............................................................................................*............... - uadd16 r10, r3, r11 // ........................................................................*..................................... - smlabt r2, r2, r12, r0 // ................................................................................................*............. - usub16 r3, r3, r11 // ..........................................................................*................................... - smulwb r11, r4, r9 // ...................................................................*.......................................... - pkhtb r2, r2, r14, asr #16 // .....................................................................................................*........ - smulwt r4, r4, r9 // .....................................................................*........................................ - ldr r14, [r1, #-4] // ..............................................................................*............................... - smlabt r9, r11, r12, r0 // .......................................................................*...................................... + smulwb r4, r6, r2 // ..............................................*............................................................... + pkhtb r8, r14, r8, asr #16 // .............................................*................................................................ + smulwt r14, r6, r2 // ............................................*................................................................. + ldr r6, [r1, #-12] // ......................................................................*....................................... + smlabt r4, r4, r12, r0 // ..................................................*........................................................... + usub16 r2, r3, r8 // .................................................*............................................................ + smlabt r14, r14, r12, r0 // ................................................*............................................................. + uadd16 r8, r3, r8 // ...................................................*.......................................................... + smulwb r3, r6, r2 // ...........................................................................*.................................. + pkhtb r4, r14, r4, asr #16 // ......................................................*....................................................... + smulwt r2, r6, r2 // .............................................................................*................................ + ldr r14, [r1, #-16] // ..............................................................*............................................... + smlabt r6, r3, r12, r0 // ...............................................................................*.............................. + usub16 r3, r5, r10 // ..................................................................*........................................... + smlabt r2, r2, r12, r0 // .................................................................................*............................ + uadd16 r5, r5, r10 // ................................................................*............................................. + smulwb r10, r14, r8 // ...................................................................*.......................................... + pkhtb r2, r2, r6, asr #16 // ....................................................................................*......................... + smulwt r8, r14, r8 // .....................................................................*........................................ + ldr r6, [r1, #-4] // ......................................................................................*....................... + smlabt r10, r10, r12, r0 // .......................................................................*...................................... + uadd16 r14, r11, r4 // ..........................................................*................................................... + smlabt r8, r8, r12, r0 // .........................................................................*.................................... + usub16 r11, r11, r4 // .......................................................*...................................................... + smulwb r4, r6, r9 // ...........................................................................................*.................. + pkhtb r10, r8, r10, asr #16 // ............................................................................*................................. + smulwt r9, r6, r9 // .............................................................................................*................ + ldr r8, [r1, #-8] // ..............................................................................*............................... + smlabt r4, r4, r12, r0 // ...............................................................................................*.............. + uadd16 r6, r14, r10 // ..................................................................................*........................... + smlabt r9, r9, r12, r0 // .................................................................................................*............ + usub16 r14, r14, r10 // ................................................................................*............................. + smulwb r10, r8, r7 // ...................................................................................*.......................... // gap // .............................................................................................................. - smlabt r11, r4, r12, r0 // .........................................................................*.................................... + smulwt r7, r8, r7 // .....................................................................................*........................ + uadd16 r8, r11, r2 // ..........................................................................................*................... + smlabt r10, r10, r12, r0 // .......................................................................................*...................... + usub16 r2, r11, r2 // ........................................................................................*..................... + smlabt r7, r7, r12, r0 // .........................................................................................*.................... + vmov r0, s23 // ..................................................................................................*........... + str.w r6, [r0], #32 // .....................................................................................................*........ // @slothy:core // @slothy:before=cmp + str r14, [r0, #-28] // ...................................................................................................*.......... // gap // .............................................................................................................. - smulwb r4, r14, r5 // ...................................................................................*.......................... - pkhtb r11, r11, r9, asr #16 // ............................................................................*................................. - smulwt r14, r14, r5 // .....................................................................................*........................ - ldr r9, [r1, #-8] // ......................................................................*....................................... - smlabt r4, r4, r12, r0 // .......................................................................................*...................... - usub16 r5, r3, r11 // ..................................................................................*........................... - smlabt r14, r14, r12, r0 // .........................................................................................*.................... - uadd16 r11, r3, r11 // ................................................................................*............................. - smulwt r3, r9, r7 // .............................................................................*................................ - pkhtb r14, r14, r4, asr #16 // .............................................................................................*................ - smulwb r4, r9, r7 // ...........................................................................*.................................. - uadd16 r9, r8, r14 // ...................................................................................................*.......... - smlabt r7, r3, r12, r0 // .................................................................................*............................ - usub16 r3, r10, r2 // .........................................................................................................*.... - smlabt r4, r4, r12, r0 // ...............................................................................*.............................. - vmov r0, s23 // .................................................................................................*............ - str.w r3, [r0, #4] // ............................................................................................................*. - str.w r11, [r0, #8] // ..................................................................................................*........... - // gap // .............................................................................................................. - str.w r5, [r0, #12] // ....................................................................................................*......... - // gap // .............................................................................................................. - pkhtb r4, r7, r4, asr #16 // ....................................................................................*......................... - str.w r9, [r0, #24] // ..........................................................................................................*... - usub16 r7, r6, r4 // ........................................................................................*..................... - str.w r7, [r0, #20] // ......................................................................................................*....... - uadd16 r9, r10, r2 // .......................................................................................................*...... - str.w r9, [r0], #32 // .............................................................................................................* // @slothy:core // @slothy:before=cmp - uadd16 r6, r6, r4 // ..........................................................................................*................... - str r6, [r0, #-16] // ........................................................................................................*..... - usub16 r6, r8, r14 // ...............................................................................................*.............. - str r6, [r0, #-4] // ...........................................................................................................*.. + pkhtb r7, r7, r10, asr #16 // ............................................................................................*................. + str r2, [r0, #-20] // .......................................................................................................*...... + usub16 r6, r5, r7 // ..............................................................................................*............... + str r8, [r0, #-24] // .........................................................................................................*.... + uadd16 r14, r5, r7 // ................................................................................................*............. + str r14, [r0, #-16] // ...........................................................................................................*.. + pkhtb r14, r9, r4, asr #16 // ....................................................................................................*......... + str r6, [r0, #-12] // ..........................................................................................................*... + uadd16 r8, r3, r14 // ........................................................................................................*..... + str r8, [r0, #-8] // ............................................................................................................*. + usub16 r8, r3, r14 // ......................................................................................................*....... + str r8, [r0, #-4] // .............................................................................................................* - // ----------------------------------------------- new position ------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|--------- - // ldr.w r3, [r0, #24] // *............................................................................................................. - // ldr.w r5, [r0, #12] // ..........*................................................................................................... - // ldr.w r6, [r0, #8] // ............*................................................................................................. - // ldr.w r14, [r0, #4] // ......*....................................................................................................... - // ldr.w r11, [r0, #0] // ........*..................................................................................................... - // ldr.w r10, [r1], #28 // .*............................................................................................................ - // ldr.w r9, [r0, #20] // ...*.......................................................................................................... - // vmov s23, r0 // .............*................................................................................................ - // ldr.w r8, [r0, #28] // ....*......................................................................................................... - // smulwb r2, r10, r3 // ...........*.................................................................................................. - // ldr.w r7, [r0, #16] // ..*........................................................................................................... - // smulwt r3, r10, r3 // .....*........................................................................................................ - // movw r0, #24608 // ..............*............................................................................................... - // smlabt r2, r2, r12, r0 // ................*............................................................................................. - // smlabt r3, r3, r12, r0 // .................*............................................................................................ - // smulwb r4, r10, r8 // ...........................*.................................................................................. - // pkhtb r2, r3, r2, asr #16 // ...................*.......................................................................................... - // smulwt r3, r10, r8 // .............................*................................................................................ - // usub16 r8, r6, r2 // ................................*............................................................................. - // smlabt r4, r4, r12, r0 // ...............................*.............................................................................. - // uadd16 r2, r6, r2 // ..................................*........................................................................... - // smlabt r3, r3, r12, r0 // .................................*............................................................................ - // smulwb r6, r10, r9 // .......*...................................................................................................... - // pkhtb r4, r3, r4, asr #16 // ....................................*......................................................................... - // smulwt r3, r10, r9 // .........*.................................................................................................... - // usub16 r9, r5, r4 // ........................................*..................................................................... - // smlabt r6, r6, r12, r0 // ...............*.............................................................................................. - // uadd16 r5, r5, r4 // ......................................*....................................................................... - // smlabt r4, r3, r12, r0 // ..................*........................................................................................... - // smulwb r3, r10, r7 // ....................*......................................................................................... - // pkhtb r6, r4, r6, asr #16 // .....................*........................................................................................ - // smulwt r10, r10, r7 // ......................*....................................................................................... - // ldr r4, [r1, #-20] // ..............................*............................................................................... - // smlabt r3, r3, r12, r0 // ........................*..................................................................................... - // usub16 r7, r14, r6 // .......................*...................................................................................... - // smlabt r10, r10, r12, r0 // ..........................*................................................................................... - // uadd16 r14, r14, r6 // .........................*.................................................................................... - // smulwb r6, r4, r8 // ...................................*.......................................................................... - // pkhtb r10, r10, r3, asr #16 // ............................*................................................................................. - // smulwt r8, r4, r8 // .....................................*........................................................................ - // uadd16 r3, r11, r10 // .............................................*................................................................ - // smlabt r6, r6, r12, r0 // .......................................*...................................................................... - // usub16 r10, r11, r10 // ..........................................*................................................................... - // smlabt r11, r8, r12, r0 // .........................................*.................................................................... - // smulwt r8, r4, r9 // ..............................................*............................................................... - // pkhtb r6, r11, r6, asr #16 // ...........................................*.................................................................. - // smulwb r9, r4, r9 // ............................................*................................................................. - // ldr r11, [r1, #-24] // ...............................................*.............................................................. - // smlabt r4, r8, r12, r0 // ..................................................*........................................................... - // usub16 r8, r10, r6 // ...................................................*.......................................................... - // smlabt r9, r9, r12, r0 // ................................................*............................................................. - // uadd16 r6, r10, r6 // .................................................*............................................................ - // smulwb r10, r11, r5 // ....................................................*......................................................... - // smulwt r5, r11, r5 // ......................................................*....................................................... - // pkhtb r4, r4, r9, asr #16 // .....................................................*........................................................ - // usub16 r9, r7, r4 // .......................................................*...................................................... - // smlabt r10, r10, r12, r0 // ........................................................*..................................................... - // uadd16 r7, r7, r4 // .........................................................*.................................................... - // smlabt r4, r5, r12, r0 // ..........................................................*................................................... - // smulwb r5, r11, r2 // ...........................................................*.................................................. - // pkhtb r4, r4, r10, asr #16 // ............................................................*................................................. - // smulwt r10, r11, r2 // .............................................................*................................................ - // ldr r2, [r1, #-12] // ......................................................................*....................................... - // smlabt r5, r5, r12, r0 // ...............................................................*.............................................. - // usub16 r11, r14, r4 // ..................................................................*........................................... - // smlabt r10, r10, r12, r0 // .................................................................*............................................ - // uadd16 r14, r14, r4 // ................................................................*............................................. - // smulwb r4, r2, r11 // ...........................................................................*.................................. - // pkhtb r5, r10, r5, asr #16 // ....................................................................*......................................... - // smulwt r10, r2, r11 // .............................................................................*................................ - // ldr r2, [r1, #-8] // ....................................................................................*......................... - // smlabt r4, r4, r12, r0 // ...............................................................................*.............................. - // uadd16 r11, r3, r5 // ........................................................................*..................................... - // smlabt r10, r10, r12, r0 // ................................................................................*............................. - // usub16 r5, r3, r5 // ..........................................................................*................................... - // smulwb r3, r2, r7 // ...........................................................................................*.................. - // pkhtb r10, r10, r4, asr #16 // ..................................................................................*........................... - // smulwt r7, r2, r7 // .........................................................................................*.................... - // ldr r2, [r1, #-4] // ..............................................................................*............................... - // smlabt r3, r3, r12, r0 // ...............................................................................................*.............. - // uadd16 r4, r5, r10 // ........................................................................................*..................... - // smlabt r7, r7, r12, r0 // .............................................................................................*................ - // usub16 r10, r5, r10 // ......................................................................................*....................... - // smulwb r5, r2, r9 // .................................................................................*............................ - // pkhtb r7, r7, r3, asr #16 // ....................................................................................................*......... - // smulwt r9, r2, r9 // ...................................................................................*.......................... - // ldr r2, [r1, #-16] // ..............................................................*............................................... - // smlabt r5, r5, r12, r0 // .....................................................................................*........................ - // usub16 r3, r6, r7 // ......................................................................................................*....... - // smlabt r9, r9, r12, r0 // .......................................................................................*...................... - // uadd16 r6, r6, r7 // ..........................................................................................................*... - // smulwb r7, r2, r14 // ...................................................................*.......................................... - // smulwt r2, r2, r14 // .....................................................................*........................................ - // pkhtb r14, r9, r5, asr #16 // ..........................................................................................*................... - // smlabt r7, r7, r12, r0 // .......................................................................*...................................... - // usub16 r5, r8, r14 // ............................................................................................................*. - // smlabt r9, r2, r12, r0 // .........................................................................*.................................... - // vmov r0, s23 // ................................................................................................*............. - // str.w r4, [r0, #8] // ..................................................................................................*........... - // uadd16 r4, r8, r14 // ............................................................................................*................. - // str.w r10, [r0, #12] // ...................................................................................................*.......... - // pkhtb r7, r9, r7, asr #16 // ............................................................................*................................. - // str.w r3, [r0, #20] // .......................................................................................................*...... - // uadd16 r10, r11, r7 // ........................................................................................................*..... - // str.w r6, [r0, #16] // ...........................................................................................................*.. - // usub16 r6, r11, r7 // ..............................................................................................*............... - // str.w r4, [r0, #24] // .....................................................................................................*........ - // str.w r5, [r0, #28] // .............................................................................................................* - // str.w r6, [r0, #4] // .................................................................................................*............ - // str.w r10, [r0], #32 // .........................................................................................................*.... + // ----------------------------------------------- new position ------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|--------- + // ldr.w r9, [r1], #28 // .*............................................................................................................ + // vmov s23, r0 // *............................................................................................................. + // ldr.w r11, [r0, #24] // ..*........................................................................................................... + // ldr.w r2, [r0, #20] // ...*.......................................................................................................... + // ldr.w r5, [r0, #12] // ....*......................................................................................................... + // ldr.w r8, [r0, #16] // .....*........................................................................................................ + // ldr.w r3, [r0, #28] // ......*....................................................................................................... + // smulwt r14, r9, r2 // .........*.................................................................................................... + // ldr.w r10, [r0, #4] // ........*..................................................................................................... + // smulwb r7, r9, r2 // .......*...................................................................................................... + // ldr.w r2, [r0, #8] // ..........*................................................................................................... + // smulwb r6, r9, r3 // ...........*.................................................................................................. + // ldr.w r4, [r0, #0] // ............*................................................................................................. + // smulwt r3, r9, r3 // .............*................................................................................................ + // movw r0, #24608 // ..............*............................................................................................... + // smlabt r6, r6, r12, r0 // ...............*.............................................................................................. + // smlabt r7, r7, r12, r0 // ................*............................................................................................. + // smlabt r3, r3, r12, r0 // .................*............................................................................................ + // smlabt r14, r14, r12, r0 // ..................*........................................................................................... + // pkhtb r6, r3, r6, asr #16 // ...................*.......................................................................................... + // smulwb r3, r9, r11 // ....................*......................................................................................... + // pkhtb r7, r14, r7, asr #16 // ......................*....................................................................................... + // smulwt r11, r9, r11 // .....................*........................................................................................ + // smlabt r14, r3, r12, r0 // .......................*...................................................................................... + // uadd16 r3, r10, r7 // ..........................*................................................................................... + // smlabt r11, r11, r12, r0 // .........................*.................................................................................... + // usub16 r7, r10, r7 // ........................*..................................................................................... + // smulwb r10, r9, r8 // ...........................*.................................................................................. + // pkhtb r11, r11, r14, asr #16 // ............................*................................................................................. + // smulwt r8, r9, r8 // .............................*................................................................................ + // uadd16 r14, r5, r6 // ......................................*....................................................................... + // smlabt r10, r10, r12, r0 // ...............................*.............................................................................. + // ldr r9, [r1, #-24] // .............................................*................................................................ + // smlabt r8, r8, r12, r0 // .................................*............................................................................ + // usub16 r6, r5, r6 // ........................................*..................................................................... + // pkhtb r8, r8, r10, asr #16 // ....................................*......................................................................... + // smulwb r5, r9, r14 // ..................................................*........................................................... + // usub16 r10, r2, r11 // ................................*............................................................................. + // smulwt r14, r9, r14 // ....................................................*......................................................... + // uadd16 r11, r2, r11 // ..................................*........................................................................... + // smlabt r5, r5, r12, r0 // ......................................................*....................................................... + // uadd16 r2, r4, r8 // ...............................................*.............................................................. + // smlabt r14, r14, r12, r0 // ........................................................*..................................................... + // usub16 r8, r4, r8 // .................................................*............................................................ + // smulwt r4, r9, r11 // ...........................................................*.................................................. + // pkhtb r14, r14, r5, asr #16 // ..........................................................*................................................... + // smulwb r9, r9, r11 // .........................................................*.................................................... + // ldr r11, [r1, #-20] // ..............................*............................................................................... + // smlabt r4, r4, r12, r0 // ...............................................................*.............................................. + // usub16 r5, r3, r14 // ..............................................................*............................................... + // smlabt r9, r9, r12, r0 // .............................................................*................................................ + // uadd16 r14, r3, r14 // ................................................................*............................................. + // smulwb r3, r11, r10 // ...................................*.......................................................................... + // smulwt r10, r11, r10 // .....................................*........................................................................ + // pkhtb r9, r4, r9, asr #16 // ..................................................................*........................................... + // usub16 r4, r2, r9 // ................................................................................*............................. + // smlabt r3, r3, r12, r0 // .......................................*...................................................................... + // smlabt r10, r10, r12, r0 // .........................................*.................................................................... + // uadd16 r2, r2, r9 // ..............................................................................*............................... + // smulwb r9, r11, r6 // ..........................................*................................................................... + // pkhtb r10, r10, r3, asr #16 // ...........................................*.................................................................. + // smulwt r3, r11, r6 // ............................................*................................................................. + // ldr r6, [r1, #-16] // ....................................................................*......................................... + // smlabt r9, r9, r12, r0 // ..............................................*............................................................... + // uadd16 r11, r8, r10 // ........................................................................*..................................... + // smlabt r3, r3, r12, r0 // ................................................*............................................................. + // usub16 r10, r8, r10 // ......................................................................*....................................... + // smulwb r8, r6, r14 // .........................................................................*.................................... + // pkhtb r3, r3, r9, asr #16 // ...................................................*.......................................................... + // smulwt r6, r6, r14 // ...........................................................................*.................................. + // ldr r14, [r1, #-12] // ............................................................*................................................. + // smlabt r8, r8, r12, r0 // .............................................................................*................................ + // usub16 r9, r7, r3 // .....................................................*........................................................ + // smlabt r6, r6, r12, r0 // ...............................................................................*.............................. + // uadd16 r7, r7, r3 // .......................................................*...................................................... + // smulwb r3, r14, r5 // .................................................................*............................................ + // pkhtb r6, r6, r8, asr #16 // ..................................................................................*........................... + // smulwt r8, r14, r5 // ...................................................................*.......................................... + // ldr r14, [r1, #-8] // ....................................................................................*......................... + // smlabt r5, r3, r12, r0 // .....................................................................*........................................ + // usub16 r3, r2, r6 // ........................................................................................*..................... + // smlabt r8, r8, r12, r0 // .......................................................................*...................................... + // uadd16 r6, r2, r6 // ......................................................................................*....................... + // smulwb r2, r14, r7 // .........................................................................................*.................... + // pkhtb r8, r8, r5, asr #16 // ..........................................................................*................................... + // smulwt r14, r14, r7 // ..........................................................................................*................... + // ldr r7, [r1, #-4] // ............................................................................*................................. + // smlabt r2, r2, r12, r0 // ............................................................................................*................. + // usub16 r5, r4, r8 // .............................................................................................*................ + // smlabt r14, r14, r12, r0 // ..............................................................................................*............... + // uadd16 r8, r4, r8 // ...........................................................................................*.................. + // smulwb r4, r7, r9 // .................................................................................*............................ + // pkhtb r14, r14, r2, asr #16 // ..................................................................................................*........... + // smulwt r9, r7, r9 // ...................................................................................*.......................... + // usub16 r7, r11, r14 // ....................................................................................................*......... + // smlabt r2, r4, r12, r0 // .....................................................................................*........................ + // uadd16 r11, r11, r14 // ......................................................................................................*....... + // smlabt r14, r9, r12, r0 // .......................................................................................*...................... + // vmov r0, s23 // ...............................................................................................*.............. + // str.w r3, [r0, #4] // .................................................................................................*............ + // pkhtb r14, r14, r2, asr #16 // ........................................................................................................*..... + // str.w r6, [r0], #32 // ................................................................................................*............. + // usub16 r9, r10, r14 // ............................................................................................................*. + // str r5, [r0, #-20] // ...................................................................................................*.......... + // uadd16 r6, r10, r14 // ..........................................................................................................*... + // str r8, [r0, #-24] // .....................................................................................................*........ + // str r7, [r0, #-12] // .........................................................................................................*.... + // str r11, [r0, #-16] // .......................................................................................................*...... + // str r6, [r0, #-8] // ...........................................................................................................*.. + // str r9, [r0, #-4] // .............................................................................................................* vmov tmp, s13 cmp poly, tmp diff --git a/examples/opt/armv7m/ntt_dilithium_opt_m7.s b/examples/opt/armv7m/ntt_dilithium_opt_m7.s index f1158bcb..1035708c 100644 --- a/examples/opt/armv7m/ntt_dilithium_opt_m7.s +++ b/examples/opt/armv7m/ntt_dilithium_opt_m7.s @@ -243,409 +243,409 @@ pqcrystals_dilithium_ntt_opt_m7: add.w temp_l, ptr_p, #32*strincr // 32 iterations vmov s9, temp_l - // Instructions: 78 - // Expected cycles: 43 - // Expected IPC: 1.81 - // - // Cycle bound: 43.0 - // IPC bound: 1.81 - // - // Wall time: 6.53s - // User time: 6.53s - // - // ------------ cycle (expected) ------------> - // 0 25 - // |------------------------|----------------- - ldr.w r11, [r0, #896] // *.......................................... - vmov r12, s2 // *.......................................... - vmov r6, s4 // .*......................................... - ldr.w r10, [r0, #768] // .*......................................... - ldr.w r8, [r0, #512] // ..*........................................ - smull r7, r11, r11, r12 // ..*........................................ - ldr.w r4, [r0, #384] // ...*....................................... - smull r9, r10, r10, r12 // ...*....................................... - ldr.w r14, [r0, #640] // ....*...................................... - mul r5, r7, r2 // ....*...................................... - mul r1, r9, r2 // .....*..................................... - smlal r7, r11, r5, r3 // ......*.................................... - smull r7, r5, r8, r12 // .......*................................... - add r4, r4, r11 // ........*.................................. - smlal r9, r10, r1, r3 // ........*.................................. - sub.w r9, r4, r11, lsl #1 // .........*................................. - mul r1, r7, r2 // .........*................................. - vmov r8, s3 // ..........*................................ - smull r11, r9, r9, r6 // ..........*................................ - smlal r7, r5, r1, r3 // ...........*............................... - mul r7, r11, r2 // ............*.............................. - smull r14, r12, r14, r12 // .............*............................. - smlal r11, r9, r7, r3 // ..............*............................ - ldr.w r7, [r0, #256] // ...............*........................... - mul r1, r14, r2 // ...............*........................... - add r7, r7, r10 // ................*.......................... - smull r11, r4, r4, r8 // ................*.......................... - smlal r14, r12, r1, r3 // .................*......................... - sub.w r1, r7, r10, lsl #1 // ..................*........................ - mul r10, r11, r2 // ..................*........................ - ldr.w r14, [r0, #128] // ...................*....................... - smull r6, r1, r1, r6 // ...................*....................... - add r14, r14, r12 // ....................*...................... - smlal r11, r4, r10, r3 // ....................*...................... - sub.w r10, r14, r12, lsl #1 // .....................*..................... - mul r11, r6, r2 // .....................*..................... - add r12, r10, r9 // ......................*.................... - smull r10, r8, r7, r8 // ......................*.................... - add r7, r14, r4 // .......................*................... - smlal r6, r1, r11, r3 // .......................*................... - vmov r14, s5 // ........................*.................. - mul r11, r10, r2 // ........................*.................. - sub.w r6, r7, r4, lsl #1 // .........................*................. - smull r14, r7, r7, r14 // .........................*................. - ldr.w r4, [r0] // ..........................*................ - smlal r10, r8, r11, r3 // ..........................*................ - vmov r10, s7 // ...........................*............... - mul r11, r14, r2 // ...........................*............... - sub.w r9, r12, r9, lsl #1 // ............................*.............. - smull r12, r10, r12, r10 // ............................*.............. - add r4, r4, r5 // .............................*............. - smlal r14, r7, r11, r3 // .............................*............. - vmov r11, s8 // ..............................*............ - mul r14, r12, r2 // ..............................*............ - sub.w r5, r4, r5, lsl #1 // ...............................*........... - smull r11, r9, r9, r11 // ...............................*........... - add r5, r5, r1 // ................................*.......... - smlal r12, r10, r14, r3 // ................................*.......... - vmov r14, s6 // .................................*......... - mul r12, r11, r2 // .................................*......... - add r4, r4, r8 // ..................................*........ - smull r14, r6, r6, r14 // ..................................*........ - sub.w r1, r5, r1, lsl #1 // ...................................*....... - smlal r11, r9, r12, r3 // ...................................*....... - add r5, r5, r10 // ....................................*...... - str.w r5, [r0, #512] // ....................................*...... - add r1, r1, r9 // .....................................*..... - mul r11, r14, r2 // .....................................*..... - str.w r1, [r0, #768] // ......................................*.... - sub.w r1, r1, r9, lsl #1 // ......................................*.... - sub.w r10, r5, r10, lsl #1 // .......................................*... - smlal r14, r6, r11, r3 // .......................................*... - str.w r1, [r0, #896] // ........................................*.. - sub.w r1, r4, r8, lsl #1 // ........................................*.. - add r1, r1, r6 // .........................................*. - str.w r1, [r0, #256] // .........................................*. - sub.w r11, r1, r6, lsl #1 // ..........................................* - str.w r10, [r0, #640] // ..........................................* - - // ------------ cycle (expected) ------------> - // 0 25 - // |------------------------|----------------- - // vmov r8, s3 // ..........*................................ - // vmov r9, s2 // *.......................................... - // ldr.w r10, [r0, #896] // *.......................................... - // ldr.w r14, [r0, #768] // .*......................................... - // ldr.w r1, [r0, #512] // ..*........................................ - // smull r4, r5, r10, r9 // ..*........................................ - // smull r11, r6, r14, r9 // ...*....................................... - // ldr.w r14, [r0, #384] // ...*....................................... - // mul r10, r4, r2 // ....*...................................... - // mul r12, r11, r2 // .....*..................................... - // smlal r4, r5, r10, r3 // ......*.................................... - // vmov r7, s4 // .*......................................... - // smull r4, r1, r1, r9 // .......*................................... - // add r14, r14, r5 // ........*.................................. - // smlal r11, r6, r12, r3 // ........*.................................. - // sub.w r5, r14, r5, lsl #1 // .........*................................. - // mul r10, r4, r2 // .........*................................. - // ldr.w r11, [r0, #640] // ....*...................................... - // smull r5, r12, r5, r7 // ..........*................................ - // smlal r4, r1, r10, r3 // ...........*............................... - // mul r10, r5, r2 // ............*.............................. - // smull r4, r11, r11, r9 // .............*............................. - // ldr.w r9, [r0, #256] // ...............*........................... - // smlal r5, r12, r10, r3 // ..............*............................ - // add r10, r9, r6 // ................*.......................... - // mul r9, r4, r2 // ...............*........................... - // smull r5, r14, r14, r8 // ................*.......................... - // sub.w r6, r10, r6, lsl #1 // ..................*........................ - // smlal r4, r11, r9, r3 // .................*......................... - // ldr.w r4, [r0, #128] // ...................*....................... - // mul r9, r5, r2 // ..................*........................ - // add r4, r4, r11 // ....................*...................... - // smull r7, r6, r6, r7 // ...................*....................... - // sub.w r11, r4, r11, lsl #1 // .....................*..................... - // smlal r5, r14, r9, r3 // ....................*...................... - // add r5, r11, r12 // ......................*.................... - // mul r9, r7, r2 // .....................*..................... - // add r4, r4, r14 // .......................*................... - // smull r10, r8, r10, r8 // ......................*.................... - // sub.w r11, r4, r14, lsl #1 // .........................*................. - // smlal r7, r6, r9, r3 // .......................*................... - // vmov r7, s5 // ........................*.................. - // mul r9, r10, r2 // ........................*.................. - // sub.w r14, r5, r12, lsl #1 // ............................*.............. - // smull r12, r7, r4, r7 // .........................*................. - // ldr.w r4, [r0] // ..........................*................ - // smlal r10, r8, r9, r3 // ..........................*................ - // vmov r10, s7 // ...........................*............... - // mul r9, r12, r2 // ...........................*............... - // add r4, r4, r1 // .............................*............. - // smull r5, r10, r5, r10 // ............................*.............. - // sub.w r1, r4, r1, lsl #1 // ...............................*........... - // smlal r12, r7, r9, r3 // .............................*............. - // vmov r12, s8 // ..............................*............ - // mul r9, r5, r2 // ..............................*............ - // add r1, r1, r6 // ................................*.......... - // smull r14, r12, r14, r12 // ...............................*........... - // sub.w r6, r1, r6, lsl #1 // ...................................*....... - // smlal r5, r10, r9, r3 // ................................*.......... - // vmov r5, s6 // .................................*......... - // mul r9, r14, r2 // .................................*......... - // add r1, r1, r10 // ....................................*...... - // smull r5, r11, r11, r5 // ..................................*........ - // sub.w r10, r1, r10, lsl #1 // .......................................*... - // smlal r14, r12, r9, r3 // ...................................*....... - // add r4, r4, r8 // ..................................*........ - // mul r14, r5, r2 // .....................................*..... - // add r6, r6, r12 // .....................................*..... - // str.w r6, [r0, #768] // ......................................*.... - // sub.w r8, r4, r8, lsl #1 // ........................................*.. - // smlal r5, r11, r14, r3 // .......................................*... - // sub.w r5, r6, r12, lsl #1 // ......................................*.... - // str.w r5, [r0, #896] // ........................................*.. - // str.w r10, [r0, #640] // ..........................................* - // add r10, r8, r11 // .........................................*. - // str.w r10, [r0, #256] // .........................................*. - // sub.w r11, r10, r11, lsl #1 // ..........................................* - // str.w r1, [r0, #512] // ....................................*...... - - push {r7} - vmov r7, s9 - sub r7, r7, #4 - vmov s9, r7 - pop {r7} -layer123_loop: - // Instructions: 86 - // Expected cycles: 46 - // Expected IPC: 1.87 - // - // Cycle bound: 46.0 - // IPC bound: 1.87 - // - // Wall time: 441.25s - // User time: 441.25s - // - // ------------- cycle (expected) --------------> - // 0 25 - // |------------------------|-------------------- - add r1, r4, r7 // *............................................. - str.w r11, [r0, #384] // *............................................. - sub.w r8, r1, r7, lsl #1 // .*............................................ - str.w r8, [r0, #128] // .*............................................ - vmov r8, s3 // ..e........................................... - str r1, [r0], #4 // ..*........................................... // @slothy:core - vmov r9, s2 // ...e.......................................... - ldr.w r10, [r0, #896] // ...e.......................................... - ldr.w r14, [r0, #768] // ....e......................................... - ldr.w r1, [r0, #512] // .....e........................................ - smull r4, r5, r10, r9 // .....e........................................ - vmov r7, s9 // ......*....................................... - smull r11, r6, r14, r9 // ......e....................................... - ldr.w r14, [r0, #384] // .......e...................................... - mul r10, r4, r2 // .......e...................................... - mul r12, r11, r2 // ........e..................................... - cmp.w r0, r7 // .........*.................................... - smlal r4, r5, r10, r3 // .........e.................................... - vmov r7, s4 // ..........e................................... - smull r4, r1, r1, r9 // ..........e................................... - add r14, r14, r5 // ...........e.................................. - smlal r11, r6, r12, r3 // ...........e.................................. - sub.w r5, r14, r5, lsl #1 // ............e................................. - mul r10, r4, r2 // ............e................................. - ldr.w r11, [r0, #640] // .............e................................ - smull r5, r12, r5, r7 // .............e................................ - smlal r4, r1, r10, r3 // ..............e............................... - mul r10, r5, r2 // ...............e.............................. - smull r4, r11, r11, r9 // ................e............................. - ldr.w r9, [r0, #256] // .................e............................ - smlal r5, r12, r10, r3 // .................e............................ - add r10, r9, r6 // ..................e........................... - mul r9, r4, r2 // ..................e........................... - smull r5, r14, r14, r8 // ...................e.......................... - sub.w r6, r10, r6, lsl #1 // ....................e......................... - smlal r4, r11, r9, r3 // ....................e......................... - ldr.w r4, [r0, #128] // .....................e........................ - mul r9, r5, r2 // .....................e........................ - add r4, r4, r11 // ......................e....................... - smull r7, r6, r6, r7 // ......................e....................... - sub.w r11, r4, r11, lsl #1 // .......................e...................... - smlal r5, r14, r9, r3 // .......................e...................... - add r5, r11, r12 // ........................e..................... - mul r9, r7, r2 // ........................e..................... - add r4, r4, r14 // .........................e.................... - smull r10, r8, r10, r8 // .........................e.................... - sub.w r11, r4, r14, lsl #1 // ..........................e................... - smlal r7, r6, r9, r3 // ..........................e................... - vmov r7, s5 // ...........................e.................. - mul r9, r10, r2 // ...........................e.................. - sub.w r14, r5, r12, lsl #1 // ............................e................. - smull r12, r7, r4, r7 // ............................e................. - ldr.w r4, [r0] // .............................e................ - smlal r10, r8, r9, r3 // .............................e................ - vmov r10, s7 // ..............................e............... - mul r9, r12, r2 // ..............................e............... - add r4, r4, r1 // ...............................e.............. - smull r5, r10, r5, r10 // ...............................e.............. - sub.w r1, r4, r1, lsl #1 // ................................e............. - smlal r12, r7, r9, r3 // ................................e............. - vmov r12, s8 // .................................e............ - mul r9, r5, r2 // .................................e............ - add r1, r1, r6 // ..................................e........... - smull r14, r12, r14, r12 // ..................................e........... - sub.w r6, r1, r6, lsl #1 // ...................................e.......... - smlal r5, r10, r9, r3 // ...................................e.......... - vmov r5, s6 // ....................................e......... - mul r9, r14, r2 // ....................................e......... - add r1, r1, r10 // .....................................e........ - smull r5, r11, r11, r5 // .....................................e........ - sub.w r10, r1, r10, lsl #1 // ......................................e....... - smlal r14, r12, r9, r3 // ......................................e....... - add r4, r4, r8 // .......................................e...... - mul r14, r5, r2 // .......................................e...... - add r6, r6, r12 // ........................................e..... - str.w r6, [r0, #768] // ........................................e..... - sub.w r8, r4, r8, lsl #1 // .........................................e.... - smlal r5, r11, r14, r3 // .........................................e.... - sub.w r5, r6, r12, lsl #1 // ..........................................e... - str.w r5, [r0, #896] // ..........................................e... - str.w r10, [r0, #640] // ...........................................e.. - add r10, r8, r11 // ...........................................e.. - str.w r10, [r0, #256] // ............................................e. - sub.w r11, r10, r11, lsl #1 // ............................................e. - str.w r1, [r0, #512] // .............................................e - bne layer123_loop // .............................................* // @slothy:branch - - // ----------------------------------- cycle (expected) ------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|-------------- - // ldr.w R5, [R0] // ...........................e................'............................~................ - // ldr.w R6, [R0, #1*512/4] // ...................e........................'....................~........................ - // ldr.w R7, [R0, #2*512/4] // ...............e............................'................~............................ - // ldr.w R8, [R0, #3*512/4] // .....e......................................'......~...................................... - // ldr.w R4, [R0, #4*512/4] // ...e........................................'....~........................................ - // ldr.w R11, [R0, #5*512/4] // ...........e................................'............~................................ - // ldr.w R12, [R0, #6*512/4] // ..e.........................................'...~......................................... - // ldr.w R14, [R0, #7*512/4] // .e..........................................'..~.......................................... - // vmov R1, s2 // .e..........................................'..~.......................................... - // smull R9, R4, R4, R1 // ........e...................................'.........~................................... - // mul R10, R9, R2 // ..........e.................................'...........~................................. - // smlal R9, R4, R10, R3 // ............e...............................'.............~............................... - // smull R9, R11, R11, R1 // ..............e.............................'...............~............................. - // mul R10, R9, R2 // ................e...........................'.................~........................... - // smlal R9, R11, R10, R3 // ..................e.........................'...................~......................... - // smull R9, R12, R12, R1 // ....e.......................................'.....~....................................... - // mul R10, R9, R2 // ......e.....................................'.......~..................................... - // smlal R9, R12, R10, R3 // .........e..................................'..........~.................................. - // smull R9, R14, R14, R1 // ...e........................................'....~........................................ - // mul R10, R9, R2 // .....e......................................'......~...................................... - // smlal R9, R14, R10, R3 // .......e....................................'........~.................................... - // add R5, R5, R4 // .............................e..............'..............................~.............. - // add R6, R6, R11 // ....................e.......................'.....................~....................... - // add R7, R7, R12 // ................e...........................'.................~........................... - // add R8, R8, R14 // .........e..................................'..........~.................................. - // sub.w R4, R5, R4, lsl #1 // ..............................e.............'...............................~............. - // sub.w R11, R6, R11, lsl #1 // .....................e......................'......................~...................... - // sub.w R12, R7, R12, lsl #1 // ..................e.........................'...................~......................... - // sub.w R14, R8, R14, lsl #1 // ..........e.................................'...........~................................. - // vmov R1, s3 // e...........................................'.~........................................... - // smull R9, R7, R7, R1 // .......................e....................'........................~.................... - // mul R10, R9, R2 // .........................e..................'..........................~.................. - // smlal R9, R7, R10, R3 // ...........................e................'............................~................ - // smull R9, R8, R8, R1 // .................e..........................'..................~.......................... - // mul R10, R9, R2 // ...................e........................'....................~........................ - // smlal R9, R8, R10, R3 // .....................e......................'......................~...................... - // vmov R1, s4 // ........e...................................'.........~................................... - // smull R9, R12, R12, R1 // ....................e.......................'.....................~....................... - // mul R10, R9, R2 // ......................e.....................'.......................~..................... - // smlal R9, R12, R10, R3 // ........................e...................'.........................~................... - // smull R9, R14, R14, R1 // ...........e................................'............~................................ - // mul R10, R9, R2 // .............e..............................'..............~.............................. - // smlal R9, R14, R10, R3 // ...............e............................'................~............................ - // add R5, R5, R7 // .....................................e......'......................................~...... - // add R6, R6, R8 // .......................e....................'........................~.................... - // add R4, R4, R12 // ................................e...........'.................................~........... - // add R11, R11, R14 // ......................e.....................'.......................~..................... - // sub.w R7, R5, R7, lsl #1 // .......................................e....'........................................~.... - // sub.w R8, R6, R8, lsl #1 // ........................e...................'.........................~................... - // sub.w R12, R4, R12, lsl #1 // .................................e..........'..................................~.......... - // sub.w R14, R11, R14, lsl #1 // ..........................e.................'...........................~................. - // vmov R1, s5 // .........................e..................'..........................~.................. - // smull R9, R6, R6, R1 // ..........................e.................'...........................~................. - // mul R10, R9, R2 // ............................e...............'.............................~............... - // smlal R9, R6, R10, R3 // ..............................e.............'...............................~............. - // vmov R1, s6 // ..................................e.........'...................................~......... - // smull R9, R8, R8, R1 // ...................................e........'....................................~........ - // mul R10, R9, R2 // .....................................e......'......................................~...... - // smlal R9, R8, R10, R3 // .......................................e....'........................................~.... - // vmov R1, s7 // ............................e...............'.............................~............... - // smull R9, R11, R11, R1 // .............................e..............'..............................~.............. - // mul R10, R9, R2 // ...............................e............'................................~............ - // smlal R9, R11, R10, R3 // .................................e..........'..................................~.......... - // vmov R1, s8 // ...............................e............'................................~............ - // smull R9, R14, R14, R1 // ................................e...........'.................................~........... - // mul R10, R9, R2 // ..................................e.........'...................................~......... - // smlal R9, R14, R10, R3 // ....................................e.......'.....................................~....... - // add R5, R5, R6 // ............................................*............................................. - // add R7, R7, R8 // .........................................e..'..........................................~.. - // add R4, R4, R11 // ...................................e........'....................................~........ - // add R12, R12, R14 // ......................................e.....'.......................................~..... - // sub.w R6, R5, R6, lsl #1 // ............................................'*............................................ - // sub.w R8, R7, R8, lsl #1 // ..........................................e.'...........................................~. - // sub.w R11, R4, R11, lsl #1 // ....................................e.......'.....................................~....... - // sub.w R14, R12, R14, lsl #1 // ........................................e...'.........................................~... - // str.w R6, [R0, #1*512/4] // ............................................'*............................................ - // str.w R7, [R0, #2*512/4] // ..........................................e.'...........................................~. - // str.w R8, [R0, #3*512/4] // ............................................*............................................. - // str.w R4, [R0, #4*512/4] // ...........................................e'............................................. - // str.w R11, [R0, #5*512/4] // .........................................e..'..........................................~.. - // str.w R12, [R0, #6*512/4] // ......................................e.....'.......................................~..... - // str.w R14, [R0, #7*512/4] // ........................................e...'.........................................~... - // str R5, [R0], #4 // ~...........................................'.*........................................... - // vmov R10, s9 // ....~.......................................'.....*....................................... - // cmp.w R0, R10 // .......~....................................'........*.................................... - // bne layer123_loop // ...........................................~'............................................* - - - // Instructions: 8 - // Expected cycles: 6 - // Expected IPC: 1.33 + // Instructions: 24 + // Expected cycles: 15 + // Expected IPC: 1.60 // - // Cycle bound: 6.0 - // IPC bound: 1.33 + // Cycle bound: 15.0 + // IPC bound: 1.60 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.07s + // User time: 0.07s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - add r1, r4, r7 // *............................. - str.w r11, [r0, #384] // *............................. - sub.w r11, r1, r7, lsl #1 // .*............................ - str.w r11, [r0, #128] // .*............................ - str r1, [r0], #4 // ..*........................... // @slothy:core - vmov r1, s9 // ..*........................... - cmp.w r0, r1 // ...*.......................... + ldr.w r6, [r0, #768] // *............................. + ldr.w r1, [r0, #896] // .*............................ + vmov r9, s2 // .*............................ + smull r4, r12, r6, r9 // ..*........................... + smull r1, r5, r1, r9 // ...*.......................... + mul r8, r4, r2 // ....*......................... + ldr.w r14, [r0, #640] // .....*........................ + mul r11, r1, r2 // .....*........................ + ldr.w r6, [r0, #512] // ......*....................... + smlal r4, r12, r8, r3 // ......*....................... + ldr.w r4, [r0, #256] // .......*...................... + smull r7, r14, r14, r9 // .......*...................... + add r8, r4, r12 // ........*..................... + smlal r1, r5, r11, r3 // ........*..................... + ldr.w r4, [r0, #384] // .........*.................... + mul r1, r7, r2 // .........*.................... + add r11, r4, r5 // ..........*................... + smull r10, r4, r6, r9 // ..........*................... + sub.w r6, r11, r5, lsl #1 // ...........*.................. + smlal r7, r14, r1, r3 // ...........*.................. + vmov r1, s3 // ............*................. + mul r9, r10, r2 // ............*................. + smull r5, r11, r11, r1 // .............*................ + smlal r10, r4, r9, r3 // ..............*............... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // add r1, r4, r7 // *.............................. - // str.w r11, [r0, #384] // *.............................. - // sub.w r8, r1, r7, lsl #1 // .*............................. - // str.w r8, [r0, #128] // .*............................. - // str r1, [r0], #4 // ..*............................ - // vmov r7, s9 // ..*............................ - // cmp.w r0, r7 // ...*........................... - // bne layer123_loop // .....*......................... + // ldr.w r1, [r0, #768] // *.............................. + // ldr.w r4, [r0, #896] // .*............................. + // vmov r9, s2 // .*............................. + // smull r14, r12, r1, r9 // ..*............................ + // smull r4, r7, r4, r9 // ...*........................... + // mul r5, r14, r2 // ....*.......................... + // ldr.w r6, [r0, #640] // .....*......................... + // smlal r14, r12, r5, r3 // ......*........................ + // smull r6, r14, r6, r9 // .......*....................... + // mul r5, r4, r2 // .....*......................... + // ldr.w r1, [r0, #512] // ......*........................ + // smlal r4, r7, r5, r3 // ........*...................... + // ldr.w r5, [r0, #384] // .........*..................... + // smull r11, r4, r1, r9 // ..........*.................... + // ldr.w r1, [r0, #256] // .......*....................... + // add r8, r1, r12 // ........*...................... + // mul r1, r11, r2 // ............*.................. + // add r5, r5, r7 // ..........*.................... + // mul r9, r6, r2 // .........*..................... + // smlal r11, r4, r1, r3 // ..............*................ + // vmov r1, s3 // ............*.................. + // smlal r6, r14, r9, r3 // ...........*................... + // sub.w r6, r5, r7, lsl #1 // ...........*................... + // smull r5, r11, r5, r1 // .............*................. + + push {r11} + vmov r11, s9 + sub r11, r11, #4 + vmov s9, r11 + pop {r11} +layer123_loop: + // Instructions: 86 + // Expected cycles: 46 + // Expected IPC: 1.87 + // + // Cycle bound: 44.0 + // IPC bound: 1.95 + // + // Wall time: 300.39s + // User time: 300.39s + // + // ------------- cycle (expected) --------------> + // 0 25 + // |------------------------|-------------------- + sub.w r7, r8, r12, lsl #1 // *............................................. + mul r12, r5, r2 // *............................................. + vmov r9, s4 // .*............................................ + smull r1, r8, r8, r1 // .*............................................ + smlal r5, r11, r12, r3 // ..*........................................... + ldr.w r5, [r0] // ...*.......................................... + mul r10, r1, r2 // ...*.......................................... + smull r6, r12, r6, r9 // ....*......................................... + add r5, r5, r4 // .....*........................................ + smlal r1, r8, r10, r3 // .....*........................................ + sub.w r10, r5, r4, lsl #1 // ......*....................................... + mul r4, r6, r2 // ......*....................................... + ldr.w r1, [r0, #128] // .......*...................................... + smull r7, r9, r7, r9 // .......*...................................... + add r1, r1, r14 // ........*..................................... + smlal r6, r12, r4, r3 // ........*..................................... + sub.w r6, r1, r14, lsl #1 // .........*.................................... + mul r4, r7, r2 // .........*.................................... + add r6, r6, r12 // ..........*................................... + vmov r14, s8 // ..........*................................... + sub.w r12, r6, r12, lsl #1 // ...........*.................................. + smlal r7, r9, r4, r3 // ...........*.................................. + add r4, r1, r11 // ............*................................. + smull r12, r1, r12, r14 // ............*................................. + add r14, r10, r9 // .............*................................ + vmov r10, s5 // .............*................................ + sub.w r7, r14, r9, lsl #1 // ..............*............................... + mul r9, r12, r2 // ..............*............................... + sub.w r11, r4, r11, lsl #1 // ...............*.............................. + smull r4, r10, r4, r10 // ...............*.............................. + add r5, r5, r8 // ................*............................. + smlal r12, r1, r9, r3 // ................*............................. + vmov r9, s7 // .................*............................ + mul r12, r4, r2 // .................*............................ + add r7, r7, r1 // ..................*........................... + smull r6, r9, r6, r9 // ..................*........................... + sub.w r1, r7, r1, lsl #1 // ...................*.......................... + str.w r7, [r0, #768] // ...................*.......................... + str.w r1, [r0, #896] // ....................*......................... + sub.w r8, r5, r8, lsl #1 // ....................*......................... + mul r1, r6, r2 // .....................*........................ + smlal r4, r10, r12, r3 // ......................*....................... + vmov r12, s6 // .......................*...................... + smlal r6, r9, r1, r3 // .......................*...................... + ldr.w r1, [r0, #772] // ........................e..................... + smull r7, r11, r11, r12 // ........................*..................... + add r14, r14, r9 // .........................*.................... + str.w r14, [r0, #512] // .........................*.................... + sub.w r6, r14, r9, lsl #1 // ..........................*................... + mul r14, r7, r2 // ..........................*................... + ldr.w r4, [r0, #900] // ...........................e.................. + str.w r6, [r0, #640] // ...........................*.................. + vmov r9, s2 // ............................e................. + smlal r7, r11, r14, r3 // ............................*................. + smull r14, r12, r1, r9 // .............................e................ + add r1, r5, r10 // ..............................*............... + smull r4, r7, r4, r9 // ..............................e............... + add r8, r8, r11 // ...............................*.............. + mul r5, r14, r2 // ...............................e.............. + str.w r8, [r0, #256] // ................................*............. + ldr.w r6, [r0, #644] // ................................e............. + sub.w r8, r8, r11, lsl #1 // .................................*............ + smlal r14, r12, r5, r3 // .................................e............ + sub.w r10, r1, r10, lsl #1 // ..................................*........... + smull r6, r14, r6, r9 // ..................................e........... + vmov r11, s9 // ...................................*.......... + mul r5, r4, r2 // ...................................e.......... + str r1, [r0], #4 // ....................................*......... // @slothy:core // @slothy:before=cmp + ldr.w r1, [r0, #512] // ....................................e......... + cmp.w r0, r11 // .....................................*........ // @slothy:id=cmp + smlal r4, r7, r5, r3 // .....................................e........ + ldr.w r5, [r0, #384] // ......................................e....... + smull r11, r4, r1, r9 // ......................................e....... + str.w r8, [r0, #380] // .......................................*...... + ldr.w r1, [r0, #256] // .......................................e...... + add r8, r1, r12 // ........................................e..... + mul r1, r11, r2 // ........................................e..... + add r5, r5, r7 // .........................................e.... + mul r9, r6, r2 // .........................................e.... + smlal r11, r4, r1, r3 // ..........................................e... + vmov r1, s3 // ...........................................e.. + smlal r6, r14, r9, r3 // ...........................................e.. + sub.w r6, r5, r7, lsl #1 // ............................................e. + smull r5, r11, r5, r1 // ............................................e. + str.w r10, [r0, #124] // .............................................* + bne layer123_loop // .............................................* // @slothy:branch + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr.w R5, [R0] // ......................'..*.......................................... + // ldr.w R6, [R0, #1*512/4] // ......................'......*...................................... + // ldr.w R7, [R0, #2*512/4] // ...............e......'......................................~...... + // ldr.w R8, [R0, #3*512/4] // ..............e.......'.....................................~....... + // ldr.w R4, [R0, #4*512/4] // ............e.........'...................................~......... + // ldr.w R11, [R0, #5*512/4] // ........e.............'...............................~............. + // ldr.w R12, [R0, #6*512/4] // e.....................'.......................~..................... + // ldr.w R14, [R0, #7*512/4] // ...e..................'..........................~.................. + // vmov R1, s2 // ....e.................'...........................~................. + // smull R9, R4, R4, R1 // ..............e.......'.....................................~....... + // mul R10, R9, R2 // ................e.....'.......................................~..... + // smlal R9, R4, R10, R3 // ..................e...'.........................................~... + // smull R9, R11, R11, R1 // ..........e...........'.................................~........... + // mul R10, R9, R2 // .................e....'........................................~.... + // smlal R9, R11, R10, R3 // ...................e..'..........................................~.. + // smull R9, R12, R12, R1 // .....e................'............................~................ + // mul R10, R9, R2 // .......e..............'..............................~.............. + // smlal R9, R12, R10, R3 // .........e............'................................~............ + // smull R9, R14, R14, R1 // ......e...............'.............................~............... + // mul R10, R9, R2 // ...........e..........'..................................~.......... + // smlal R9, R14, R10, R3 // .............e........'....................................~........ + // add R5, R5, R4 // ......................'....*........................................ + // add R6, R6, R11 // ......................'.......*..................................... + // add R7, R7, R12 // ................e.....'.......................................~..... + // add R8, R8, R14 // .................e....'........................................~.... + // sub.w R4, R5, R4, lsl #1 // ......................'.....*....................................... + // sub.w R11, R6, R11, lsl #1 // ......................'........*.................................... + // sub.w R12, R7, R12, lsl #1 // ......................*............................................. + // sub.w R14, R8, R14, lsl #1 // ....................e.'...........................................~. + // vmov R1, s3 // ...................e..'..........................................~.. + // smull R9, R7, R7, R1 // ......................'*............................................ + // mul R10, R9, R2 // ......................'..*.......................................... + // smlal R9, R7, R10, R3 // ......................'....*........................................ + // smull R9, R8, R8, R1 // ....................e.'...........................................~. + // mul R10, R9, R2 // ......................*............................................. + // smlal R9, R8, R10, R3 // ......................'.*........................................... + // vmov R1, s4 // ......................'*............................................ + // smull R9, R12, R12, R1 // ......................'......*...................................... + // mul R10, R9, R2 // ......................'........*.................................... + // smlal R9, R12, R10, R3 // ......................'..........*.................................. + // smull R9, R14, R14, R1 // ......................'...*......................................... + // mul R10, R9, R2 // ......................'.....*....................................... + // smlal R9, R14, R10, R3 // ......................'.......*..................................... + // add R5, R5, R7 // ......................'...............*............................. + // add R6, R6, R8 // ......................'...........*................................. + // add R4, R4, R12 // ......................'............*................................ + // add R11, R11, R14 // ......................'.........*................................... + // sub.w R7, R5, R7, lsl #1 // ......................'...................*......................... + // sub.w R8, R6, R8, lsl #1 // ......................'..............*.............................. + // sub.w R12, R4, R12, lsl #1 // ......................'.............*............................... + // sub.w R14, R11, R14, lsl #1 // ......................'..........*.................................. + // vmov R1, s5 // ......................'............*................................ + // smull R9, R6, R6, R1 // ......................'..............*.............................. + // mul R10, R9, R2 // ......................'................*............................ + // smlal R9, R6, R10, R3 // ......................'.....................*....................... + // vmov R1, s6 // ......................'......................*...................... + // smull R9, R8, R8, R1 // ~.....................'.......................*..................... + // mul R10, R9, R2 // ..~...................'.........................*................... + // smlal R9, R8, R10, R3 // ....~.................'...........................*................. + // vmov R1, s7 // ......................'................*............................ + // smull R9, R11, R11, R1 // ......................'.................*........................... + // mul R10, R9, R2 // ......................'....................*........................ + // smlal R9, R11, R10, R3 // ......................'......................*...................... + // vmov R1, s8 // ......................'.........*................................... + // smull R9, R14, R14, R1 // ......................'...........*................................. + // mul R10, R9, R2 // ......................'.............*............................... + // smlal R9, R14, R10, R3 // ......................'...............*............................. + // add R5, R5, R6 // ......~...............'.............................*............... + // add R7, R7, R8 // .......~..............'..............................*.............. + // add R4, R4, R11 // .~....................'........................*.................... + // add R12, R12, R14 // ......................'.................*........................... + // sub.w R6, R5, R6, lsl #1 // ..........~...........'.................................*........... + // sub.w R8, R7, R8, lsl #1 // .........~............'................................*............ + // sub.w R11, R4, R11, lsl #1 // ..~...................'.........................*................... + // sub.w R14, R12, R14, lsl #1 // ......................'..................*.......................... + // str.w R6, [R0, #1*512/4] // .....................~'............................................* + // str.w R7, [R0, #2*512/4] // ........~.............'...............................*............. + // str.w R8, [R0, #3*512/4] // ...............~......'......................................*...... + // str.w R4, [R0, #4*512/4] // .~....................'........................*.................... + // str.w R11, [R0, #5*512/4] // ...~..................'..........................*.................. + // str.w R12, [R0, #6*512/4] // ......................'..................*.......................... + // str.w R14, [R0, #7*512/4] // ......................'...................*......................... + // str R5, [R0], #4 // ............~.........'...................................*......... + // vmov R10, s9 // ...........~..........'..................................*.......... + // cmp.w R0, R10 // .............~........'....................................*........ + // bne layer123_loop // .....................~'............................................* + + + // Instructions: 62 + // Expected cycles: 31 + // Expected IPC: 2.00 + // + // Cycle bound: 31.0 + // IPC bound: 2.00 + // + // Wall time: 207.02s + // User time: 207.02s + // + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + vmov r10, s4 // *.............................. + mul r9, r5, r2 // *.............................. + sub.w r12, r8, r12, lsl #1 // .*............................. + smull r1, r8, r8, r1 // .*............................. + ldr.w r7, [r0, #128] // ..*............................ + smlal r5, r11, r9, r3 // ..*............................ + add r9, r7, r14 // ...*........................... + mul r5, r1, r2 // ...*........................... + sub.w r7, r9, r14, lsl #1 // ....*.......................... + smull r12, r14, r12, r10 // ....*.......................... + add r9, r9, r11 // .....*......................... + smlal r1, r8, r5, r3 // .....*......................... + ldr.w r1, [r0] // ......*........................ + mul r5, r12, r2 // ......*........................ + sub.w r11, r9, r11, lsl #1 // .......*....................... + smull r10, r6, r6, r10 // .......*....................... + add r1, r1, r4 // ........*...................... + smlal r12, r14, r5, r3 // ........*...................... + vmov r5, s5 // .........*..................... + mul r12, r10, r2 // .........*..................... + sub.w r4, r1, r4, lsl #1 // ..........*.................... + smull r9, r5, r9, r5 // ..........*.................... + add r1, r1, r8 // ...........*................... + smlal r10, r6, r12, r3 // ...........*................... + vmov r10, s6 // ............*.................. + mul r12, r9, r2 // ............*.................. + sub.w r8, r1, r8, lsl #1 // .............*................. + smull r10, r11, r11, r10 // .............*................. + add r7, r7, r6 // ..............*................ + smlal r9, r5, r12, r3 // ..............*................ + sub.w r12, r7, r6, lsl #1 // ...............*............... + mul r6, r10, r2 // ...............*............... + add r1, r1, r5 // ................*.............. + str r1, [r0], #4 // ................*.............. // @slothy:core // @slothy:before=cmp + sub.w r5, r1, r5, lsl #1 // .................*............. + smlal r10, r11, r6, r3 // .................*............. + str.w r5, [r0, #124] // ..................*............ + vmov r5, s7 // ..................*............ + add r8, r8, r11 // ...................*........... + smull r10, r6, r7, r5 // ...................*........... + sub.w r7, r8, r11, lsl #1 // ....................*.......... + str.w r8, [r0, #252] // ....................*.......... + vmov r11, s8 // .....................*......... + mul r1, r10, r2 // .....................*......... + vmov r8, s9 // ......................*........ + smull r11, r9, r12, r11 // ......................*........ + add r12, r4, r14 // .......................*....... + smlal r10, r6, r1, r3 // .......................*....... + sub.w r10, r12, r14, lsl #1 // ........................*...... + mul r5, r11, r2 // ........................*...... + add r4, r12, r6 // .........................*..... + str.w r4, [r0, #508] // .........................*..... + sub.w r12, r4, r6, lsl #1 // ..........................*.... + smlal r11, r9, r5, r3 // ..........................*.... + cmp.w r0, r8 // ...........................*... // @slothy:id=cmp + str.w r12, [r0, #636] // ...........................*... + str.w r7, [r0, #380] // ............................*.. + add r10, r10, r9 // ............................*.. + str.w r10, [r0, #764] // .............................*. + sub.w r4, r10, r9, lsl #1 // .............................*. + str.w r4, [r0, #892] // ..............................* + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // sub.w r7, r8, r12, lsl #1 // .*............................. + // mul r12, r5, r2 // *.............................. + // vmov r9, s4 // *.............................. + // smull r1, r8, r8, r1 // .*............................. + // smlal r5, r11, r12, r3 // ..*............................ + // ldr.w r5, [r0] // ......*........................ + // mul r10, r1, r2 // ...*........................... + // smull r6, r12, r6, r9 // .......*....................... + // add r5, r5, r4 // ........*...................... + // smlal r1, r8, r10, r3 // .....*......................... + // sub.w r10, r5, r4, lsl #1 // ..........*.................... + // mul r4, r6, r2 // .........*..................... + // ldr.w r1, [r0, #128] // ..*............................ + // smull r7, r9, r7, r9 // ....*.......................... + // add r1, r1, r14 // ...*........................... + // smlal r6, r12, r4, r3 // ...........*................... + // sub.w r6, r1, r14, lsl #1 // ....*.......................... + // mul r4, r7, r2 // ......*........................ + // add r6, r6, r12 // ..............*................ + // vmov r14, s8 // .....................*......... + // sub.w r12, r6, r12, lsl #1 // ...............*............... + // smlal r7, r9, r4, r3 // ........*...................... + // add r4, r1, r11 // .....*......................... + // smull r12, r1, r12, r14 // ......................*........ + // add r14, r10, r9 // .......................*....... + // vmov r10, s5 // .........*..................... + // sub.w r7, r14, r9, lsl #1 // ........................*...... + // mul r9, r12, r2 // ........................*...... + // sub.w r11, r4, r11, lsl #1 // .......*....................... + // smull r4, r10, r4, r10 // ..........*.................... + // add r5, r5, r8 // ...........*................... + // smlal r12, r1, r9, r3 // ..........................*.... + // vmov r9, s7 // ..................*............ + // mul r12, r4, r2 // ............*.................. + // add r7, r7, r1 // ............................*.. + // smull r6, r9, r6, r9 // ...................*........... + // sub.w r1, r7, r1, lsl #1 // .............................*. + // str.w r7, [r0, #768] // .............................*. + // str.w r1, [r0, #896] // ..............................* + // sub.w r8, r5, r8, lsl #1 // .............*................. + // mul r1, r6, r2 // .....................*......... + // smlal r4, r10, r12, r3 // ..............*................ + // vmov r12, s6 // ............*.................. + // smlal r6, r9, r1, r3 // .......................*....... + // smull r7, r11, r11, r12 // .............*................. + // add r14, r14, r9 // .........................*..... + // str.w r14, [r0, #512] // .........................*..... + // sub.w r6, r14, r9, lsl #1 // ..........................*.... + // mul r14, r7, r2 // ...............*............... + // str.w r6, [r0, #640] // ...........................*... + // smlal r7, r11, r14, r3 // .................*............. + // add r1, r5, r10 // ................*.............. + // add r8, r8, r11 // ...................*........... + // str.w r8, [r0, #256] // ....................*.......... + // sub.w r8, r8, r11, lsl #1 // ....................*.......... + // sub.w r10, r1, r10, lsl #1 // .................*............. + // vmov r11, s9 // ......................*........ + // str r1, [r0], #4 // ................*.............. + // cmp.w r0, r11 // ...........................*... + // str.w r8, [r0, #380] // ............................*.. + // str.w r10, [r0, #124] // ..................*............ + // bne layer123_loop // ..............................* sub ptr_p, #32*4 @@ -660,409 +660,409 @@ layer123_loop: vmov ptr_zeta, s0 vldm ptr_zeta!, {s2-s8} vmov s0, ptr_zeta - // Instructions: 78 - // Expected cycles: 43 - // Expected IPC: 1.81 - // - // Cycle bound: 43.0 - // IPC bound: 1.81 - // - // Wall time: 21.10s - // User time: 21.10s - // - // ------------ cycle (expected) ------------> - // 0 25 - // |------------------------|----------------- - ldr.w r10, [r0, #112] // *.......................................... - vmov r1, s2 // .*......................................... - ldr.w r14, [r0, #80] // .*......................................... - ldr.w r7, [r0, #16] // ..*........................................ - smull r11, r4, r10, r1 // ..*........................................ - smull r10, r6, r14, r1 // ...*....................................... - ldr.w r12, [r0, #96] // ....*...................................... - mul r5, r11, r2 // ....*...................................... - ldr.w r8, [r0, #48] // .....*..................................... - mul r14, r10, r2 // .....*..................................... - vmov r9, s4 // ......*.................................... - smlal r11, r4, r5, r3 // ......*.................................... - ldr.w r11, [r0, #64] // .......*................................... - smull r5, r12, r12, r1 // .......*................................... - add r8, r8, r4 // ........*.................................. - smlal r10, r6, r14, r3 // ........*.................................. - sub.w r10, r8, r4, lsl #1 // .........*................................. - mul r14, r5, r2 // .........*................................. - add r4, r7, r6 // ..........*................................ - smull r7, r10, r10, r9 // ..........*................................ - sub.w r6, r4, r6, lsl #1 // ...........*............................... - smlal r5, r12, r14, r3 // ...........*............................... - ldr.w r14, [r0, #32] // ............*.............................. - mul r5, r7, r2 // ............*.............................. - add r14, r14, r12 // .............*............................. - smull r11, r1, r11, r1 // .............*............................. - sub.w r12, r14, r12, lsl #1 // ..............*............................ - smlal r7, r10, r5, r3 // ..............*............................ - ldr.w r7, [r0] // ...............*........................... - mul r5, r11, r2 // ...............*........................... - add r6, r6, r10 // ................*.......................... - smull r12, r9, r12, r9 // ................*.......................... - sub.w r10, r6, r10, lsl #1 // .................*......................... - smlal r11, r1, r5, r3 // .................*......................... - vmov r5, s8 // ..................*........................ - mul r11, r12, r2 // ..................*........................ - add r7, r7, r1 // ...................*....................... - smull r5, r10, r10, r5 // ...................*....................... - sub.w r1, r7, r1, lsl #1 // ....................*...................... - smlal r12, r9, r11, r3 // ....................*...................... - vmov r11, s7 // .....................*..................... - mul r12, r5, r2 // .....................*..................... - add r1, r1, r9 // ......................*.................... - smull r11, r6, r6, r11 // ......................*.................... - sub.w r9, r1, r9, lsl #1 // .......................*................... - smlal r5, r10, r12, r3 // .......................*................... - mul r5, r11, r2 // ........................*.................. - add r12, r9, r10 // .........................*................. - str.w r12, [r0, #96] // .........................*................. - vmov r9, s3 // ..........................*................ - smlal r11, r6, r5, r3 // ..........................*................ - sub.w r10, r12, r10, lsl #1 // ...........................*............... - smull r5, r12, r8, r9 // ...........................*............... - add r1, r1, r6 // ............................*.............. - str.w r1, [r0, #64] // ............................*.............. - sub.w r6, r1, r6, lsl #1 // .............................*............. - mul r1, r5, r2 // .............................*............. - smull r11, r8, r14, r9 // ..............................*............ - vmov r9, s6 // ...............................*........... - smlal r5, r12, r1, r3 // ...............................*........... - vmov r5, s5 // ................................*.......... - str.w r6, [r0, #80] // ................................*.......... - add r6, r4, r12 // .................................*......... - mul r4, r11, r2 // .................................*......... - sub.w r1, r6, r12, lsl #1 // ..................................*........ - smull r14, r5, r6, r5 // ..................................*........ - smull r6, r9, r1, r9 // ...................................*....... - smlal r11, r8, r4, r3 // ....................................*...... - mul r11, r6, r2 // .....................................*..... - add r12, r7, r8 // ......................................*.... - mul r1, r14, r2 // ......................................*.... - sub.w r4, r12, r8, lsl #1 // .......................................*... - smlal r6, r9, r11, r3 // .......................................*... - smlal r14, r5, r1, r3 // ........................................*.. - str.w r10, [r0, #112] // .........................................*. - add r8, r4, r9 // .........................................*. - sub.w r7, r8, r9, lsl #1 // ..........................................* - str.w r7, [r0, #48] // ..........................................* - - // ------------ cycle (expected) ------------> - // 0 25 - // |------------------------|----------------- - // vmov r4, s2 // .*......................................... - // ldr.w r1, [r0, #96] // ....*...................................... - // vmov r8, s3 // ..........................*................ - // ldr.w r7, [r0, #112] // *.......................................... - // ldr.w r5, [r0, #48] // .....*..................................... - // smull r1, r14, r1, r4 // .......*................................... - // smull r6, r10, r7, r4 // ..*........................................ - // ldr.w r12, [r0, #32] // ............*.............................. - // mul r9, r1, r2 // .........*................................. - // mul r7, r6, r2 // ....*...................................... - // ldr.w r11, [r0, #80] // .*......................................... - // smlal r1, r14, r9, r3 // ...........*............................... - // smlal r6, r10, r7, r3 // ......*.................................... - // add r12, r12, r14 // .............*............................. - // smull r1, r11, r11, r4 // ...*....................................... - // sub.w r6, r12, r14, lsl #1 // ..............*............................ - // smull r12, r9, r12, r8 // ..............................*............ - // add r7, r5, r10 // ........*.................................. - // mul r5, r1, r2 // .....*..................................... - // mul r14, r12, r2 // .................................*......... - // smlal r1, r11, r5, r3 // ........*.................................. - // sub.w r1, r7, r10, lsl #1 // .........*................................. - // smull r5, r10, r7, r8 // ...........................*............... - // smlal r12, r9, r14, r3 // ....................................*...... - // vmov r14, s4 // ......*.................................... - // mul r12, r5, r2 // .............................*............. - // smull r7, r8, r1, r14 // ..........*................................ - // ldr.w r1, [r0, #64] // .......*................................... - // smlal r5, r10, r12, r3 // ...............................*........... - // ldr.w r5, [r0, #16] // ..*........................................ - // mul r12, r7, r2 // ............*.............................. - // add r5, r5, r11 // ..........*................................ - // smull r6, r14, r6, r14 // ................*.......................... - // sub.w r11, r5, r11, lsl #1 // ...........*............................... - // smlal r7, r8, r12, r3 // ..............*............................ - // add r7, r5, r10 // .................................*......... - // mul r12, r6, r2 // ..................*........................ - // add r11, r11, r8 // ................*.......................... - // smull r4, r1, r1, r4 // .............*............................. - // sub.w r5, r7, r10, lsl #1 // ..................................*........ - // smlal r6, r14, r12, r3 // ....................*...................... - // vmov r10, s6 // ...............................*........... - // mul r6, r4, r2 // ...............*........................... - // sub.w r8, r11, r8, lsl #1 // .................*......................... - // smull r5, r10, r5, r10 // ...................................*....... - // ldr.w r12, [r0] // ...............*........................... - // smlal r4, r1, r6, r3 // .................*......................... - // vmov r6, s7 // .....................*..................... - // mul r4, r5, r2 // .....................................*..... - // add r12, r12, r1 // ...................*....................... - // smull r6, r11, r11, r6 // ......................*.................... - // sub.w r1, r12, r1, lsl #1 // ....................*...................... - // smlal r5, r10, r4, r3 // .......................................*... - // vmov r4, s8 // ..................*........................ - // mul r5, r6, r2 // ........................*.................. - // add r1, r1, r14 // ......................*.................... - // smull r4, r8, r8, r4 // ...................*....................... - // add r12, r12, r9 // ......................................*.... - // smlal r6, r11, r5, r3 // ..........................*................ - // sub.w r6, r12, r9, lsl #1 // .......................................*... - // mul r9, r4, r2 // .....................*..................... - // add r5, r1, r11 // ............................*.............. - // str.w r5, [r0, #64] // ............................*.............. - // sub.w r11, r5, r11, lsl #1 // .............................*............. - // smlal r4, r8, r9, r3 // .......................*................... - // str.w r11, [r0, #80] // ................................*.......... - // vmov r4, s5 // ................................*.......... - // sub.w r14, r1, r14, lsl #1 // .......................*................... - // smull r9, r5, r7, r4 // ..................................*........ - // add r1, r14, r8 // .........................*................. - // str.w r1, [r0, #96] // .........................*................. - // sub.w r11, r1, r8, lsl #1 // ...........................*............... - // mul r7, r9, r2 // ......................................*.... - // str.w r11, [r0, #112] // .........................................*. - // add r8, r6, r10 // .........................................*. - // sub.w r10, r8, r10, lsl #1 // ..........................................* - // smlal r9, r5, r7, r3 // ........................................*.. - // str.w r10, [r0, #48] // ..........................................* - - push {r10} - vmov r10, s10 - sub r10, r10, #4 - vmov s10, r10 - pop {r10} -layer456_loop: - // Instructions: 86 - // Expected cycles: 46 - // Expected IPC: 1.87 + // Instructions: 21 + // Expected cycles: 14 + // Expected IPC: 1.50 // - // Cycle bound: 46.0 - // IPC bound: 1.87 + // Cycle bound: 14.0 + // IPC bound: 1.50 // - // Wall time: 245.38s - // User time: 245.38s + // Wall time: 0.04s + // User time: 0.04s // - // ------------- cycle (expected) --------------> + // ----- cycle (expected) ------> // 0 25 - // |------------------------|-------------------- - add r11, r12, r5 // *............................................. - str.w r8, [r0, #32] // *............................................. - sub.w r1, r11, r5, lsl #1 // .*............................................ - str.w r1, [r0, #16] // .*............................................ - str r11, [r0], #4 // ..*........................................... // @slothy:core - vmov r4, s2 // ..e........................................... - ldr.w r1, [r0, #96] // ...e.......................................... - vmov r10, s10 // ...*.......................................... - vmov r8, s3 // ....e......................................... - ldr.w r7, [r0, #112] // ....e......................................... - ldr.w r5, [r0, #48] // .....e........................................ - smull r1, r14, r1, r4 // .....e........................................ - cmp.w r0, r10 // ......*....................................... - smull r6, r10, r7, r4 // ......e....................................... - ldr.w r12, [r0, #32] // .......e...................................... - mul r9, r1, r2 // .......e...................................... - mul r7, r6, r2 // ........e..................................... - ldr.w r11, [r0, #80] // .........e.................................... - smlal r1, r14, r9, r3 // .........e.................................... - smlal r6, r10, r7, r3 // ..........e................................... - add r12, r12, r14 // ...........e.................................. - smull r1, r11, r11, r4 // ...........e.................................. - sub.w r6, r12, r14, lsl #1 // ............e................................. - smull r12, r9, r12, r8 // ............e................................. - add r7, r5, r10 // .............e................................ - mul r5, r1, r2 // .............e................................ - mul r14, r12, r2 // ..............e............................... - smlal r1, r11, r5, r3 // ...............e.............................. - sub.w r1, r7, r10, lsl #1 // ................e............................. - smull r5, r10, r7, r8 // ................e............................. - smlal r12, r9, r14, r3 // .................e............................ - vmov r14, s4 // ..................e........................... - mul r12, r5, r2 // ..................e........................... - smull r7, r8, r1, r14 // ...................e.......................... - ldr.w r1, [r0, #64] // ....................e......................... - smlal r5, r10, r12, r3 // ....................e......................... - ldr.w r5, [r0, #16] // .....................e........................ - mul r12, r7, r2 // .....................e........................ - add r5, r5, r11 // ......................e....................... - smull r6, r14, r6, r14 // ......................e....................... - sub.w r11, r5, r11, lsl #1 // .......................e...................... - smlal r7, r8, r12, r3 // .......................e...................... - add r7, r5, r10 // ........................e..................... - mul r12, r6, r2 // ........................e..................... - add r11, r11, r8 // .........................e.................... - smull r4, r1, r1, r4 // .........................e.................... - sub.w r5, r7, r10, lsl #1 // ..........................e................... - smlal r6, r14, r12, r3 // ..........................e................... - vmov r10, s6 // ...........................e.................. - mul r6, r4, r2 // ...........................e.................. - sub.w r8, r11, r8, lsl #1 // ............................e................. - smull r5, r10, r5, r10 // ............................e................. - ldr.w r12, [r0] // .............................e................ - smlal r4, r1, r6, r3 // .............................e................ - vmov r6, s7 // ..............................e............... - mul r4, r5, r2 // ..............................e............... - add r12, r12, r1 // ...............................e.............. - smull r6, r11, r11, r6 // ...............................e.............. - sub.w r1, r12, r1, lsl #1 // ................................e............. - smlal r5, r10, r4, r3 // ................................e............. - vmov r4, s8 // .................................e............ - mul r5, r6, r2 // .................................e............ - add r1, r1, r14 // ..................................e........... - smull r4, r8, r8, r4 // ..................................e........... - add r12, r12, r9 // ...................................e.......... - smlal r6, r11, r5, r3 // ...................................e.......... - sub.w r6, r12, r9, lsl #1 // ....................................e......... - mul r9, r4, r2 // ....................................e......... - add r5, r1, r11 // .....................................e........ - str.w r5, [r0, #64] // .....................................e........ - sub.w r11, r5, r11, lsl #1 // ......................................e....... - smlal r4, r8, r9, r3 // ......................................e....... - str.w r11, [r0, #80] // .......................................e...... - vmov r4, s5 // .......................................e...... - sub.w r14, r1, r14, lsl #1 // ........................................e..... - smull r9, r5, r7, r4 // ........................................e..... - add r1, r14, r8 // .........................................e.... - str.w r1, [r0, #96] // .........................................e.... - sub.w r11, r1, r8, lsl #1 // ..........................................e... - mul r7, r9, r2 // ..........................................e... - str.w r11, [r0, #112] // ...........................................e.. - add r8, r6, r10 // ...........................................e.. - sub.w r10, r8, r10, lsl #1 // ............................................e. - smlal r9, r5, r7, r3 // ............................................e. - str.w r10, [r0, #48] // .............................................e - bne layer456_loop // .............................................* // @slothy:branch + // |------------------------|---- + ldr.w r6, [r0, #96] // *............................. + vmov r4, s2 // *............................. + ldr.w r14, [r0, #112] // .*............................ + vmov r1, s3 // .*............................ + ldr.w r12, [r0, #32] // ..*........................... + smull r6, r10, r6, r4 // ..*........................... + ldr.w r9, [r0, #48] // ...*.......................... + smull r8, r7, r14, r4 // ...*.......................... + ldr.w r14, [r0, #80] // ....*......................... + mul r5, r6, r2 // ....*......................... + mul r11, r8, r2 // .....*........................ + smlal r6, r10, r5, r3 // ......*....................... + smlal r8, r7, r11, r3 // .......*...................... + add r6, r12, r10 // ........*..................... + add r12, r9, r7 // .........*.................... + smull r8, r9, r6, r1 // .........*.................... + sub.w r10, r6, r10, lsl #1 // ..........*................... + smull r1, r5, r12, r1 // ..........*................... + mul r6, r8, r2 // ...........*.................. + mul r11, r1, r2 // ............*................. + smlal r8, r9, r6, r3 // .............*................ - // ----------------------------------- cycle (expected) ------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|-------------- - // ldr.w R5, [R0] // ...........................e................'............................~................ - // ldr.w R6, [R0, #1*64/4] // ...................e........................'....................~........................ - // ldr.w R7, [R0, #2*64/4] // .....e......................................'......~...................................... - // ldr.w R8, [R0, #3*64/4] // ...e........................................'....~........................................ - // ldr.w R4, [R0, #4*64/4] // ..................e.........................'...................~......................... - // ldr.w R11, [R0, #5*64/4] // .......e....................................'........~.................................... - // ldr.w R12, [R0, #6*64/4] // .e..........................................'..~.......................................... - // ldr.w R14, [R0, #7*64/4] // ..e.........................................'...~......................................... - // vmov R1, s2 // e...........................................'.~........................................... - // smull R9, R4, R4, R1 // .......................e....................'........................~.................... - // mul R10, R9, R2 // .........................e..................'..........................~.................. - // smlal R9, R4, R10, R3 // ...........................e................'............................~................ - // smull R9, R11, R11, R1 // .........e..................................'..........~.................................. - // mul R10, R9, R2 // ...........e................................'............~................................ - // smlal R9, R11, R10, R3 // .............e..............................'..............~.............................. - // smull R9, R12, R12, R1 // ...e........................................'....~........................................ - // mul R10, R9, R2 // .....e......................................'......~...................................... - // smlal R9, R12, R10, R3 // .......e....................................'........~.................................... - // smull R9, R14, R14, R1 // ....e.......................................'.....~....................................... - // mul R10, R9, R2 // ......e.....................................'.......~..................................... - // smlal R9, R14, R10, R3 // ........e...................................'.........~................................... - // add R5, R5, R4 // .............................e..............'..............................~.............. - // add R6, R6, R11 // ....................e.......................'.....................~....................... - // add R7, R7, R12 // .........e..................................'..........~.................................. - // add R8, R8, R14 // ...........e................................'............~................................ - // sub.w R4, R5, R4, lsl #1 // ..............................e.............'...............................~............. - // sub.w R11, R6, R11, lsl #1 // .....................e......................'......................~...................... - // sub.w R12, R7, R12, lsl #1 // ..........e.................................'...........~................................. - // sub.w R14, R8, R14, lsl #1 // ..............e.............................'...............~............................. - // vmov R1, s3 // ..e.........................................'...~......................................... - // smull R9, R7, R7, R1 // ..........e.................................'...........~................................. - // mul R10, R9, R2 // ............e...............................'.............~............................... - // smlal R9, R7, R10, R3 // ...............e............................'................~............................ - // smull R9, R8, R8, R1 // ..............e.............................'...............~............................. - // mul R10, R9, R2 // ................e...........................'.................~........................... - // smlal R9, R8, R10, R3 // ..................e.........................'...................~......................... - // vmov R1, s4 // ................e...........................'.................~........................... - // smull R9, R12, R12, R1 // ....................e.......................'.....................~....................... - // mul R10, R9, R2 // ......................e.....................'.......................~..................... - // smlal R9, R12, R10, R3 // ........................e...................'.........................~................... - // smull R9, R14, R14, R1 // .................e..........................'..................~.......................... - // mul R10, R9, R2 // ...................e........................'....................~........................ - // smlal R9, R14, R10, R3 // .....................e......................'......................~...................... - // add R5, R5, R7 // .................................e..........'..................................~.......... - // add R6, R6, R8 // ......................e.....................'.......................~..................... - // add R4, R4, R12 // ................................e...........'.................................~........... - // add R11, R11, R14 // .......................e....................'........................~.................... - // sub.w R7, R5, R7, lsl #1 // ..................................e.........'...................................~......... - // sub.w R8, R6, R8, lsl #1 // ........................e...................'.........................~................... - // sub.w R12, R4, R12, lsl #1 // ......................................e.....'.......................................~..... - // sub.w R14, R11, R14, lsl #1 // ..........................e.................'...........................~................. - // vmov R1, s5 // .....................................e......'......................................~...... - // smull R9, R6, R6, R1 // ......................................e.....'.......................................~..... - // mul R10, R9, R2 // ........................................e...'.........................................~... - // smlal R9, R6, R10, R3 // ..........................................e.'...........................................~. - // vmov R1, s6 // .........................e..................'..........................~.................. - // smull R9, R8, R8, R1 // ..........................e.................'...........................~................. - // mul R10, R9, R2 // ............................e...............'.............................~............... - // smlal R9, R8, R10, R3 // ..............................e.............'...............................~............. - // vmov R1, s7 // ............................e...............'.............................~............... - // smull R9, R11, R11, R1 // .............................e..............'..............................~.............. - // mul R10, R9, R2 // ...............................e............'................................~............ - // smlal R9, R11, R10, R3 // .................................e..........'..................................~.......... - // vmov R1, s8 // ...............................e............'................................~............ - // smull R9, R14, R14, R1 // ................................e...........'.................................~........... - // mul R10, R9, R2 // ..................................e.........'...................................~......... - // smlal R9, R14, R10, R3 // ....................................e.......'.....................................~....... - // add R5, R5, R6 // ............................................*............................................. - // add R7, R7, R8 // .........................................e..'..........................................~.. - // add R4, R4, R11 // ...................................e........'....................................~........ - // add R12, R12, R14 // .......................................e....'........................................~.... - // sub.w R6, R5, R6, lsl #1 // ............................................'*............................................ - // sub.w R8, R7, R8, lsl #1 // ..........................................e.'...........................................~. - // sub.w R11, R4, R11, lsl #1 // ....................................e.......'.....................................~....... - // sub.w R14, R12, R14, lsl #1 // ........................................e...'.........................................~... - // str.w R6, [R0, #1*64/4] // ............................................'*............................................ - // str.w R7, [R0, #2*64/4] // ............................................*............................................. - // str.w R8, [R0, #3*64/4] // ...........................................e'............................................. - // str.w R4, [R0, #4*64/4] // ...................................e........'....................................~........ - // str.w R11, [R0, #5*64/4] // .....................................e......'......................................~...... - // str.w R12, [R0, #6*64/4] // .......................................e....'........................................~.... - // str.w R14, [R0, #7*64/4] // .........................................e..'..........................................~.. - // str R5, [R0], #4 // ~...........................................'.*........................................... - // vmov R10, s10 // .~..........................................'..*.......................................... - // cmp.w R0, R10 // ....~.......................................'.....*....................................... - // bne layer456_loop // ...........................................~'............................................* + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r6, [r0, #96] // *.............................. + // vmov r4, s2 // *.............................. + // smull r14, r6, r6, r4 // ..*............................ + // ldr.w r11, [r0, #112] // .*............................. + // ldr.w r9, [r0, #32] // ..*............................ + // mul r10, r14, r2 // ....*.......................... + // smull r1, r7, r11, r4 // ...*........................... + // smlal r14, r6, r10, r3 // ......*........................ + // mul r14, r1, r2 // .....*......................... + // add r10, r9, r6 // ........*...................... + // vmov r5, s3 // .*............................. + // smlal r1, r7, r14, r3 // .......*....................... + // ldr.w r14, [r0, #48] // ...*........................... + // smull r11, r9, r10, r5 // .........*..................... + // sub.w r10, r10, r6, lsl #1 // ..........*.................... + // add r12, r14, r7 // .........*..................... + // mul r6, r11, r2 // ...........*................... + // smull r1, r5, r12, r5 // ..........*.................... + // smlal r11, r9, r6, r3 // .............*................. + // ldr.w r14, [r0, #80] // ....*.......................... + // mul r11, r1, r2 // ............*.................. + push {r14} + vmov r14, s10 + sub r14, r14, #4 + vmov s10, r14 + pop {r14} +layer456_loop: + // Instructions: 86 + // Expected cycles: 47 + // Expected IPC: 1.83 + // + // Cycle bound: 44.0 + // IPC bound: 1.95 + // + // Wall time: 300.30s + // User time: 300.30s + // + // -------------- cycle (expected) --------------> + // 0 25 + // |------------------------|--------------------- + smull r6, r14, r14, r4 // *.............................................. + vmov r8, s4 // .*............................................. + smlal r1, r5, r11, r3 // .*............................................. + mul r1, r6, r2 // ..*............................................ + smull r11, r10, r10, r8 // ...*........................................... + sub.w r7, r12, r7, lsl #1 // ....*.......................................... + smlal r6, r14, r1, r3 // ....*.......................................... + smull r7, r6, r7, r8 // .....*......................................... + ldr.w r8, [r0, #64] // ......*........................................ + mul r1, r11, r2 // ......*........................................ + mul r12, r7, r2 // .......*....................................... + smlal r11, r10, r1, r3 // ........*...................................... + ldr.w r11, [r0, #16] // .........*..................................... + smlal r7, r6, r12, r3 // .........*..................................... + add r12, r11, r14 // ..........*.................................... + smull r11, r4, r8, r4 // ..........*.................................... + vmov r7, s7 // ...........*................................... + sub.w r14, r12, r14, lsl #1 // ...........*................................... + add r1, r14, r6 // ............*.................................. + mul r8, r11, r2 // ............*.................................. + add r12, r12, r5 // .............*................................. + smull r14, r7, r1, r7 // .............*................................. + sub.w r5, r12, r5, lsl #1 // ..............*................................ + smlal r11, r4, r8, r3 // ..............*................................ + vmov r8, s6 // ...............*............................... + mul r11, r14, r2 // ...............*............................... + sub.w r6, r1, r6, lsl #1 // ................*.............................. + smull r5, r8, r5, r8 // ................*.............................. + ldr.w r1, [r0] // .................*............................. + smlal r14, r7, r11, r3 // .................*............................. + vmov r14, s8 // ..................*............................ + mul r11, r5, r2 // ..................*............................ + add r1, r1, r4 // ...................*........................... + sub.w r4, r1, r4, lsl #1 // ....................*.......................... + smlal r5, r8, r11, r3 // ....................*.......................... + add r1, r1, r9 // .....................*......................... + add r4, r4, r10 // .....................*......................... + add r11, r4, r7 // ......................*........................ + str.w r11, [r0, #64] // ......................*........................ + sub.w r10, r4, r10, lsl #1 // .......................*....................... + smull r14, r5, r6, r14 // .......................*....................... + sub.w r7, r11, r7, lsl #1 // ........................*...................... + str.w r7, [r0, #80] // ........................*...................... + sub.w r9, r1, r9, lsl #1 // .........................*..................... + mul r11, r14, r2 // .........................*..................... + add r9, r9, r8 // ..........................*.................... + str.w r9, [r0, #32] // ..........................*.................... + sub.w r9, r9, r8, lsl #1 // ...........................*................... + smlal r14, r5, r11, r3 // ...........................*................... + str.w r9, [r0, #48] // ............................*.................. + vmov r11, s5 // ............................*.................. + add r7, r10, r5 // .............................*................. + smull r10, r8, r12, r11 // .............................*................. + ldr.w r6, [r0, #100] // ..............................e................ + str.w r7, [r0, #96] // ..............................*................ + vmov r4, s2 // ...............................e............... + mul r9, r10, r2 // ...............................*............... + sub.w r12, r7, r5, lsl #1 // ................................*.............. + smull r14, r6, r6, r4 // ................................e.............. + ldr.w r11, [r0, #116] // .................................e............. + smlal r10, r8, r9, r3 // .................................*............. + ldr.w r9, [r0, #36] // ..................................e............ + mul r10, r14, r2 // ..................................e............ + add r5, r1, r8 // ...................................*........... + smull r1, r7, r11, r4 // ...................................e........... + smlal r14, r6, r10, r3 // ....................................e.......... + sub.w r8, r5, r8, lsl #1 // .....................................*......... + mul r14, r1, r2 // .....................................e......... + str r5, [r0], #4 // ......................................*........ // @slothy:core // @slothy:before=cmp + add r10, r9, r6 // ......................................e........ + vmov r5, s3 // .......................................e....... + smlal r1, r7, r14, r3 // .......................................e....... + ldr.w r14, [r0, #48] // ........................................e...... + smull r11, r9, r10, r5 // ........................................e...... + str.w r12, [r0, #108] // .........................................*..... + sub.w r10, r10, r6, lsl #1 // .........................................e..... + add r12, r14, r7 // ..........................................e.... + mul r6, r11, r2 // ..........................................e.... + vmov r14, s10 // ...........................................*... + smull r1, r5, r12, r5 // ...........................................e... + cmp.w r0, r14 // ............................................*.. // @slothy:id=cmp + smlal r11, r9, r6, r3 // ............................................e.. + ldr.w r14, [r0, #80] // .............................................e. + mul r11, r1, r2 // .............................................e. + str.w r8, [r0, #12] // ..............................................* + bne layer456_loop // ..............................................* // @slothy:branch - // Instructions: 8 - // Expected cycles: 6 - // Expected IPC: 1.33 - // - // Cycle bound: 6.0 - // IPC bound: 1.33 - // - // Wall time: 0.02s - // User time: 0.02s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - add r1, r12, r5 // *............................. - str.w r8, [r0, #32] // *............................. - sub.w r11, r1, r5, lsl #1 // .*............................ - str.w r11, [r0, #16] // .*............................ - str r1, [r0], #4 // ..*........................... // @slothy:core - vmov r1, s10 // ..*........................... - cmp.w r0, r1 // ...*.......................... + // ---------------------- cycle (expected) -----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + // ldr.w R5, [R0] // .................'................*............................. + // ldr.w R6, [R0, #1*64/4] // .................'........*..................................... + // ldr.w R7, [R0, #2*64/4] // ....e............'.................................~............ + // ldr.w R8, [R0, #3*64/4] // ..........e......'.......................................~...... + // ldr.w R4, [R0, #4*64/4] // .................'.....*........................................ + // ldr.w R11, [R0, #5*64/4] // ...............e.'............................................~. + // ldr.w R12, [R0, #6*64/4] // e................'.............................~................ + // ldr.w R14, [R0, #7*64/4] // ...e.............'................................~............. + // vmov R1, s2 // .e...............'..............................~............... + // smull R9, R4, R4, R1 // .................'.........*.................................... + // mul R10, R9, R2 // .................'...........*.................................. + // smlal R9, R4, R10, R3 // .................'.............*................................ + // smull R9, R11, R11, R1 // .................*.............................................. + // mul R10, R9, R2 // .................'.*............................................ + // smlal R9, R11, R10, R3 // .................'...*.......................................... + // smull R9, R12, R12, R1 // ..e..............'...............................~.............. + // mul R10, R9, R2 // ....e............'.................................~............ + // smlal R9, R12, R10, R3 // ......e..........'...................................~.......... + // smull R9, R14, R14, R1 // .....e...........'..................................~........... + // mul R10, R9, R2 // .......e.........'....................................~......... + // smlal R9, R14, R10, R3 // .........e.......'......................................~....... + // add R5, R5, R4 // .................'..................*........................... + // add R6, R6, R11 // .................'.........*.................................... + // add R7, R7, R12 // ........e........'.....................................~........ + // add R8, R8, R14 // ............e....'.........................................~.... + // sub.w R4, R5, R4, lsl #1 // .................'...................*.......................... + // sub.w R11, R6, R11, lsl #1 // .................'..........*................................... + // sub.w R12, R7, R12, lsl #1 // ...........e.....'........................................~..... + // sub.w R14, R8, R14, lsl #1 // .................'...*.......................................... + // vmov R1, s3 // .........e.......'......................................~....... + // smull R9, R7, R7, R1 // ..........e......'.......................................~...... + // mul R10, R9, R2 // ............e....'.........................................~.... + // smlal R9, R7, R10, R3 // ..............e..'...........................................~.. + // smull R9, R8, R8, R1 // .............e...'..........................................~... + // mul R10, R9, R2 // ...............e.'............................................~. + // smlal R9, R8, R10, R3 // .................'*............................................. + // vmov R1, s4 // .................'*............................................. + // smull R9, R12, R12, R1 // .................'..*........................................... + // mul R10, R9, R2 // .................'.....*........................................ + // smlal R9, R12, R10, R3 // .................'.......*...................................... + // smull R9, R14, R14, R1 // .................'....*......................................... + // mul R10, R9, R2 // .................'......*....................................... + // smlal R9, R14, R10, R3 // .................'........*..................................... + // add R5, R5, R7 // .................'....................*......................... + // add R6, R6, R8 // .................'............*................................. + // add R4, R4, R12 // .................'....................*......................... + // add R11, R11, R14 // .................'...........*.................................. + // sub.w R7, R5, R7, lsl #1 // .................'........................*..................... + // sub.w R8, R6, R8, lsl #1 // .................'.............*................................ + // sub.w R12, R4, R12, lsl #1 // .................'......................*....................... + // sub.w R14, R11, R14, lsl #1 // .................'...............*.............................. + // vmov R1, s5 // .................'...........................*.................. + // smull R9, R6, R6, R1 // .................'............................*................. + // mul R10, R9, R2 // .~...............'..............................*............... + // smlal R9, R6, R10, R3 // ...~.............'................................*............. + // vmov R1, s6 // .................'..............*............................... + // smull R9, R8, R8, R1 // .................'...............*.............................. + // mul R10, R9, R2 // .................'.................*............................ + // smlal R9, R8, R10, R3 // .................'...................*.......................... + // vmov R1, s7 // .................'..........*................................... + // smull R9, R11, R11, R1 // .................'............*................................. + // mul R10, R9, R2 // .................'..............*............................... + // smlal R9, R11, R10, R3 // .................'................*............................. + // vmov R1, s8 // .................'.................*............................ + // smull R9, R14, R14, R1 // .................'......................*....................... + // mul R10, R9, R2 // .................'........................*..................... + // smlal R9, R14, R10, R3 // .................'..........................*................... + // add R5, R5, R6 // .....~...........'..................................*........... + // add R7, R7, R8 // .................'.........................*.................... + // add R4, R4, R11 // .................'.....................*........................ + // add R12, R12, R14 // .................'............................*................. + // sub.w R6, R5, R6, lsl #1 // .......~.........'....................................*......... + // sub.w R8, R7, R8, lsl #1 // .................'..........................*................... + // sub.w R11, R4, R11, lsl #1 // .................'.......................*...................... + // sub.w R14, R12, R14, lsl #1 // ..~..............'...............................*.............. + // str.w R6, [R0, #1*64/4] // ................~'.............................................* + // str.w R7, [R0, #2*64/4] // .................'.........................*.................... + // str.w R8, [R0, #3*64/4] // .................'...........................*.................. + // str.w R4, [R0, #4*64/4] // .................'.....................*........................ + // str.w R11, [R0, #5*64/4] // .................'.......................*...................... + // str.w R12, [R0, #6*64/4] // ~................'.............................*................ + // str.w R14, [R0, #7*64/4] // ...........~.....'........................................*..... + // str R5, [R0], #4 // ........~........'.....................................*........ + // vmov R10, s10 // .............~...'..........................................*... + // cmp.w R0, R10 // ..............~..'...........................................*.. + // bne layer456_loop // ................~'.............................................* - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // add r11, r12, r5 // *.............................. - // str.w r8, [r0, #32] // *.............................. - // sub.w r1, r11, r5, lsl #1 // .*............................. - // str.w r1, [r0, #16] // .*............................. - // str r11, [r0], #4 // ..*............................ - // vmov r10, s10 // ..*............................ - // cmp.w r0, r10 // ...*........................... - // bne layer456_loop // .....*......................... + + // Instructions: 65 + // Expected cycles: 34 + // Expected IPC: 1.91 + // + // Cycle bound: 33.0 + // IPC bound: 1.97 + // + // Wall time: 300.06s + // User time: 300.06s + // + // ------- cycle (expected) --------> + // 0 25 + // |------------------------|-------- + sub.w r12, r12, r7, lsl #1 // *................................. + smull r14, r7, r14, r4 // *................................. + smlal r1, r5, r11, r3 // .*................................ + vmov r11, s4 // ..*............................... + mul r8, r14, r2 // ..*............................... + ldr.w r1, [r0, #64] // ...*.............................. + smull r6, r10, r10, r11 // ...*.............................. + smlal r14, r7, r8, r3 // ....*............................. + ldr.w r14, [r0, #16] // .....*............................ + mul r8, r6, r2 // .....*............................ + add r14, r14, r7 // ......*........................... + smull r4, r1, r1, r4 // ......*........................... + sub.w r7, r14, r7, lsl #1 // .......*.......................... + smlal r6, r10, r8, r3 // .......*.......................... + add r6, r14, r5 // ........*......................... + mul r8, r4, r2 // ........*......................... + sub.w r5, r6, r5, lsl #1 // .........*........................ + smull r14, r12, r12, r11 // .........*........................ + ldr.w r11, [r0] // ..........*....................... + smlal r4, r1, r8, r3 // ..........*....................... + vmov r8, s6 // ...........*...................... + mul r4, r14, r2 // ...........*...................... + add r11, r11, r1 // ............*..................... + smull r5, r8, r5, r8 // ............*..................... + sub.w r1, r11, r1, lsl #1 // .............*.................... + smlal r14, r12, r4, r3 // .............*.................... + add r14, r11, r9 // ..............*................... + mul r4, r5, r2 // ..............*................... + add r11, r7, r12 // ...............*.................. + vmov r7, s8 // ...............*.................. + sub.w r12, r11, r12, lsl #1 // ................*................. + smlal r5, r8, r4, r3 // ................*................. + sub.w r5, r14, r9, lsl #1 // .................*................ + smull r4, r9, r12, r7 // .................*................ + add r7, r5, r8 // ..................*............... + str.w r7, [r0, #32] // ..................*............... + sub.w r8, r7, r8, lsl #1 // ...................*.............. + str.w r8, [r0, #48] // ...................*.............. + vmov r8, s7 // ....................*............. + mul r12, r4, r2 // ....................*............. + add r5, r1, r10 // .....................*............ + smull r8, r1, r11, r8 // .....................*............ + sub.w r10, r5, r10, lsl #1 // ......................*........... + smlal r4, r9, r12, r3 // ......................*........... + vmov r7, s5 // .......................*.......... + mul r11, r8, r2 // .......................*.......... + add r12, r10, r9 // ........................*......... + smull r10, r4, r6, r7 // ........................*......... + sub.w r6, r12, r9, lsl #1 // .........................*........ + smlal r8, r1, r11, r3 // .........................*........ + mul r11, r10, r2 // ..........................*....... + str.w r12, [r0, #96] // ...........................*...... + add r5, r5, r1 // ...........................*...... + sub.w r1, r5, r1, lsl #1 // ............................*..... + smlal r10, r4, r11, r3 // ............................*..... + str.w r1, [r0, #80] // .............................*.... + vmov r9, s10 // .............................*.... + str.w r6, [r0, #112] // ..............................*... + add r14, r14, r4 // ..............................*... + str r14, [r0], #4 // ...............................*.. // @slothy:core // @slothy:before=cmp + cmp.w r0, r9 // ...............................*.. // @slothy:id=cmp + str.w r5, [r0, #60] // ................................*. + sub.w r11, r14, r4, lsl #1 // ................................*. + str.w r11, [r0, #12] // .................................* + + // ------- cycle (expected) --------> + // 0 25 + // |------------------------|-------- + // smull r6, r14, r14, r4 // *................................. + // vmov r8, s4 // ..*............................... + // smlal r1, r5, r11, r3 // .*................................ + // mul r1, r6, r2 // ..*............................... + // smull r11, r10, r10, r8 // ...*.............................. + // sub.w r7, r12, r7, lsl #1 // *................................. + // smlal r6, r14, r1, r3 // ....*............................. + // smull r7, r6, r7, r8 // .........*........................ + // ldr.w r8, [r0, #64] // ...*.............................. + // mul r1, r11, r2 // .....*............................ + // mul r12, r7, r2 // ...........*...................... + // smlal r11, r10, r1, r3 // .......*.......................... + // ldr.w r11, [r0, #16] // .....*............................ + // smlal r7, r6, r12, r3 // .............*.................... + // add r12, r11, r14 // ......*........................... + // smull r11, r4, r8, r4 // ......*........................... + // vmov r7, s7 // ....................*............. + // sub.w r14, r12, r14, lsl #1 // .......*.......................... + // add r1, r14, r6 // ...............*.................. + // mul r8, r11, r2 // ........*......................... + // add r12, r12, r5 // ........*......................... + // smull r14, r7, r1, r7 // .....................*............ + // sub.w r5, r12, r5, lsl #1 // .........*........................ + // smlal r11, r4, r8, r3 // ..........*....................... + // vmov r8, s6 // ...........*...................... + // mul r11, r14, r2 // .......................*.......... + // sub.w r6, r1, r6, lsl #1 // ................*................. + // smull r5, r8, r5, r8 // ............*..................... + // ldr.w r1, [r0] // ..........*....................... + // smlal r14, r7, r11, r3 // .........................*........ + // vmov r14, s8 // ...............*.................. + // mul r11, r5, r2 // ..............*................... + // add r1, r1, r4 // ............*..................... + // sub.w r4, r1, r4, lsl #1 // .............*.................... + // smlal r5, r8, r11, r3 // ................*................. + // add r1, r1, r9 // ..............*................... + // add r4, r4, r10 // .....................*............ + // add r11, r4, r7 // ...........................*...... + // str.w r11, [r0, #64] // ................................*. + // sub.w r10, r4, r10, lsl #1 // ......................*........... + // smull r14, r5, r6, r14 // .................*................ + // sub.w r7, r11, r7, lsl #1 // ............................*..... + // str.w r7, [r0, #80] // .............................*.... + // sub.w r9, r1, r9, lsl #1 // .................*................ + // mul r11, r14, r2 // ....................*............. + // add r9, r9, r8 // ..................*............... + // str.w r9, [r0, #32] // ..................*............... + // sub.w r9, r9, r8, lsl #1 // ...................*.............. + // smlal r14, r5, r11, r3 // ......................*........... + // str.w r9, [r0, #48] // ...................*.............. + // vmov r11, s5 // .......................*.......... + // add r7, r10, r5 // ........................*......... + // smull r10, r8, r12, r11 // ........................*......... + // str.w r7, [r0, #96] // ...........................*...... + // mul r9, r10, r2 // ..........................*....... + // sub.w r12, r7, r5, lsl #1 // .........................*........ + // smlal r10, r8, r9, r3 // ............................*..... + // add r5, r1, r8 // ..............................*... + // sub.w r8, r5, r8, lsl #1 // ................................*. + // str r5, [r0], #4 // ...............................*.. + // str.w r12, [r0, #108] // ..............................*... + // vmov r14, s10 // .............................*.... + // cmp.w r0, r14 // ...............................*.. + // str.w r8, [r0, #12] // .................................* + // bne layer456_loop // .................................* add.w ptr_p, #112 @@ -1075,193 +1075,193 @@ layer456_loop: // stage 7 and 8 add cntr, ptr_p, #1024 // 64 iterations - // Instructions: 12 - // Expected cycles: 10 - // Expected IPC: 1.20 - // - // Cycle bound: 10.0 - // IPC bound: 1.20 - // - // Wall time: 0.03s - // User time: 0.03s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r8, [r1, #8] // *............................. - ldr.w r5, [r1, #4] // *............................. - ldr r11, [r1], #12 // .*............................ - ldr.w r14, [r0, #8] // .*............................ - ldr.w r6, [r0, #12] // ..*........................... - ldr.w r12, [r0] // ..*........................... - ldr.w r10, [r0, #4] // ...*.......................... - smull r9, r6, r6, r11 // ....*......................... - smull r7, r11, r14, r11 // .....*........................ - mul r14, r9, r2 // ......*....................... - smlal r9, r6, r14, r3 // ........*..................... - mul r9, r7, r2 // .........*.................... + // Instructions: 5 + // Expected cycles: 6 + // Expected IPC: 0.83 + // + // Cycle bound: 6.0 + // IPC bound: 0.83 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r14, [r0, #12] // *............................. + ldr.w r8, [r0, #8] // *............................. + ldr r5, [r1], #12 // .*............................ + smull r14, r7, r14, r5 // ...*.......................... + mul r9, r14, r2 // .....*........................ - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w r8, [r1, #8] // *.............................. - // ldr.w r5, [r1, #4] // *.............................. - // ldr r7, [r1], #12 // .*............................. - // ldr.w r6, [r0, #12] // ..*............................ - // ldr.w r14, [r0, #8] // .*............................. - // smull r10, r6, r6, r7 // ....*.......................... - // smull r7, r11, r14, r7 // .....*......................... - // mul r14, r10, r2 // ......*........................ - // ldr.w r12, [r0] // ..*............................ - // mul r9, r7, r2 // .........*..................... - // smlal r10, r6, r14, r3 // ........*...................... - // ldr.w r10, [r0, #4] // ...*........................... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r7, [r0, #12] // *.............................. + // ldr r5, [r1], #12 // .*............................. + // ldr.w r8, [r0, #8] // *.............................. + // smull r14, r7, r7, r5 // ...*........................... + // mul r9, r14, r2 // .....*......................... sub r4, r4, #16 layer78_loop: - // Instructions: 33 - // Expected cycles: 21 - // Expected IPC: 1.57 - // - // Cycle bound: 21.0 - // IPC bound: 1.57 - // - // Wall time: 6.03s - // User time: 6.03s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - add r10, r10, r6 // *............................. - smlal r7, r11, r9, r3 // *............................. - sub.w r14, r10, r6, lsl #1 // .*............................ - smull r7, r6, r10, r5 // .*............................ - smull r5, r14, r14, r8 // ..*........................... - ldr.w r8, [r1, #8] // ...e.......................... - mul r9, r7, r2 // ...*.......................... - mul r10, r5, r2 // ....*......................... - add r12, r12, r11 // .....*........................ - smlal r7, r6, r9, r3 // .....*........................ - sub.w r7, r12, r11, lsl #1 // ......*....................... - smlal r5, r14, r10, r3 // ......*....................... - add r11, r12, r6 // .......*...................... - ldr.w r5, [r1, #4] // .......e...................... - add r12, r7, r14 // ........*..................... - str.w r12, [r0, #8] // ........*..................... - sub.w r7, r12, r14, lsl #1 // .........*.................... - str.w r7, [r0, #12] // .........*.................... - sub.w r12, r11, r6, lsl #1 // ..........*................... - str.w r12, [r0, #4] // ..........*................... - str r11, [r0], #16 // ...........*.................. // @slothy:core - ldr r7, [r1], #12 // ............e................. - cmp.w r0, r4 // ............*................. - ldr.w r6, [r0, #12] // .............e................ - ldr.w r14, [r0, #8] // ..............e............... - smull r10, r6, r6, r7 // ...............e.............. - smull r7, r11, r14, r7 // ................e............. - mul r14, r10, r2 // .................e............ - ldr.w r12, [r0] // ..................e........... - mul r9, r7, r2 // ..................e........... - smlal r10, r6, r14, r3 // ...................e.......... - ldr.w r10, [r0, #4] // ....................e......... - bne.w layer78_loop // ....................*......... // @slothy:branch - - // ---------- cycle (expected) ----------> - // 0 25 - // |------------------------|------------- - // ldr.w R12, [R1, #4] // ....e.............'......~............. - // ldr.w R14, [R1, #8] // e.................'..~................. - // ldr R11, [R1], #12 // .........e........'...........~........ - // ldr.w R5, [R0] // ...............e..'.................~.. - // ldr.w R6, [R0, #4] // .................e'.................... - // ldr.w R7, [R0, #8] // ...........e......'.............~...... - // ldr.w R8, [R0, #12] // ..........e.......'............~....... - // smull R9, R7, R7, R11 // .............e....'...............~.... - // mul R10, R9, R2 // ...............e..'.................~.. - // smlal R9, R7, R10, R3 // ..................*.................... - // smull R9, R8, R8, R11 // ............e.....'..............~..... - // mul R10, R9, R2 // ..............e...'................~... - // smlal R9, R8, R10, R3 // ................e.'..................~. - // add R5, R5, R7 // ..~...............'....*............... - // add R6, R6, R8 // ..................*.................... - // sub.w R7, R5, R7, lsl #1 // ...~..............'.....*.............. - // sub.w R8, R6, R8, lsl #1 // ..................'*................... - // smull R9, R6, R6, R12 // ..................'*................... - // mul R10, R9, R2 // ~.................'..*................. - // smlal R9, R6, R10, R3 // ..~...............'....*............... - // smull R9, R8, R8, R14 // ..................'.*.................. - // mul R10, R9, R2 // .~................'...*................ - // smlal R9, R8, R10, R3 // ...~..............'.....*.............. - // add R5, R5, R6 // ....~.............'......*............. - // add R7, R7, R8 // .....~............'.......*............ - // sub.w R6, R5, R6, lsl #1 // .......~..........'.........*.......... - // sub.w R8, R7, R8, lsl #1 // ......~...........'........*........... - // str.w R6, [R0, #4] // .......~..........'.........*.......... - // str.w R7, [R0, #8] // .....~............'.......*............ - // str.w R8, [R0, #12] // ......~...........'........*........... - // str R5, [R0], #16 // ........~.........'..........*......... - // cmp.w R0, R4 // .........~........'...........*........ - // bne.w layer78_loop // .................~'...................* - - - // Instructions: 21 - // Expected cycles: 15 - // Expected IPC: 1.40 + // Instructions: 33 + // Expected cycles: 17 + // Expected IPC: 1.94 // - // Cycle bound: 15.0 - // IPC bound: 1.40 + // Cycle bound: 17.0 + // IPC bound: 1.94 // - // Wall time: 0.08s - // User time: 0.08s + // Wall time: 5.86s + // User time: 5.86s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - add r10, r10, r6 // *............................. - smlal r7, r11, r9, r3 // *............................. - sub.w r6, r10, r6, lsl #1 // .*............................ - smull r10, r14, r10, r5 // .*............................ - add r5, r12, r11 // ..*........................... - smull r6, r8, r6, r8 // ..*........................... - sub.w r11, r5, r11, lsl #1 // ...*.......................... - mul r12, r10, r2 // ...*.......................... - mul r9, r6, r2 // ....*......................... - smlal r10, r14, r12, r3 // .....*........................ - smlal r6, r8, r9, r3 // ......*....................... - add r10, r5, r14 // .......*...................... - add r11, r11, r8 // ........*..................... - str.w r11, [r0, #8] // ........*..................... - sub.w r11, r11, r8, lsl #1 // .........*.................... - str.w r11, [r0, #12] // .........*.................... - sub.w r11, r10, r14, lsl #1 // ..........*................... - str.w r11, [r0, #4] // ..........*................... - str r10, [r0], #16 // ...........*.................. // @slothy:core - cmp.w r0, r4 // ............*................. + smlal r14, r7, r9, r3 // *............................. + ldr r10, [r1, #-8] // .*............................ + smull r11, r6, r8, r5 // .*............................ + ldr.w r8, [r0, #4] // ..*........................... + ldr.w r9, [r0] // ..*........................... + add r8, r8, r7 // ...*.......................... + mul r5, r11, r2 // ...*.......................... + ldr r12, [r1, #-4] // ....*......................... + smull r14, r10, r8, r10 // ....*......................... + sub.w r8, r8, r7, lsl #1 // .....*........................ + smlal r11, r6, r5, r3 // .....*........................ + ldr.w r7, [r0, #28] // ......e....................... + mul r5, r14, r2 // ......*....................... + add r11, r9, r6 // .......*...................... + smull r12, r9, r8, r12 // .......*...................... + sub.w r6, r11, r6, lsl #1 // ........*..................... + smlal r14, r10, r5, r3 // ........*..................... + ldr r5, [r1], #12 // .........e.................... + mul r8, r12, r2 // .........*.................... + add r11, r11, r10 // ..........*................... + str r11, [r0], #16 // ..........*................... // @slothy:core // @slothy:before=cmp + sub.w r11, r11, r10, lsl #1 // ...........*.................. + smlal r12, r9, r8, r3 // ...........*.................. + ldr.w r8, [r0, #8] // ............e................. + smull r14, r7, r7, r5 // ............e................. + add r6, r6, r9 // .............*................ + str r6, [r0, #-8] // .............*................ + str r11, [r0, #-12] // ..............*............... + cmp.w r0, r4 // ..............*............... // @slothy:id=cmp + sub.w r6, r6, r9, lsl #1 // ...............*.............. + mul r9, r14, r2 // ...............e.............. + str r6, [r0, #-4] // ................*............. + bne.w layer78_loop // ................*............. // @slothy:branch - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // add r10, r10, r6 // *.............................. - // smlal r7, r11, r9, r3 // *.............................. - // sub.w r14, r10, r6, lsl #1 // .*............................. - // smull r7, r6, r10, r5 // .*............................. - // smull r5, r14, r14, r8 // ..*............................ - // mul r9, r7, r2 // ...*........................... - // mul r10, r5, r2 // ....*.......................... - // add r12, r12, r11 // ..*............................ - // smlal r7, r6, r9, r3 // .....*......................... - // sub.w r7, r12, r11, lsl #1 // ...*........................... - // smlal r5, r14, r10, r3 // ......*........................ - // add r11, r12, r6 // .......*....................... - // add r12, r7, r14 // ........*...................... - // str.w r12, [r0, #8] // ........*...................... - // sub.w r7, r12, r14, lsl #1 // .........*..................... - // str.w r7, [r0, #12] // .........*..................... - // sub.w r12, r11, r6, lsl #1 // ..........*.................... - // str.w r12, [r0, #4] // ..........*.................... - // str r11, [r0], #16 // ...........*................... - // cmp.w r0, r4 // ............*.................. - // bne.w layer78_loop // ..............*................ + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w R12, [R1, #4] // ...........'*...............'~. + // ldr.w R14, [R1, #8] // ...........'...*............'.. + // ldr R11, [R1], #12 // ...e.......'........~.......'.. + // ldr.w R5, [R0] // ...........'.*..............'.. + // ldr.w R6, [R0, #4] // ...........'.*..............'.. + // ldr.w R7, [R0, #8] // ......e....'...........~....'.. + // ldr.w R8, [R0, #12] // e..........'.....~..........'.. + // smull R9, R7, R7, R11 // ...........'*...............'~. + // mul R10, R9, R2 // ...........'..*.............'.. + // smlal R9, R7, R10, R3 // ...........'....*...........'.. + // smull R9, R8, R8, R11 // ......e....'...........~....'.. + // mul R10, R9, R2 // .........e.'..............~.'.. + // smlal R9, R8, R10, R3 // ...........*................~.. + // add R5, R5, R7 // .~.........'......*.........'.. + // add R6, R6, R8 // ...........'..*.............'.. + // sub.w R7, R5, R7, lsl #1 // ..~........'.......*........'.. + // sub.w R8, R6, R8, lsl #1 // ...........'....*...........'.. + // smull R9, R6, R6, R12 // ...........'...*............'.. + // mul R10, R9, R2 // ~..........'.....*..........'.. + // smlal R9, R6, R10, R3 // ..~........'.......*........'.. + // smull R9, R8, R8, R14 // .~.........'......*.........'.. + // mul R10, R9, R2 // ...~.......'........*.......'.. + // smlal R9, R8, R10, R3 // .....~.....'..........*.....'.. + // add R5, R5, R6 // ....~......'.........*......'.. + // add R7, R7, R8 // .......~...'............*...'.. + // sub.w R6, R5, R6, lsl #1 // .....~.....'..........*.....'.. + // sub.w R8, R7, R8, lsl #1 // .........~.'..............*.'.. + // str.w R6, [R0, #4] // ........~..'.............*..'.. + // str.w R7, [R0, #8] // .......~...'............*...'.. + // str.w R8, [R0, #12] // ..........~'...............*'.. + // str R5, [R0], #16 // ....~......'.........*......'.. + // cmp.w R0, R4 // ........~..'.............*..'.. + // bne.w layer78_loop // ..........~'...............*'.. + + + // Instructions: 28 + // Expected cycles: 16 + // Expected IPC: 1.75 + // + // Cycle bound: 16.0 + // IPC bound: 1.75 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r6, [r1, #-8] // *............................. + smlal r14, r7, r9, r3 // *............................. + ldr.w r14, [r0, #4] // .*............................ + smull r12, r10, r8, r5 // .*............................ + add r14, r14, r7 // ..*........................... + ldr.w r9, [r0] // ..*........................... + sub.w r7, r14, r7, lsl #1 // ...*.......................... + smull r6, r14, r14, r6 // ...*.......................... + ldr r8, [r1, #-4] // ....*......................... + mul r5, r12, r2 // ....*......................... + mul r11, r6, r2 // .....*........................ + smlal r12, r10, r5, r3 // ......*....................... + smull r7, r12, r7, r8 // .......*...................... + add r9, r9, r10 // ........*..................... + smlal r6, r14, r11, r3 // ........*..................... + sub.w r6, r9, r10, lsl #1 // .........*.................... + mul r10, r7, r2 // .........*.................... + add r9, r9, r14 // ..........*................... + str r9, [r0], #16 // ..........*................... // @slothy:core // @slothy:before=cmp + cmp.w r0, r4 // ...........*.................. // @slothy:id=cmp + smlal r7, r12, r10, r3 // ...........*.................. + sub.w r14, r9, r14, lsl #1 // ............*................. + str r14, [r0, #-12] // ............*................. + add r6, r6, r12 // .............*................ + str r6, [r0, #-8] // .............*................ + sub.w r6, r6, r12, lsl #1 // ..............*............... + str r6, [r0, #-4] // ..............*............... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // smlal r14, r7, r9, r3 // *.............................. + // ldr r10, [r1, #-8] // *.............................. + // smull r11, r6, r8, r5 // .*............................. + // ldr.w r8, [r0, #4] // .*............................. + // ldr.w r9, [r0] // ..*............................ + // add r8, r8, r7 // ..*............................ + // mul r5, r11, r2 // ....*.......................... + // ldr r12, [r1, #-4] // ....*.......................... + // smull r14, r10, r8, r10 // ...*........................... + // sub.w r8, r8, r7, lsl #1 // ...*........................... + // smlal r11, r6, r5, r3 // ......*........................ + // mul r5, r14, r2 // .....*......................... + // add r11, r9, r6 // ........*...................... + // smull r12, r9, r8, r12 // .......*....................... + // sub.w r6, r11, r6, lsl #1 // .........*..................... + // smlal r14, r10, r5, r3 // ........*...................... + // mul r8, r12, r2 // .........*..................... + // add r11, r11, r10 // ..........*.................... + // str r11, [r0], #16 // ..........*.................... + // sub.w r11, r11, r10, lsl #1 // ............*.................. + // smlal r12, r9, r8, r3 // ...........*................... + // add r6, r6, r9 // .............*................. + // str r6, [r0, #-8] // .............*................. + // str r11, [r0, #-12] // ............*.................. + // cmp.w r0, r4 // ...........*................... + // sub.w r6, r6, r9, lsl #1 // ..............*................ + // str r6, [r0, #-4] // ..............*................ + // bne.w layer78_loop // ...............*............... // restore registers diff --git a/examples/opt/armv7m/pointwise_acc_montgomery_dilithium_opt_m7.s b/examples/opt/armv7m/pointwise_acc_montgomery_dilithium_opt_m7.s index 471ffff6..3febf771 100644 --- a/examples/opt/armv7m/pointwise_acc_montgomery_dilithium_opt_m7.s +++ b/examples/opt/armv7m/pointwise_acc_montgomery_dilithium_opt_m7.s @@ -36,28 +36,28 @@ pqcrystals_dilithium_asm_pointwise_acc_montgomery_opt_m7: // 85x3 = 255 coefficients movw ctr, #85 - // Instructions: 4 - // Expected cycles: 4 - // Expected IPC: 1.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r5, [r2, #8] // *............................. - ldr r8, [r2], #12 // *............................. - ldr.w r14, [r1, #8] // .*............................ - smull r5, r14, r14, r5 // ...*.......................... + // Instructions: 4 + // Expected cycles: 4 + // Expected IPC: 1.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r5, [r2], #12 // *............................. + ldr r6, [r1], #12 // .*............................ + ldr r7, [r2, #-8] // ..*........................... + smull r11, r6, r6, r5 // ...*.......................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w r10, [r2, #8] // *.............................. - // ldr.w r7, [r1, #8] // .*............................. - // ldr r8, [r2], #12 // *.............................. - // smull r5, r14, r7, r10 // ...*........................... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r5, [r2], #12 // *.............................. + // ldr r6, [r1], #12 // .*............................. + // ldr r7, [r2, #-8] // ..*............................ + // smull r11, r6, r6, r5 // ...*........................... sub r12, r12, #1 1: @@ -65,127 +65,127 @@ pqcrystals_dilithium_asm_pointwise_acc_montgomery_opt_m7: // Expected cycles: 13 // Expected IPC: 2.00 // - // Wall time: 12.43s - // User time: 12.43s + // Wall time: 9.31s + // User time: 9.31s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr.w r10, [r2, #8] // e............................. - ldr r11, [r1], #12 // *............................. - ldr r7, [r2, #-8] // .*............................ - mul r9, r5, r3 // .*............................ - ldr r6, [r1, #-8] // ..*........................... - smull r11, r8, r11, r8 // ..*........................... - subs r12, #1 // ...*.......................... - smlal r5, r14, r9, r4 // ...*.......................... - ldr.w r5, [r0, #8] // ....*......................... - mul r9, r11, r3 // ....*......................... - add.w r14, r14, r5 // .....*........................ - smull r5, r6, r6, r7 // .....*........................ - ldr.w r7, [r1, #8] // ......e....................... - smlal r11, r8, r9, r4 // ......*....................... - ldr.w r9, [r0] // .......*...................... - mul r11, r5, r3 // .......*...................... - str r14, [r0, #8] // ........*..................... - ldr.w r14, [r0, #4] // ........*..................... - add.w r9, r8, r9 // .........*.................... - smlal r5, r6, r11, r4 // .........*.................... - str r9, [r0], #12 // ..........*................... - ldr r8, [r2], #12 // ..........e................... - add.w r9, r6, r14 // ...........*.................. - smull r5, r14, r7, r10 // ...........e.................. - str r9, [r0, #-8] // ............*................. + ldr r14, [r1, #-8] // *............................. + ldr r10, [r2, #-4] // *............................. + ldr.w r8, [r0] // .*............................ + mul r5, r11, r3 // .*............................ + subs r12, #1 // ..*........................... + smull r14, r7, r14, r7 // ..*........................... + ldr r9, [r1, #-4] // ...*.......................... + smlal r11, r6, r5, r4 // ...*.......................... + ldr r5, [r2], #12 // ....e......................... + mul r11, r14, r3 // ....*......................... + add.w r8, r6, r8 // .....*........................ + smull r10, r9, r9, r10 // .....*........................ + ldr r6, [r1], #12 // ......e....................... + smlal r14, r7, r11, r4 // ......*....................... + ldr.w r14, [r0, #4] // .......*...................... + mul r11, r10, r3 // .......*...................... + add.w r7, r7, r14 // ........*..................... + str r7, [r0, #4] // ........*..................... + ldr r7, [r2, #-8] // .........e.................... + smlal r10, r9, r11, r4 // .........*.................... + ldr.w r10, [r0, #8] // ..........*................... + smull r11, r6, r6, r5 // ..........e................... + add.w r5, r9, r10 // ...........*.................. + str r8, [r0], #12 // ...........*.................. + str r5, [r0, #-4] // ............*................. bne.w 1b // ............*................. // @slothy:branch // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r6, [r1, #4] // ..~..........'.*..........'.~.. - // ldr.w r7, [r1, #8] // ......e......'.....~......'.... - // ldr r5, [r1], #12 // ~............*............~.... - // ldr.w r9, [r2, #4] // .~...........'*...........'~... - // ldr.w r10, [r2, #8] // e............~............~.... - // ldr r8, [r2], #12 // ..........e..'.........~..'.... - // smull r5, r14, r5, r8 // ..~..........'.*..........'.~.. - // mul r8, r5, r3 // ....~........'...*........'.... - // smlal r5, r14, r8, r4 // ......~......'.....*......'.... - // smull r6, r5, r6, r9 // .....~.......'....*.......'.... - // mul r9, r6, r3 // .......~.....'......*.....'.... - // smlal r6, r5, r9, r4 // .........~...'........*...'.... - // smull r7, r6, r7, r10 // ...........e.'..........~.'.... - // mul r10, r7, r3 // .~...........'*...........'~... - // smlal r7, r6, r10, r4 // ...~.........'..*.........'..~. - // ldr.w r8, [r0] // .......~.....'......*.....'.... - // ldr.w r9, [r0, #4] // ........~....'.......*....'.... - // ldr.w r10, [r0, #8] // ....~........'...*........'.... - // add.w r14, r14, r8 // .........~...'........*...'.... - // str r14, [r0], #12 // ..........~..'.........*..'.... - // add.w r5, r5, r9 // ...........~.'..........*.'.... - // str r5, [r0, #-8] // ............~'...........*'.... - // add.w r6, r6, r10 // .....~.......'....*.......'.... - // str r6, [r0, #-4] // ........~....'.......*....'.... - // subs r12, #1 // ...~.........'..*.........'..~. - // bne.w 1b // ............~'...........*'.... + // ldr.w r6, [r1, #4] // .........*............~........ + // ldr.w r7, [r1, #8] // .........'..*.........'..~..... + // ldr r5, [r1], #12 // ..e......'.....~......'.....~.. + // ldr.w r9, [r2, #4] // .....e...'........~...'........ + // ldr.w r10, [r2, #8] // .........*............~........ + // ldr r8, [r2], #12 // e........'...~........'...~.... + // smull r5, r14, r5, r8 // ......e..'.........~..'........ + // mul r8, r5, r3 // .........'*...........'~....... + // smlal r5, r14, r8, r4 // .........'..*.........'..~..... + // smull r6, r5, r6, r9 // .........'.*..........'.~...... + // mul r9, r6, r3 // ~........'...*........'...~.... + // smlal r6, r5, r9, r4 // ..~......'.....*......'.....~.. + // smull r7, r6, r7, r10 // .~.......'....*.......'....~... + // mul r10, r7, r3 // ...~.....'......*.....'......~. + // smlal r7, r6, r10, r4 // .....~...'........*...'........ + // ldr.w r8, [r0] // .........'*...........'~....... + // ldr.w r9, [r0, #4] // ...~.....'......*.....'......~. + // ldr.w r10, [r0, #8] // ......~..'.........*..'........ + // add.w r14, r14, r8 // .~.......'....*.......'....~... + // str r14, [r0], #12 // .......~.'..........*.'........ + // add.w r5, r5, r9 // ....~....'.......*....'........ + // str r5, [r0, #-8] // ....~....'.......*....'........ + // add.w r6, r6, r10 // .......~.'..........*.'........ + // str r6, [r0, #-4] // ........~'...........*'........ + // subs r12, #1 // .........'.*..........'.~...... + // bne.w 1b // ........~'...........*'........ // Instructions: 22 // Expected cycles: 12 // Expected IPC: 1.83 // - // Wall time: 0.12s - // User time: 0.12s + // Wall time: 0.06s + // User time: 0.06s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r7, [r2, #-8] // *............................. - mul r6, r5, r3 // *............................. - ldr.w r10, [r0, #8] // .*............................ - ldr r11, [r1], #12 // .*............................ - ldr.w r9, [r0] // ..*........................... - smlal r5, r14, r6, r4 // ..*........................... - ldr r5, [r1, #-8] // ...*.......................... - smull r11, r6, r11, r8 // ...*.......................... - add.w r8, r14, r10 // ....*......................... - str r8, [r0, #8] // ....*......................... - subs r12, #1 // .....*........................ - smull r8, r5, r5, r7 // .....*........................ - mul r7, r11, r3 // ......*....................... - mul r10, r8, r3 // .......*...................... - smlal r11, r6, r7, r4 // ........*..................... - ldr.w r14, [r0, #4] // .........*.................... - smlal r8, r5, r10, r4 // .........*.................... - add.w r8, r6, r9 // ..........*................... - str r8, [r0], #12 // ..........*................... - add.w r8, r5, r14 // ...........*.................. - str r8, [r0, #-8] // ...........*.................. + ldr r8, [r2, #-4] // *............................. + mul r9, r11, r3 // *............................. + ldr r5, [r1, #-8] // .*............................ + ldr r10, [r1, #-4] // ..*........................... + smlal r11, r6, r9, r4 // ..*........................... + ldr.w r11, [r0] // ...*.......................... + smull r5, r7, r5, r7 // ...*.......................... + smull r10, r9, r10, r8 // ....*......................... + add.w r8, r6, r11 // .....*........................ + mul r11, r5, r3 // .....*........................ + ldr.w r6, [r0, #8] // ......*....................... + mul r14, r10, r3 // ......*....................... + subs r12, #1 // .......*...................... + smlal r5, r7, r11, r4 // .......*...................... + ldr.w r11, [r0, #4] // ........*..................... + smlal r10, r9, r14, r4 // ........*..................... + str r8, [r0], #12 // .........*.................... + add.w r11, r7, r11 // .........*.................... + add.w r14, r9, r6 // ..........*................... + str r14, [r0, #-4] // ..........*................... + str r11, [r0, #-8] // ...........*.................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r11, [r1], #12 // .*............................. - // ldr r7, [r2, #-8] // *.............................. - // mul r9, r5, r3 // *.............................. - // ldr r6, [r1, #-8] // ...*........................... - // smull r11, r8, r11, r8 // ...*........................... - // subs r12, #1 // .....*......................... - // smlal r5, r14, r9, r4 // ..*............................ - // ldr.w r5, [r0, #8] // .*............................. - // mul r9, r11, r3 // ......*........................ - // add.w r14, r14, r5 // ....*.......................... - // smull r5, r6, r6, r7 // .....*......................... - // smlal r11, r8, r9, r4 // ........*...................... - // ldr.w r9, [r0] // ..*............................ - // mul r11, r5, r3 // .......*....................... - // str r14, [r0, #8] // ....*.......................... - // ldr.w r14, [r0, #4] // .........*..................... - // add.w r9, r8, r9 // ..........*.................... - // smlal r5, r6, r11, r4 // .........*..................... - // str r9, [r0], #12 // ..........*.................... - // add.w r9, r6, r14 // ...........*................... - // str r9, [r0, #-8] // ...........*................... - // bne.w 1b // .......*....................... + // ldr r14, [r1, #-8] // .*............................. + // ldr r10, [r2, #-4] // *.............................. + // ldr.w r8, [r0] // ...*........................... + // mul r5, r11, r3 // *.............................. + // subs r12, #1 // .......*....................... + // smull r14, r7, r14, r7 // ...*........................... + // ldr r9, [r1, #-4] // ..*............................ + // smlal r11, r6, r5, r4 // ..*............................ + // mul r11, r14, r3 // .....*......................... + // add.w r8, r6, r8 // .....*......................... + // smull r10, r9, r9, r10 // ....*.......................... + // smlal r14, r7, r11, r4 // .......*....................... + // ldr.w r14, [r0, #4] // ........*...................... + // mul r11, r10, r3 // ......*........................ + // add.w r7, r7, r14 // .........*..................... + // str r7, [r0, #4] // ...........*................... + // smlal r10, r9, r11, r4 // ........*...................... + // ldr.w r10, [r0, #8] // ......*........................ + // add.w r5, r9, r10 // ..........*.................... + // str r8, [r0], #12 // .........*..................... + // str r5, [r0, #-4] // ..........*.................... + // bne.w 1b // ...........*................... // final coefficient diff --git a/examples/opt/armv7m/pointwise_montgomery_dilithium_opt_m7.s b/examples/opt/armv7m/pointwise_montgomery_dilithium_opt_m7.s index 48cee3f3..e0b4c9c3 100644 --- a/examples/opt/armv7m/pointwise_montgomery_dilithium_opt_m7.s +++ b/examples/opt/armv7m/pointwise_montgomery_dilithium_opt_m7.s @@ -37,28 +37,28 @@ pqcrystals_dilithium_asm_pointwise_montgomery_opt_m7: // 85x3 = 255 coefficients movw ctr, #85 - // Instructions: 4 - // Expected cycles: 2 - // Expected IPC: 2.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> + // Instructions: 4 + // Expected cycles: 4 + // Expected IPC: 1.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r8, [r1], #12 // *............................. + ldr r9, [r2], #12 // .*............................ + ldr r5, [r1, #-4] // ..*........................... + ldr r7, [r2, #-4] // ...*.......................... + + // ------ cycle (expected) ------> // 0 25 - // |------------------------|---- - ldr r11, [r2], #12 // *............................. - ldr r5, [r2, #-8] // *............................. - ldr r8, [r1], #12 // .*............................ - ldr r6, [r1, #-8] // .*............................ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr r11, [r2], #12 // *.............................. - // ldr r8, [r1], #12 // .*............................. - // ldr r5, [r2, #-8] // *.............................. - // ldr r6, [r1, #-8] // .*............................. + // |------------------------|----- + // ldr r8, [r1], #12 // *.............................. + // ldr r9, [r2], #12 // .*............................. + // ldr r5, [r1, #-4] // ..*............................ + // ldr r7, [r2, #-4] // ...*........................... sub r12, r12, #1 1: @@ -66,101 +66,101 @@ pqcrystals_dilithium_asm_pointwise_montgomery_opt_m7: // Expected cycles: 12 // Expected IPC: 1.67 // - // Wall time: 0.52s - // User time: 0.52s + // Wall time: 0.39s + // User time: 0.39s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r7, [r1, #-4] // *............................. - smull r11, r8, r8, r11 // *............................. - ldr r14, [r2, #-4] // .*............................ - smull r5, r6, r6, r5 // .*............................ + smull r11, r6, r8, r9 // *............................. + smull r9, r14, r5, r7 // .*............................ + ldr r8, [r1, #-8] // ..*........................... mul r10, r11, r3 // ..*........................... - smull r7, r14, r7, r14 // ...*.......................... - mul r9, r5, r3 // ....*......................... - smlal r11, r8, r10, r4 // .....*........................ - subs r12, #1 // ......*....................... - str r8, [r0], #4 // ......*....................... - ldr r11, [r2], #12 // .......e...................... - mul r10, r7, r3 // .......*...................... - ldr r8, [r1], #12 // ........e..................... - smlal r5, r6, r9, r4 // ........*..................... - ldr r5, [r2, #-8] // .........e.................... - smlal r7, r14, r10, r4 // .........*.................... - str r6, [r0], #4 // ..........*................... - ldr r6, [r1, #-8] // ..........e................... + ldr r5, [r2, #-8] // ...*.......................... + mul r7, r9, r3 // ...*.......................... + smlal r11, r6, r10, r4 // ....*......................... + smull r11, r10, r8, r5 // .....*........................ + ldr r8, [r1], #12 // ......e....................... + smlal r9, r14, r7, r4 // ......*....................... + ldr r9, [r2], #12 // .......e...................... + mul r7, r11, r3 // .......*...................... + str r6, [r0], #4 // ........*..................... + subs r12, #1 // ........*..................... + ldr r5, [r1, #-4] // .........e.................... + smlal r11, r10, r7, r4 // .........*.................... + str r10, [r0], #4 // ..........*................... + ldr r7, [r2, #-4] // ..........e................... str r14, [r0], #4 // ...........*.................. bne.w 1b // ...........*.................. // @slothy:branch // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r6, [r1, #4] // ...e.'.........~.'.........~.'. - // ldr.w r7, [r1, #8] // .....*...........~...........~. - // ldr r5, [r1], #12 // .e...'.......~...'.......~...'. - // ldr.w r9, [r2, #4] // ..e..'........~..'........~..'. - // ldr.w r10, [r2, #8] // .....'*..........'~..........'. - // ldr r8, [r2], #12 // e....'......~....'......~....'. - // smull r5, r14, r5, r8 // .....*...........~...........~. - // mul r8, r5, r3 // .....'.*.........'.~.........'. - // smlal r5, r14, r8, r4 // .....'....*......'....~......'. - // str r14, [r0], #4 // .....'.....*.....'.....~.....'. - // smull r6, r14, r6, r9 // .....'*..........'~..........'. - // mul r9, r6, r3 // .....'...*.......'...~.......'. - // smlal r6, r14, r9, r4 // .~...'.......*...'.......~...'. - // str r14, [r0], #4 // ...~.'.........*.'.........~.'. - // smull r7, r14, r7, r10 // .....'..*........'..~........'. - // mul r10, r7, r3 // ~....'......*....'......~....'. - // smlal r7, r14, r10, r4 // ..~..'........*..'........~..'. - // str r14, [r0], #4 // ....~'..........*'..........~'. - // subs r12, #1 // .....'.....*.....'.....~.....'. - // bne.w 1b // ....~'..........*'..........~'. - - - // Instructions: 16 - // Expected cycles: 12 - // Expected IPC: 1.33 - // - // Wall time: 0.06s - // User time: 0.06s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - subs r12, #1 // *............................. - smull r7, r11, r8, r11 // *............................. - ldr r14, [r2, #-4] // .*............................ - smull r9, r10, r6, r5 // .*............................ - ldr r6, [r1, #-4] // ..*........................... - mul r8, r7, r3 // ..*........................... - mul r5, r9, r3 // ...*.......................... - smull r6, r14, r6, r14 // ....*......................... - smlal r7, r11, r8, r4 // .....*........................ - mul r8, r6, r3 // ......*....................... - str r11, [r0], #4 // .......*...................... - smlal r9, r10, r5, r4 // ........*..................... - smlal r6, r14, r8, r4 // .........*.................... - str r10, [r0], #4 // ..........*................... - str r14, [r0], #4 // ...........*.................. + // ldr.w r6, [r1, #4] // ......'.*.........'.~.......... + // ldr.w r7, [r1, #8] // ...e..'........~..'........~... + // ldr r5, [r1], #12 // e.....'.....~.....'.....~...... + // ldr.w r9, [r2, #4] // ......'..*........'..~......... + // ldr.w r10, [r2, #8] // ....e.'.........~.'.........~.. + // ldr r8, [r2], #12 // .e....'......~....'......~..... + // smull r5, r14, r5, r8 // ......*...........~............ + // mul r8, r5, r3 // ......'.*.........'.~.......... + // smlal r5, r14, r8, r4 // ......'...*.......'...~........ + // str r14, [r0], #4 // ..~...'.......*...'.......~.... + // smull r6, r14, r6, r9 // ......'....*......'....~....... + // mul r9, r6, r3 // .~....'......*....'......~..... + // smlal r6, r14, r9, r4 // ...~..'........*..'........~... + // str r14, [r0], #4 // ....~.'.........*.'.........~.. + // smull r7, r14, r7, r10 // ......'*..........'~........... + // mul r10, r7, r3 // ......'..*........'..~......... + // smlal r7, r14, r10, r4 // ~.....'.....*.....'.....~...... + // str r14, [r0], #4 // .....~'..........*'..........~. + // subs r12, #1 // ..~...'.......*...'.......~.... + // bne.w 1b // .....~'..........*'..........~. + + + // Instructions: 16 + // Expected cycles: 12 + // Expected IPC: 1.33 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r11, [r2, #-8] // *............................. + smull r6, r8, r8, r9 // *............................. + ldr r10, [r1, #-8] // .*............................ + smull r5, r14, r5, r7 // .*............................ + mul r9, r6, r3 // ..*........................... + smull r10, r7, r10, r11 // ...*.......................... + smlal r6, r8, r9, r4 // ....*......................... + mul r6, r10, r3 // .....*........................ + mul r9, r5, r3 // ......*....................... + smlal r10, r7, r6, r4 // .......*...................... + smlal r5, r14, r9, r4 // ........*..................... + str r8, [r0], #4 // .........*.................... + subs r12, #1 // .........*.................... + str r7, [r0], #4 // ..........*................... + str r14, [r0], #4 // ...........*.................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r7, [r1, #-4] // ..*............................ - // smull r11, r8, r8, r11 // *.............................. - // ldr r14, [r2, #-4] // .*............................. - // smull r5, r6, r6, r5 // .*............................. + // smull r11, r6, r8, r9 // *.............................. + // smull r9, r14, r5, r7 // .*............................. + // ldr r8, [r1, #-8] // .*............................. // mul r10, r11, r3 // ..*............................ - // smull r7, r14, r7, r14 // ....*.......................... - // mul r9, r5, r3 // ...*........................... - // smlal r11, r8, r10, r4 // .....*......................... - // subs r12, #1 // *.............................. - // str r8, [r0], #4 // .......*....................... - // mul r10, r7, r3 // ......*........................ - // smlal r5, r6, r9, r4 // ........*...................... - // smlal r7, r14, r10, r4 // .........*..................... - // str r6, [r0], #4 // ..........*.................... + // ldr r5, [r2, #-8] // *.............................. + // mul r7, r9, r3 // ......*........................ + // smlal r11, r6, r10, r4 // ....*.......................... + // smull r11, r10, r8, r5 // ...*........................... + // smlal r9, r14, r7, r4 // ........*...................... + // mul r7, r11, r3 // .....*......................... + // str r6, [r0], #4 // .........*..................... + // subs r12, #1 // .........*..................... + // smlal r11, r10, r7, r4 // .......*....................... + // str r10, [r0], #4 // ..........*.................... // str r14, [r0], #4 // ...........*................... // bne.w 1b // ...........*................... diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py index 1de96011..8ab920f5 100644 --- a/slothy/targets/arm_v7m/arch_v7m.py +++ b/slothy/targets/arm_v7m/arch_v7m.py @@ -1486,7 +1486,6 @@ def make(cls, src): obj.increment = None obj.pre_index = 0 obj.addr = obj.args_in[0] - obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra return obj def write(self): @@ -1505,7 +1504,6 @@ def make(cls, src): obj.increment = None obj.pre_index = obj.immediate obj.addr = obj.args_in[0] - obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra return obj def write(self): @@ -1528,7 +1526,6 @@ def make(cls, src): obj = Armv7mInstruction.build(cls, src) obj.increment = None obj.pre_index = obj.immediate - obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra obj.addr = obj.args_in[0] return obj @@ -1545,7 +1542,6 @@ def make(cls, src): obj = Armv7mInstruction.build(cls, src) obj.increment = None obj.pre_index = obj.immediate - obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra obj.addr = obj.args_in[0] return obj @@ -1940,6 +1936,19 @@ def core(inst,t,log=None): add_comments(inst.source_line.comments) ldr.source_line = ldr_src + # In case the address register is also contained in the + # register list, we need to overwrite the address register + # in the last ldr + ldrs_reordered = [] + for ldr, reg in zip(ldrs, regs): + if reg != ptr: + ldrs_reordered.append(ldr) + + for ldr, reg in zip(ldrs, regs): + if reg == ptr: + ldrs_reordered.append(ldr) + ldrs = ldrs_reordered + if log is not None: log(f"ldm splitting: {t.inst}; {[ldr for ldr in ldrs]}") @@ -2128,6 +2137,19 @@ def core(inst,t,log=None): add_comments(inst.source_line.comments) ldr.source_line = ldr_src + # In case the address register is also contained in the + # register list, we need to overwrite the address register + # in the last ldr + ldrs_reordered = [] + for ldr, reg in zip(ldrs, regs): + if reg != ptr: + ldrs_reordered.append(ldr) + + for ldr, reg in zip(ldrs, regs): + if reg == ptr: + ldrs_reordered.append(ldr) + ldrs = ldrs_reordered + if log is not None: log(f"ldrd splitting: {t.inst}; {[ldr for ldr in ldrs]}")