fast(left) vs slow(right)
103 removals
Lines | |
---|---|
Total | 156 |
Removed | -64.7% (101) |
Words | |
Total | 460 |
Removed | -63.9% (294) |
156 lines
43 additions
Lines | |
---|---|
Total | 98 |
Added | +43.9% (43) |
Words | |
Total | 270 |
Added | +38.5% (104) |
98 lines
.file "s352.c"
.file "s352.c"
.text
.text
.globl dummy // -- Begin function dummy
.globl dummy // -- Begin function dummy
.p2align 4
.p2align 4
.type dummy,@function
.type dummy,@function
dummy: // @dummy
dummy: // @dummy
.cfi_startproc
.cfi_startproc
// %bb.0:
// %bb.0:
mov w0, wzr
mov w0, wzr
ret
ret
.Lfunc_end0:
.Lfunc_end0:
.size dummy, .Lfunc_end0-dummy
.size dummy, .Lfunc_end0-dummy
.cfi_endproc
.cfi_endproc
// -- End function
// -- End function
.globl s352 // -- Begin function s352
.globl s352 // -- Begin function s352
.p2align 4
.p2align 4
.type s352,@function
.type s352,@function
s352: // @s352
s352: // @s352
.cfi_startproc
.cfi_startproc
// %bb.0:
// %bb.0:
stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
.cfi_def_cfa_offset 32
stp d9, d8, [sp, #16] // 16-byte Folded Spill
.cfi_offset b8, -8
.cfi_offset b9, -16
.cfi_offset b10, -24
.cfi_offset b11, -32
adrp x9, :got:b
adrp x9, :got:b
adrp x10, :got:a
adrp x10, :got:a
mov w11, #13568 // =0x3500
mov w11, #13568 // =0x3500
mov w8, wzr
mov w8, wzr
ldr x9, [x9, :got_lo12:b]
ldr x9, [x9, :got_lo12:b]
ldr x10, [x10, :got_lo12:a]
ldr x10, [x10, :got_lo12:a]
movk w11, #12, lsl #16
movk w11, #12, lsl #16
.p2align 5, , 16
.p2align 5, , 16
.LBB1_1: // =>This Loop Header: Depth=1
.LBB1_1: // =>This Loop Header: Depth=1
// Child Loop BB1_2 Depth 2
// Child Loop BB1_2 Depth 2
add x12, x9, #80
add x13, x10, #80
mov w14, #6400 // =0x1900
mov w14, #6400 // =0x1900
movi d0, #0000000000000000
movi d0, #0000000000000000
mov x12, x10
mov x13, x9
movi d1, #0000000000000000
movi d1, #0000000000000000
movi d2, #0000000000000000
movi d3, #0000000000000000
.p2align 5, , 16
.p2align 5, , 16
.LBB1_2: // Parent Loop BB1_1 Depth=1
.LBB1_2: // Parent Loop BB1_1 Depth=1
// => This Inner Loop Header: Depth=2
// => This Inner Loop Header: Depth=2
ldp q8, q4, [x13, #-80]
ldp q2, q3, [x12]
ldp q9, q10, [x12, #-80]
ldp q16, q17, [x13]
subs x14, x14, #8
subs x14, x14, #4
fmul v4.4s, v10.4s, v4.4s
fmul v3.4s, v17.4s, v3.4s
ldur q5, [x13, #-48]
ldr q4, [x12, #32]
ldur q21, [x13, #-40]
ldur q5, [x12, #40]
ldur q23, [x13, #-24]
ldur q6, [x12, #56]
ldur q6, [x13, #40]
ldr q18, [x13, #32]
ldur q7, [x13, #56]
ldur q19, [x13, #40]
ldur q11, [x12, #-48]
ldur q20, [x13, #56]
ldur q29, [x12, #-40]
ldr d7, [x12, #72]
ldur q30, [x12, #-24]
ldr d21, [x13, #72]
ldur q20, [x12, #40]
add x13, x13, #80
ldur q22, [x12, #56]
add x12, x12, #80
ldur d25, [x13, #-8]
fmul v2.4s, v16.4s, v2.4s
fmul v8.4s, v9.4s, v8.4s
fmul v4.4s, v18.4s, v4.4s
ldp q17, q18, [x13]
ldr d16, [x13, #72]
ldur d31, [x12, #-8]
ldp q26, q27, [x12]
fmul v5.4s, v11.4s, v5.4s
fmul v23.4s, v30.4s, v23.4s
fmul v21.4s, v29.4s, v21.4s
fmul v7.4s, v22.4s, v7.4s
fmul v6.4s, v20.4s, v6.4s
fmul v6.4s, v20.4s, v6.4s
fmul v25.4s, v31.4s, v25.4s
fmul v5.4s, v19.4s, v5.4s
fmul v18.4s, v27.4s, v18.4s
fmul v7.4s, v21.4s, v7.4s
fmul v17.4s, v26.4s, v17.4s
zip2 v16.4s, v2.4s, v3.4s
zip2 v9.4s, v8.4s, v4.4s
zip2 v17.4s, v3.4s, v2.4s
ldr d24, [x12, #72]
ext v18.16b, v2.16b, v4.16b, #12
ldr q19, [x13, #32]
zip2 v19.4s, v5.4s, v6.4s
ldr q28, [x12, #32]
zip2 v20.4s, v6.4s, v5.4s
zip2 v10.4s, v4.4s, v8.4s
ext v21.16b, v5.16b, v7.16b, #12
fmul v16.4s, v24.4s, v16.4s
trn2 v16.4s, v2.4s, v16.4s
ext v11.16b, v8.16b, v5.16b, #12
ext v17.16b, v17.16b, v2.16b, #4
zip2 v29.4s, v21.4s, v23.4s
mov v2.s[1], v3.s[1]
zip2 v30.4s, v23.4s, v21.4s
trn2 v19.4s, v5.4s, v19.4s
zip2 v20.4s, v6.4s, v7.4s
ext v20.16b, v20.16b, v5.16b, #4
zip2 v22.4s, v7.4s, v6.4s
mov v5.s[1], v6.s[1]
fmul v19.4s, v28.4s, v19.4s
mov v3.s[1], v4.s[1]
ext v31.16b, v21.16b, v25.16b, #12
zip2 v26.4s, v17.4s, v18.4s
zip2 v27.4s, v18.4s, v17.4s
ext v24.16b, v6.16b, v16.16b, #12
ext v28.16b, v17.16b, v19.16b, #12
add x12, x12, #160
add x13, x13, #160
trn2 v9.4s, v8.4s, v9.4s
ext v10.16b, v10.16b, v8.16b, #4
mov v8.s[1], v4.s[1]
trn2 v29.4s, v21.4s, v29.4s
ext v30.16b, v30.16b, v21.16b, #4
mov v21.s[1], v23.s[1]
trn2 v20.4s, v6.4s, v20.4s
ext v22.16b, v22.16b, v6.16b, #4
mov v6.s[1], v7.s[1]
mov v6.s[1], v7.s[1]
trn2 v26.4s, v17.4s, v26.4s
fadd v0.2s, v2.2s, v0.2s
ext v27.16b, v27.16b, v17.16b, #4
fadd v1.2s, v5.2s, v1.2s
mov v17.s[1], v18.s[1]
fadd v2.2s, v17.2s, v18.2s
mov v4.s[1], v5.s[1]
fadd v0.2s, v0.2s, v16.2s
mov v23.s[1], v25.s[1]
fadd v1.2s, v1.2s, v19.2s
mov v18.s[1], v19.s[1]
fadd v0.2s, v0.2s, v2.2s
mov v7.s[1], v16.s[1]
fadd v2.2s, v20.2s, v21.2s
fadd v0.2s, v8.2s, v0.2s
fadd v1.2s, v1.2s, v2.2s
fadd v1.2s, v21.2s, v1.2s
fadd v0.2s, v0.2s, v3.2s
fadd v3.2s, v6.2s, v3.2s
fadd v6.2s, v10.2s, v11.2s
fadd v2.2s, v17.2s, v2.2s
fadd v0.2s, v0.2s, v9.2s
fadd v1.2s, v1.2s, v29.2s
fadd v2.2s, v2.2s, v26.2s
fadd v3.2s, v3.2s, v20.2s
fadd v0.2s, v0.2s, v6.2s
fadd v6.2s, v30.2s, v31.2s
fadd v1.2s, v1.2s, v6.2s
fadd v1.2s, v1.2s, v6.2s
fadd v6.2s, v27.2s, v28.2s
fadd v0.2s, v0.2s, v4.2s
fadd v2.2s, v2.2s, v6.2s
fadd v6.2s, v22.2s, v24.2s
fadd v1.2s, v1.2s, v23.2s
fadd v3.2s, v3.2s, v6.2s
fadd v2.2s, v2.2s, v18.2s
fadd v3.2s, v3.2s, v7.2s
b.ne .LBB1_2
b.ne .LBB1_2
// %bb.3: // in Loop: Header=BB1_1 Depth=1
// %bb.3: // in Loop: Header=BB1_1 Depth=1
add w8, w8, #1
add w8, w8, #1
cmp w8, w11
cmp w8, w11
b.ne .LBB1_1
b.ne .LBB1_1
// %bb.4:
// %bb.4:
fadd v0.2s, v1.2s, v0.2s
fadd v0.2s, v1.2s, v0.2s
ldp d9, d8, [sp, #16] // 16-byte Folded Reload
fadd v0.2s, v2.2s, v0.2s
fadd v0.2s, v3.2s, v0.2s
faddp s0, v0.2s
faddp s0, v0.2s
ldp d11, d10, [sp], #32 // 16-byte Folded Reload
.cfi_def_cfa_offset 0
.cfi_restore b8
.cfi_restore b9
.cfi_restore b10
.cfi_restore b11
ret
ret
.Lfunc_end1:
.Lfunc_end1:
.size s352, .Lfunc_end1-s352
.size s352, .Lfunc_end1-s352
.cfi_endproc
.cfi_endproc
// -- End function
// -- End function
.ident "clang version 21.0.0git (https://github.com/llvm/llvm-project.git 50349d12aa2824a0418388c9de320a2e96ad2ca5)"
.ident "clang version 21.0.0git (https://github.com/llvm/llvm-project.git bb2f7596a8b963af06e9dde821dcea1252ba4892)"
.section ".note.GNU-stack","",@progbits
.section ".note.GNU-stack","",@progbits
.addrsig
.addrsig