dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bd1	 -
C AMD bd2	 -
C AMD bd3	 -
C AMD bd4	 -
C AMD zen	 ?
C AMD bt1	 -
C AMD bt2	 -
C Intel P4	 -
C Intel PNR	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel IBR	 -
C Intel HWL	 -
C Intel BWL	 ?
C Intel SKL	 ?
C Intel atom	 -
C Intel SLM	 -
C VIA nano	 -

C mpn_addmul_1 -- multiply the n-limb vector at up by the single limb v0,
C add the product into the n-limb vector at rp, and leave the carry-out limb
C in %rax.
C
C Incoming registers (System V AMD64 order): rp = %rdi, up = %rsi,
C n_param = %rdx, v0_param = %rcx.  The rcx/rdx/r8/r9 notes trailing the
C defines below are kept as found; presumably they name the Windows x64
C argument registers, but no DOS64 entry sequence is present in this file.
C
C NOTE(review): the dispatch below has no path for n = 0, so n >= 1 is
C presumably part of the caller contract -- confirm against the mpn docs.

define(`rp',      `%rdi')	dnl rcx
define(`up',      `%rsi')	dnl rdx
define(`n_param', `%rdx')	dnl r8
define(`v0_param',`%rcx')	dnl r9

C Working homes once the xchg in the prologue has run: mulx takes one factor
C implicitly from %rdx, and jrcxz tests %rcx.
define(`n',       `%rcx')	dnl
define(`v0',      `%rdx')	dnl

C Testing mechanism for running this on older AMD64 processors
ifelse(FAKE_MULXADX,1,`
  include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
',`
  define(`adox',	``adox'	$1, $2')
  define(`adcx',	``adcx'	$1, $2')
  define(`mulx',	``mulx'	$1, $2, $3')
')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
C The value loaded into r8 here is overwritten by a mulx on every path before
C r8 is read.  NOTE(review): presumably an early touch of the source operand.
	mov	(up), %r8

C rbx, r12 and r13 are callee-saved in the System V ABI and are used below.
	push	%rbx
	push	%r12
	push	%r13

C Point up just past its last limb and rp at its second-to-last limb, so that
C the negated count can serve as an index that runs upward to zero.
	lea	(up,n_param,8), up
	lea	-16(rp,n_param,8), rp
	mov	R32(n_param), R32(%rax)
C Swap rcx and rdx: the count lands in n and the multiplier in v0.
	xchg	v0_param, v0		C FIXME: is this insn fast?

	neg	n

C Dispatch on n mod 4.  adcx carries through CF and adox through OF, and both
C flags must be clear when the first add of each chain executes.  The and
C clears both for b0; the cmp leaves both clear when al is 2 or 3; for al = 1
C the cmp sets CF (borrow), which b1 clears again with its own and.
	and	$3, R8(%rax)
	jz	L(b0)
	cmp	$2, R8(%rax)
	jl	L(b1)
	jz	L(b2)

C n = 3 mod 4: start three products, step n back by one so that the offsets
C used at lo3 address limb 0, then enter the loop at lo3.  dec does not write
C CF, and it leaves OF clear for any n other than the most negative value.
L(b3):	mulx(	(up,n,8), %r11, %r10)
	mulx(	8(up,n,8), %r13, %r12)
	mulx(	16(up,n,8), %rbx, %rax)
	dec	n
	jmp	L(lo3)

C n = 0 mod 4: start three products and enter the loop at lo0, which issues
C the fourth.
L(b0):	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	mulx(	16(up,n,8), %r13, %r12)
	jmp	L(lo0)

C n = 2 mod 4: start two products.  If that was the whole operand (n + 2 is
C zero) go straight to the two-limb wind-down, else start a third product and
C enter the loop at lo2.  lea is used for the add because it leaves the flags
C untouched.
L(b2):	mulx(	(up,n,8), %r13, %r12)
	mulx(	8(up,n,8), %rbx, %rax)
	lea	2(n), n
	jrcxz	L(wd2)
L(gt2):	mulx(	(up,n,8), %r9, %r8)
	jmp	L(lo2)

C n = 1 mod 4: clear CF and OF (see the dispatch note), start one product,
C and either finish through the one-limb wind-down or enter the loop at lo1.
L(b1):	and	R8(%rax), R8(%rax)
	mulx(	(up,n,8), %rbx, %rax)
	lea	1(n), n
	jrcxz	L(wd1)
	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	jmp	L(lo1)

C Wind-down.  Every path into this code arrives with n (%rcx) equal to zero,
C so rp is addressed without an index and %rcx doubles as a zero operand for
C folding the two pending carries into the high limb of the last product.
L(end):	adcx(	%r10, %r13)
	mov	%r11, -8(rp)
L(wd2):	adox(	(rp), %r13)
	adcx(	%r12, %rbx)
	mov	%r13, (rp)
L(wd1):	adox(	8(rp), %rbx)
	adcx(	%rcx, %rax)		C rax += CF
	adox(	%rcx, %rax)		C rax += OF; rax is the carry limb left for the caller
	mov	%rbx, 8(rp)
	pop	%r13
	pop	%r12
	pop	%rbx
	ret

C Main loop, four limbs per pass.  Each product is held as a (low, high)
C register pair: (r9,r8), (r11,r10), (r13,r12), (rbx,rax).  For each limb an
C adcx adds the high half of the preceding product into the low half (CF
C chain), an adox adds the limb already at rp (OF chain), and the sum is
C stored back to rp.  jrcxz and lea do the loop control because neither one
C modifies the flags that carry the two chains across iterations.
L(top):	jrcxz	L(end)
	mulx(	(up,n,8), %r9, %r8)
	adcx(	%r10, %r13)
	mov	%r11, -8(rp,n,8)
L(lo2):	adox(	(rp,n,8), %r13)
	mulx(	8(up,n,8), %r11, %r10)
	adcx(	%r12, %rbx)
	mov	%r13, (rp,n,8)
L(lo1):	adox(	8(rp,n,8), %rbx)
	mulx(	16(up,n,8), %r13, %r12)
	adcx(	%rax, %r9)
	mov	%rbx, 8(rp,n,8)
L(lo0):	adox(	16(rp,n,8), %r9)
	mulx(	24(up,n,8), %rbx, %rax)
	adcx(	%r8, %r11)
	mov	%r9, 16(rp,n,8)
L(lo3):	adox(	24(rp,n,8), %r11)
	lea	4(n), n
	jmp	L(top)
EPILOGUE()
ASM_END()