dnl  AMD64 mpn_mod_1s_2p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4
C AMD K10	 4
C Intel P4	19
C Intel core2	 8
C Intel NHM	 6.5
C Intel SBR	 4.5
C Intel atom	28
C VIA nano	 8

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT

C mpn_mod_1s_2p
C
C Register use, as established by the code below.  The argument registers
C are the STD64 ones; FUNC_ENTRY is defined outside this file and presumably
C maps the DOS64 arguments onto the same registers -- confirm in the m4
C definitions.
C
C   rdi		ap, pointer to the limbs, indexed as (%rdi,%rsi,8)
C   rsi		n, the limb count
C   rdx		divisor, kept in r14 (presumably already shifted left by
C		cnt -- confirm against the caller)
C   rcx		table filled in by mpn_mod_1s_2p_cps, kept in r13:
C		  0 bi, 8 cnt, 16 B1modb, 24 B2modb, 32 B3modb
C   r10		B1modb
C   rbx		B2modb
C   rbp		B3modb
C   r8:r9	two-limb accumulator, high:low
C   r12:r11	second two-limb accumulator, high:low
C
C The result is returned in rax.

	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p)
	FUNC_ENTRY(4)
	push	%r14
	test	$1, R8(%rsi)		C low bit of n, consumed by the je to b0
	mov	%rdx, %r14		C r14 = divisor
	push	%r13
	mov	%rcx, %r13		C r13 = table pointer
	push	%r12
	push	%rbp
	push	%rbx
	mov	16(%rcx), %r10		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	je	L(b0)			C n even; mov and push kept the flags

C n is odd.  Fold the top three limbs into r8:r9:
C   r8:r9 = ap[n-3] + ap[n-2] * B1modb + ap[n-1] * B2modb
	dec	%rsi
	je	L(one)			C n was 1
	mov	-8(%rdi,%rsi,8), %rax	C ap[n-2]
	mul	%r10
	mov	%rax, %r9
	mov	%rdx, %r8
	mov	(%rdi,%rsi,8), %rax	C ap[n-1]
	add	-16(%rdi,%rsi,8), %r9	C ap[n-3]
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	jmp	L(11)

C n is even.  Start from the top two limbs.
L(b0):	mov	-8(%rdi,%rsi,8), %r8	C ap[n-1]
	mov	-16(%rdi,%rsi,8), %r9	C ap[n-2]

L(11):	sub	$4, %rsi
	jb	L(ed2)			C no limbs left below the accumulator
	lea	40(%rdi,%rsi,8), %rdi
	mov	-40(%rdi), %r11		C next limb pair, low limb
	mov	-32(%rdi), %rax		C next limb pair, high limb
	jmp	L(m0)

C Main loop.  Each half folds the current accumulator into the next lower
C pair of limbs:
C   new = low limb + high limb * B1modb + acc low * B2modb + acc high * B3modb
C The two halves alternate between r8:r9 and r12:r11.  The B3modb product is
C left pending in rdx:rax and is added at the start of the following half,
C or at L(ed1) when the loop ends.
	ALIGN(16)
L(top):	mov	-24(%rdi), %r9
	add	%rax, %r11		C pending B3modb product, low
	mov	-16(%rdi), %rax
	adc	%rdx, %r12		C pending B3modb product, high
	mul	%r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%rdx, %r8
	adc	$0, %r8
	mul	%rbx
	add	%rax, %r9
	mov	%r12, %rax
	adc	%rdx, %r8
	mul	%rbp
	sub	$2, %rsi
	jb	L(ed1)
	mov	-40(%rdi), %r11
	add	%rax, %r9		C pending B3modb product, low
	mov	-32(%rdi), %rax
	adc	%rdx, %r8		C pending B3modb product, high
L(m0):	mul	%r10
	add	%rax, %r11
	mov	%r9, %rax
	mov	%rdx, %r12
	adc	$0, %r12
	mul	%rbx
	add	%rax, %r11
	lea	-32(%rdi), %rdi		C ap -= 4; lea keeps the carry for the adc
	mov	%r8, %rax
	adc	%rdx, %r12
	mul	%rbp
	sub	$2, %rsi
	jae	L(top)

C Loop exits.  Bring the live accumulator into r8:r9 and add the product
C that is still pending in rdx:rax.
L(ed0):	mov	%r11, %r9
	mov	%r12, %r8
L(ed1):	add	%rax, %r9
	adc	%rdx, %r8

C Fold the high accumulator limb once more: rdx:r8 = r9 + r8 * B1modb
L(ed2):	mov	8(%r13), R32(%rdi)	C cnt
	mov	%r8, %rax
	mov	%r9, %r8
	mul	%r10
	add	%rax, %r8
	adc	$0, %rdx

C Final reduction of rdx:r8, with cnt in edi.
C First shift the pair left by cnt, giving r9:r8.
C NOTE(review): with cnt = 0 the shr below would shift by 0 and leave
C r9 = r8; presumably callers guarantee cnt >= 1 -- confirm.
L(1):	xor	R32(%rcx), R32(%rcx)
	mov	%r8, %r9
	sub	R32(%rdi), R32(%rcx)	C ecx = -cnt
	shr	R8(%rcx), %r9		C bits of r8 that move into the high limb
	mov	R32(%rdi), R32(%rcx)	C ecx = cnt
	sal	R8(%rcx), %rdx
	or	%rdx, %r9
	sal	R8(%rcx), %r8
C Quotient estimate from bi, then a candidate remainder and two
C conditional corrections.
	mov	%r9, %rax
	mulq	(%r13)			C high limb * bi
	mov	%rax, %rsi
	inc	%r9
	add	%r8, %rsi
	adc	%r9, %rdx		C rdx = quotient estimate
	imul	%r14, %rdx
	sub	%rdx, %r8		C candidate remainder
	lea	(%r8,%r14), %rax	C candidate + divisor
	cmp	%r8, %rsi
	cmovc	%rax, %r8		C take candidate + divisor if rsi < r8
	mov	%r8, %rax
	sub	%r14, %rax
	cmovc	%r8, %rax		C keep r8 if it was below the divisor
	mov	R32(%rdi), R32(%rcx)
	shr	R8(%rcx), %rax		C shift the remainder back down by cnt
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	FUNC_EXIT()
	ret

C n = 1: reduce the single limb, with a zero high limb.
L(one):
	mov	(%rdi), %r8
	mov	8(%rcx), R32(%rdi)	C cnt
	xor	%rdx, %rdx
	jmp	L(1)
EPILOGUE()

C mpn_mod_1s_2p_cps
C
C Fills the five-limb table read by mpn_mod_1s_2p.
C
C   rdi		table to fill, kept in rbx
C   rsi		b, the divisor; bsr leaves its result undefined for b = 0
C
C Table layout, one limb each:
C   0 bi, 8 cnt, 16 B1modb, 24 B2modb, 32 B3modb
C
C rbx, rbp and r12 are callee-saved, so the table pointer, cnt and b << cnt
C survive the call to mpn_invert_limb.

	ALIGN(16)
PROLOGUE(mpn_mod_1s_2p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx		C index of the highest set bit of b
	push	%rbx
	mov	%rdi, %rbx		C rbx = table pointer
	push	%r12
	xor	$63, R32(%rcx)		C cnt = 63 - bit index
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
C DOS64 only: 32 bytes are reserved on the stack around the call.
IFDOS(`	sub	$32, %rsp	')
C ASSERT is defined outside this file; presumably a debug-only check of the
C stack alignment before the call -- confirm in the m4 definitions.
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	mov	%r12, %r8
	mov	%rax, %r11		C keep bi; rax is overwritten by mul below
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8			C r8 = -(b << cnt)
	mov	R32(%rbp), R32(%rcx)	C ecx = cnt for the shifts below
	mov	$1, R32(%rsi)
C Both variants below shift rsi = 1 left by cnt, filling the vacated low
C bits from the top of bi in rax.
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi		C rsi *= -(b << cnt)
	mul	%rsi			C rdx:rax = bi * rsi

	add	%rsi, %rdx		C quotient estimate for the next step
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb

C Reduction step: candidate in rdx, b << cnt added back unless rax >= rdx.
	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax		C bi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

C Same reduction step once more; the result replaces b << cnt in r12.
	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 32(%rbx)		C store B3modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()