dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 5.0
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.

C Overview
C
C Syntax is AT&T operand order, written through the GMP m4 macro layer that
C config.m4 provides.  Both the STD64 and DOS64 ABIs are declared below.
C
C Interface, as established by the defines and the code in this file:
C   rp        result pointer, n limbs are written
C   up        input pointer, 2n limbs are read; the general path and the
C             n = 3 path also store intermediate limbs back into this area
C   mp_param  pointer to n limbs, each multiplied by every q limb
C   n         limb count
C   u0inv     limb multiplied into the current low limb of up to form q
C   return    rax holds the carry-out of the final n-limb addition
C
C NOTE(review): by GMP naming this is presumably Montgomery reduction with
C u0inv being the negated inverse of the low limb of the modulus.  That
C contract is not spelled out in this file; confirm against the GMP manual.
C
C Structure:
C   * The low two bits of n select one of four outer loops (otp0..otp3).
C     Each enters the same 4-way unrolled inner loop at a different point
C     (e0..e3) so that no separate remainder loop is needed.
C   * n = 1, 2 and 3 branch to dedicated code (n1, n2, n3) which writes rp
C     directly and returns the carry without calling mpn_add_n.
C   * Every outer iteration adds q0 times the n limbs at mp into up, stores
C     the top carry limb in the slot of the consumed low limb, and advances
C     up by one limb.  The q limb for the following iteration is computed
C     early into rbx.
C   * After n iterations the general path finishes with
C     mpn_add_n (rp, high half of up, saved carry limbs, n).
C   * n = 0 gets no special treatment in this code.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C Incoming parameters.  The trailing comment on each line names the DOS64
C location of the same parameter.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Working registers.
C   i       inner loop index, negative and counting up towards zero
C   j       outer loop counter, counts down from n
C   mp      mp_param advanced past its n limbs, indexed with negative n
C   q0      quotient limb of the current outer iteration
C   w0..w3  rotating product and carry limbs
C   rbx     (not a macro) second limb of the current row, then next q limb
define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')
define(`w0',          `%rbp')
define(`w1',          `%r9')
define(`w2',          `%r10')
define(`w3',          `%r11')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C DOS64 only: fetch the fifth parameter from the stack into the u0inv
C register.  FUNC_ENTRY comes from config.m4; the offset 56 presumably
C accounts for what it pushes -- confirm there.
IFDOS(`	mov	56(%rsp), %r8	')
C Six callee-saved registers are pushed here and popped again at L(ret).
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
C Point mp and up just past their first n limbs and negate n, so that
C (mp,n,8) and (up,n,8) address limb 0 of the current row.
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C Dispatch on the low two bits of the negated count.  Bit 0 is the same as
C for the original n; bit 1 is inverted for odd n, hence jz to b3 below.
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

C n = 1 mod 4.
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Outer loop prologue: products with limbs 0, 1 and 2 of mp.  The sum for
C limb 0 is only needed for its carry.  The new limb 1 is stored and is also
C turned into the next q limb, kept in rbx until the row is finished.
L(otp1):lea	1(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w1
	add	(up,n,8), w2
	adc	w3, %rbx
	adc	$0, w1
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w1, w2			C carry of the add above; mov keeps flags
	adc	$0, w3
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

C Inner loop, four limbs per pass, running while i is negative.
	ALIGNx
L(tp1):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
L(e1):	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp1)

C Wind-down: fold in the last two limbs and park the top carry limb in the
C slot of the limb that was consumed by this row.
L(ed1):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

C n = 3 mod 4.
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

L(otp3):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w1
	add	(up,n,8), w2
	adc	w3, %rbx
	adc	$0, w1
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w1, w2
	adc	$0, w3
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
L(e3):	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp3)

L(ed3):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common join for the general path.  up has been advanced by one limb per
C outer iteration, and n still holds the negated count, so the two lea
C instructions step back to the high half and then to the start of the
C original up area.  The 32-bit neg restores a positive, zero-extended count.
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C STD64: return address plus six pushes is 56 bytes, so 8 more bytes make
C rsp a multiple of 16 at the call.  DOS64 reserves 40 bytes instead, which
C presumably covers the 32-byte shadow space plus padding -- confirm against
C FUNC_ENTRY in config.m4.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared exit.  rax already holds the return value, either from mpn_add_n
C or from one of the small-n paths.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

C n = 0 mod 4.  There is no small-n shortcut on this path.
L(b0):
L(otp0):lea	(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w3
	add	(up,n,8), w0
	adc	w1, %rbx
	adc	$0, w3
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w3, w0
	adc	$0, w1
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
L(e0):	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp0)

L(ed0):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

C n = 2 mod 4.
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

L(otp2):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	mov	8(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, w3
	add	(up,n,8), w0
	adc	w1, %rbx
	adc	$0, w3
	mov	16(mp,n,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	adc	w3, w0
	adc	$0, w1
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	add	w0, -16(up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(up,i,8)
	adc	w3, w0
	adc	$0, w1
L(e2):	mov	8(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (up,i,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(mp,i,8), %rax
	mul	q0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(up,i,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(mp,i,8), %rax
	mul	q0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, i
	js	L(tp2)

L(ed2):	add	w0, I(-16(up),-16(up,i,8))
	adc	w1, w2
	adc	$0, w3
	add	w2, I(-8(up),-8(up,i,8))
	adc	$0, w3
	mov	w3, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1.  up was advanced by one limb above, so -8(up) and (up) are the two
C input limbs.  rax is zeroed with mov rather than xor because the carry
C flag from the preceding adc must survive into the final adc.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 2.  Both rows are done inline with fixed registers.  The working
C registers i, w0..w3 are free here and are used under their plain names.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)	C cleared before the carry chain below
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

C n = 3.  One row per pass through L(n3), with j as the pass counter.  The
C final three-limb addition is done inline instead of via mpn_add_n.
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)	C cleared before the carry chain below
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
EPILOGUE()
ASM_END()