1 dnl AMD64 mpn_add_n, mpn_sub_n optimised for bobcat. 2 3 dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C AMD K8,K9 1.77 35 C AMD K10 1.76\1.82 36 C AMD bd1 1.67\2.12 37 C AMD bd2 1.62\1.82 38 C AMD bd3 39 C AMD bd4 1.55\2.2 40 C AMD zen 41 C AMD bt1 2.54 42 C AMD bt2 2 43 C Intel P4 11 44 C Intel PNR 4.76 45 C Intel NHM 5.27 46 C Intel SBR 2 47 C Intel IBR 1.94 48 C Intel HWL 1.63 49 C Intel BWL 1.51 50 C Intel SKL 1.51 51 C Intel atom 3.56 52 C Intel SLM 4 53 C VIA nano 54 55 C The loop of this code is the result of running a code generation and 56 C optimization tool suite written by David Harvey and Torbjorn Granlund. 
C INPUT PARAMETERS
C Each macro names the register the code below uses for that argument.  The
C trailing comment on each line gives the location of the same argument under
C the DOS64 (Windows) convention.
define(`rp',	`%rdi')	C rcx
define(`up',	`%rsi')	C rdx
define(`vp',	`%rdx')	C r8
define(`n',	`%rcx')	C r9
define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)

C Pick the carry-propagating instruction and the two exported names according
C to which OPERATION_ symbol is defined when this file is processed.
ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C func (rp, up, vp, n)
C
C For each of the n 8-byte limbs, in increasing address order:
C	rp[i] = up[i] ADCSBB vp[i]
C with the carry (adc) or borrow (sbb) handed from limb to limb in CF.
C Returns the final CF, 0 or 1, in %rax.
C
C Register use inside the body:
C	%rax		limb index, biased so that the displacements in the
C			loop address the right limbs; stepped by 4 per pass
C	n (%rcx)	after the shr by 2: number of passes through the loop
C	%r8-%r11	up[] limbs in flight; each is loaded ahead of the
C			ADCSBB that consumes it
C	CF		carry/borrow chain; live from the neg at entry until
C			the final adc, so only instructions that leave CF
C			alone (mov, lea, dec, jrcxz, jnz) sit between the
C			ADCSBB instructions
C
C NOTE(review): there is no n = 0 path.  With n = 0 the b00 entry jumps into
C the loop unconditionally, so n >= 1 is assumed -- confirm against the caller
C contract.
PROLOGUE(func)
	FUNC_ENTRY(4)
	xor	%r8, %r8		C plain entry point: carry-in = 0
L(ent):	test	$1, R8(n)		C func_nc joins here, carry-in in %r8
	jnz	L(bx1)			C n is odd

L(bx0):	test	$2, R8(n)
	jnz	L(b10)			C n mod 4 = 2

C n mod 4 = 0: enter the loop at lo0, with limbs 0 and 1 preloaded.
L(b00):	shr	$2, n			C n = number of loop passes
	neg	%r8			C CF = 1 iff carry-in nonzero; after shr, which also writes CF
	mov	$3, R32(%rax)
	mov	(up), %r10
	mov	8(up), %r11
	jmp	L(lo0)

C n mod 4 = 2: enter at the loop top, or go straight to the 2-limb tail.
L(b10):	shr	$2, n
	neg	%r8			C CF = carry-in
	mov	$1, R32(%rax)
	mov	(up), %r8		C %r8 reused for data; carry-in now lives in CF
	mov	8(up), %r9
	jrcxz	L(cj2)			C n was 2: no loop pass needed
	jmp	L(top)

L(bx1):	test	$2, R8(n)
	jnz	L(b11)			C n mod 4 = 3

C n mod 4 = 1: enter at lo1, or go straight to the 1-limb tail.
L(b01):	shr	$2, n
	neg	%r8			C CF = carry-in
	mov	$0, R32(%rax)		C mov, not xor: CF must survive
	mov	(up), %r9
	jrcxz	L(cj1)			C n was 1: single limb only
	mov	8(up), %r10
	jmp	L(lo1)

C n mod 4 = 3: enter at lo3.  The entry handles one limb and the tail two, so
C the pass count is (n+1)/4.
	ALIGN(8)
L(b11):	inc	n
	shr	$2, n
	neg	%r8			C CF = carry-in
	mov	$2, R32(%rax)
	mov	(up), %r11
	jmp	L(lo3)

C Main loop, four limbs per pass.  On arrival at L(top), %r8 and %r9 already
C hold up[rax-1] and up[rax].  The pass computes limbs rax-1 .. rax+2 and
C reloads %r8 and %r9 with the next two up[] limbs.  The last two limbs are
C always finished by the cj2/cj1 tail below.
	ALIGN(4)
L(top):	mov	8(up,%rax,8), %r10
	ADCSBB	-8(vp,%rax,8), %r8
	mov	%r8, -8(rp,%rax,8)
L(lo1):	mov	16(up,%rax,8), %r11
	ADCSBB	(vp,%rax,8), %r9
	lea	4(%rax), %rax		C advance index by 4 without touching CF
	mov	%r9, -32(rp,%rax,8)	C -32 compensates for the index step above
L(lo0):	ADCSBB	-24(vp,%rax,8), %r10
	mov	%r10, -24(rp,%rax,8)
L(lo3):	ADCSBB	-16(vp,%rax,8), %r11
	dec	n			C sets ZF for the jnz below, leaves CF alone
	mov	-8(up,%rax,8), %r8
	mov	%r11, -16(rp,%rax,8)
L(lo2):	mov	(up,%rax,8), %r9	C no jump to lo2 is visible in this file
	jnz	L(top)			C tests ZF from the dec; the movs do not alter flags

C Tail: the final two limbs (cj2) or the final limb (cj1).
L(cj2):	ADCSBB	-8(vp,%rax,8), %r8
	mov	%r8, -8(rp,%rax,8)
L(cj1):	ADCSBB	(vp,%rax,8), %r9
	mov	%r9, (rp,%rax,8)

	mov	$0, R32(%rax)		C mov, not xor: CF still holds the result
	adc	$0, R32(%rax)		C return value = final carry/borrow

	FUNC_EXIT()
	ret
EPILOGUE()

C func_nc (rp, up, vp, n, cy)
C
C Same operation as func but with a carry-in.  It shares the body of func by
C jumping to L(ent) with the carry-in in %r8.  The body only tests whether
C %r8 is nonzero, via neg, so any nonzero cy acts as a carry-in of 1.
C In the non-DOS64 case cy is already in %r8, per the define at the top.
C NOTE(review): the DOS64 load uses offset 56 although the parameter comment
C says rsp+40.  The extra 16 bytes presumably account for what FUNC_ENTRY has
C pushed -- confirm against the FUNC_ENTRY definition.
	ALIGN(16)
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	jmp	L(ent)
EPILOGUE()