dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3.30	 3.58
C AMD K10	 3.09
C AMD bull	 4.47	 4.72
C AMD pile	 4.66
C AMD steam
C AMD excavator
C AMD bobcat	 6.30
C AMD jaguar	 6.29
C Intel P4	17.3	17.8
C Intel core2	 5.13
C Intel NHM	 4.85
C Intel SBR	 3.83
C Intel IBR	 3.75
C Intel HWL	 3.45
C Intel BWL	 2.56
C Intel SKL	 2.53
C Intel atom	20.3
C Intel SLM	 9
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.

C Summary of what the code below does
C
C  For each of the n limbs at up, the limb is multiplied by v0 and the 128-bit
C  product is added to (addmul_1) or subtracted from (submul_1) the limbs at
C  rp, in place.  The limb carried out of the most significant position is
C  returned in %rax.
C
C  The first limb at up is loaded and multiplied before n is examined, so the
C  code assumes n >= 1.
C
C  Incoming registers (STD64):  rp = %rdi, up = %rsi, n = %rdx, v0 = %rcx.
C  The trailing comment on each define names the DOS64 incoming register.
C
C  Working registers:
C    %rax, %rdx             result of each mul, and the next u limb
C    %rbx, %r8, %r9, %r10   rotating accumulators for product limbs
C    n                      negative limb index that counts up towards zero
C
C  %rbx is saved and restored with push/pop.  Under DOS64, %rsi and %rdi are
C  saved and restored as well.
C
C  The loop is unrolled four ways.  n mod 4 selects one of the feed-in blocks
C  b0..b3, each of which distributes the first product over the accumulators
C  and enters the loop at the matching label.
C
C  Instruction order matters: the carry flag is live from each ADDSUB through
C  the adc instructions that follow it.  Registers inside the loop are
C  therefore cleared with mov, which leaves the flags alone.

define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

define(`n',       `%r11')

ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 register remapping.  rp, v0 and n stay in their incoming registers
C and up is copied to %rsi in the prologue.  The words r9 and r8 as used by
C the loop body are renamed to rdi and r11, so the accumulators do not collide
C with v0 and n.  The doubled quotes around %r9 and %r8 presumably keep those
C two definitions from being hit by the renaming when they are expanded.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx		')
	mul	v0			C rdx:rax = first u limb times v0

IFSTD(`	mov	%rbx, n		')

	and	$3, R32(%rbx)		C n mod 4 picks the feed-in block
	lea	-16(rp,n,8), rp		C rp now points at the second last limb
					C lea leaves the flags from the and intact
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jb	L(b1)
	jz	L(b2)

C n mod 4 = 3
L(b3):	mov	$0, R32(%r8)
	mov	%rax, %rbx		C low product limb
	mov	$0, R32(%r9)
	mov	8(up), %rax		C second u limb
	mov	%rdx, %r10		C high product limb
	lea	(up,n,8), up		C up now points just past the last limb
	not	n			C n = -n - 1
	jmp	L(L3)

C n mod 4 = 0
L(b0):	mov	$0, R32(%r10)
	mov	%rax, %r8		C low product limb
	mov	%rdx, %rbx		C high product limb
	mov	8(up), %rax		C second u limb
	lea	(up,n,8), up		C up now points just past the last limb
	neg	n
	jmp	L(L0)

C n mod 4 = 1
L(b1):	cmp	$1, n
	jz	L(n1)			C n = 1, the product in rdx:rax is all there is
	mov	%rax, %r9		C low product limb
	mov	8(up), %rax		C second u limb
	mov	%rdx, %r8		C high product limb
	mov	$0, R32(%rbx)
	lea	(up,n,8), up		C up now points just past the last limb
	neg	n
	inc	n			C n = 1 - n
	jmp	L(L1)

C n mod 4 = 2
L(b2):	mov	$0, R32(%rbx)
	mov	%rax, %r10		C low product limb
	mov	%rdx, %r9		C high product limb
	mov	8(up), %rax		C second u limb
	mov	$0, R32(%r8)
	lea	(up,n,8), up		C up now points just past the last limb
	neg	n
	add	$2, n			C n = 2 - n
	jns	L(end)			C n = 2, only the wind-down remains

C Main loop, four limbs per iteration.  Each mul consumes the u limb loaded
C into %rax by the preceding group.  Each ADDSUB updates one limb at rp and
C its carry or borrow is folded into the following product by the adc pair.
	ALIGN(32)
L(top):	mul	v0
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
L(L1):	mul	v0
	mov	$0, R32(%r10)
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
	mov	8(up,n,8), %rax
L(L0):	mul	v0
	ADDSUB	%r8, 16(rp,n,8)
	mov	$0, R32(%r8)		C mov keeps the carry from ADDSUB alive
	adc	%rax, %rbx
	mov	$0, R32(%r9)		C mov keeps the carry between the adc pair
	mov	16(up,n,8), %rax
	adc	%rdx, %r10
L(L3):	mul	v0
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, R32(%rbx)		C mov keeps the carry from ADDSUB alive
	adc	%rax, %r10
	adc	%rdx, %r9
	mov	24(up,n,8), %rax
	add	$4, n
	js	L(top)			C loop while the index is still negative

C Wind-down: multiply the last u limb, update the last two limbs at rp and
C collect the outgoing carry limb.
L(end):	mul	v0
	ADDSUB	%r10, (rp)		C second last limb at rp
	adc	%r9, %rax
	adc	%r8, %rdx
L(n1):	ADDSUB	%rax, 8(rp)		C last limb at rp
	adc	$0, %rdx
	mov	%rdx, %rax		C return the carry limb

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()
ASM_END()