dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb
C AMD K8,K9       3.65
C AMD K10         3.30    3.68
C AMD bull        4.04    4.29
C AMD pile        4.33
C AMD steam
C AMD excavator
C AMD bobcat      5.73
C AMD jaguar      5.87
C Intel P4       12.5
C Intel core2     4.38
C Intel NHM       4.28
C Intel SBR       2.69
C Intel IBR       2.55
C Intel HWL       2.41
C Intel BWL       2.49
C Intel SKL       2.50
C Intel atom     20.3
C Intel SLM       7.8
C VIA nano        4.25

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Move loop code into feed-in blocks, to save insn for zeroing regs.

C Argument registers under STD64.  The register named in each trailing
C comment is the one that carries the same argument under DOS64.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Working copy of the limb count; doubles as the loop index.
define(`n',       `%rbx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 remapping.  rp, v0 and the count stay in their incoming registers
C (rcx, r9, r8) and up is moved into rsi by the entry code.  The doubled
C quotes make each replacement text a quoted string, so it is emitted
C literally and is not touched by the r8 and r9 renames.  A bare r8 or r9
C written in the instruction stream below is emitted as rbx or rdi instead.
IFDOS(` define(`up',      ``%rsi'')     ') dnl
IFDOS(` define(`rp',      ``%rcx'')     ') dnl
IFDOS(` define(`v0',      ``%r9'')      ') dnl
IFDOS(` define(`r9',      ``rdi'')      ') dnl
IFDOS(` define(`n',       ``%r8'')      ') dnl
IFDOS(` define(`r8',      ``rbx'')      ') dnl

C ---------------------------------------------------------------------------
C mpn_mul_1  (rp, up, n, v0)
C mpn_mul_1c (rp, up, n, v0, carry)
C
C Multiply the limbs at up[0..n-1] by the single limb v0, store the low
C n limbs of the product at rp[0..n-1] and return the most significant
C limb in rax.  mpn_mul_1c first adds its fifth argument to the low limb
C of the first product, then joins mpn_mul_1 at the common label.
C
C The first source limb is loaded before the count is examined, so the
C code assumes n >= 1.
C
C Register use after entry:
C   rax, rdx   operands and results of mul
C   r8 - r11   rotating accumulators, each holding the low half of one
C              product plus the high half of the previous one
C   n          limb count, later a negative index stepping up towards zero
C   rp         destination base, biased to point at the second-to-last limb
C   up         source base, advanced to just past the last source limb
C Saved and restored: rbx, plus rsi and rdi under DOS64.
C ---------------------------------------------------------------------------

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_mul_1c)
IFDOS(``push    %rsi            '')
IFDOS(``push    %rdi            '')
IFDOS(``mov     %rdx, %rsi      '')

        mov     (up), %rax              C fetch limb 0 ahead of the multiply
        push    %rbx
IFSTD(` mov     n_param, %r11   ')      C mul overwrites rdx, so park the count in r11
IFDOS(` mov     n, %r11         ')
        mul     v0

IFSTD(` add     %r8, %rax       ')      C carry argument arrives in r8
IFDOS(` add     64(%rsp), %rax  ')      C carry argument on the stack: 40 + 3*8 for the 3 pushes
        adc     $0, %rdx                C propagate into the high half
        jmp     L(common)

EPILOGUE()

        ALIGN(16)
PROLOGUE(mpn_mul_1)
IFDOS(``push    %rsi            '')
IFDOS(``push    %rdi            '')
IFDOS(``mov     %rdx, %rsi      '')

        mov     (up), %rax              C fetch limb 0 ahead of the multiply
        push    %rbx
IFSTD(` mov     n_param, %r11   ')      C mul overwrites rdx, so park the count in r11
IFDOS(` mov     n, %r11         ')
        mul     v0

C Here rdx:rax holds the first product (plus carry for mpn_mul_1c) and r11
C holds the limb count.
L(common):
IFSTD(` mov     %r11, n         ')

C Dispatch on the count modulo 4.  lea leaves the flags from the and intact.
        and     $3, R32(%r11)
        lea     -16(rp,n,8), rp         C rp now addresses the second-to-last result limb
        jz      L(b0)                   C count mod 4 is 0
        cmp     $2, R32(%r11)
        jb      L(b1)                   C count mod 4 is 1
        jz      L(b2)                   C count mod 4 is 2

C Count mod 4 is 3: enter the loop at L3 with the second product in flight.
L(b3):
        mov     %rax, %r10
        mov     %rdx, %r11
        mov     8(up), %rax
        mul     v0
        lea     (up,n,8), up
        not     n                       C index becomes -count - 1
        jmp     L(L3)

C Count mod 4 is 0: enter the loop at L0.
L(b0):
        mov     %rax, %r9
        mov     %rdx, %r10
        mov     8(up), %rax
        lea     (up,n,8), up
        neg     n
        jmp     L(L0)

C Count mod 4 is 1: a count of exactly 1 goes straight to the final store,
C otherwise the first result limb is written here and the loop entered at L1.
L(b1):
        mov     %rax, %r8
        cmp     $1, n
        jz      L(n1)
        mov     %rdx, %r9
        lea     (up,n,8), up
        neg     n
        mov     %r8, 16(rp,n,8)         C result limb 0
        inc     n
        jmp     L(L1)

C Count mod 4 is 2: a count of exactly 2 skips the loop, otherwise fall
C through into the top of the loop.
L(b2):
        mov     %rax, %r11
        mov     %rdx, %r8
        mov     8(up), %rax
        lea     (up,n,8), up
        neg     n
        add     $2, n
        jns     L(end)

C Main loop, four limbs per pass.  The instruction order is the output of
C a scheduling tool and is kept exactly as is.
        ALIGN(16)
L(top):
        mul     v0
        mov     %rdx, %r9
        add     %rax, %r8
        adc     $0, %r9
        mov     %r8, 8(rp,n,8)
        mov     %r11, (rp,n,8)
L(L1):
        mov     (up,n,8), %rax
        mul     v0
        add     %rax, %r9
        mov     %rdx, %r10
        mov     8(up,n,8), %rax
        adc     $0, %r10
L(L0):
        mul     v0
        add     %rax, %r10
        mov     %rdx, %r11
        mov     16(up,n,8), %rax
        adc     $0, %r11
        mul     v0
        mov     %r9, 16(rp,n,8)
L(L3):
        add     %rax, %r11
        mov     %r10, 24(rp,n,8)
        mov     %rdx, %r8
        adc     $0, %r8
        add     $4, n                   C sets the sign flag tested by js below
        mov     -8(up,n,8), %rax        C mov leaves the flags alone
        js      L(top)

C Wind down: last multiply, last two stores, high limb returned in rax.
L(end):
        mul     v0
        add     %rax, %r8
        adc     $0, %rdx
        mov     %r11, (rp)
L(n1):
        mov     %r8, 8(rp)
        mov     %rdx, %rax

        pop     %rbx
IFDOS(``pop     %rdi            '')
IFDOS(``pop     %rsi            '')
        ret
EPILOGUE()
ASM_END()