dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3.65
C AMD K10	 3.30	 3.68
C AMD bull	 4.04	 4.29
C AMD pile	 4.33
C AMD steam
C AMD excavator
C AMD bobcat	 5.73
C AMD jaguar	 5.87
C Intel P4	12.5
C Intel core2	 4.38
C Intel NHM	 4.28
C Intel SBR	 2.69
C Intel IBR	 2.55
C Intel HWL	 2.41
C Intel BWL	 2.49
C Intel SKL	 2.50
C Intel atom	20.3
C Intel SLM	 7.8
C VIA nano	 4.25

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Move loop code into feed-in blocks, to save insn for zeroing regs.

C Entry points
C
C   mpn_mul_1 (rp, up, n, v0)
C	Multiplies each of the n limbs at up by the single limb v0, stores the
C	n low result limbs at rp and returns the final carry-out limb in rax.
C
C   mpn_mul_1c (rp, up, n, v0, cin)
C	Same, except that the fifth argument cin is added into the first
C	product.  cin is taken from r8 (STD64) or from the stack (DOS64).
C
C Both entry points load (up) before n is examined, so n >= 1 is assumed.
C
C Register use in the shared body
C
C   rax, rdx	product of the current source limb and v0
C   r8-r11	rotating partial results waiting to be stored
C   r11		additionally holds n mod 4 while dispatching
C   n		limb count, later a negative index counting up towards zero
C
C For DOS64 the IFDOS defines below remap the names r8 and r9 to rbx and rdi,
C since the real r8 and r9 hold n and v0 there.  rsi and rdi are pushed on
C entry and popped before ret in that configuration; rbx is pushed in both.

C Argument registers for STD64; the trailing comment names the DOS64 register.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

define(`n',       `%rbx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``rbx'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1c)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0

C Fold the carry-in argument into the first product before joining mpn_mul_1.
IFSTD(` add	%r8, %rax	')
IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
	adc	$0, %rdx		C carry from the add goes to the high limb
	jmp	L(common)

EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mul_1)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0

C Here rdx:rax holds the first product and r11 holds the limb count.
L(common):
IFSTD(`	mov	%r11, n		')

	and	$3, R32(%r11)		C r11 = n mod 4, sets ZF for the jz below
	lea	-16(rp,n,8), rp		C rp += 8*n - 16; lea leaves the flags alone
	jz	L(b0)			C n mod 4 == 0
	cmp	$2, R32(%r11)
	jb	L(b1)			C n mod 4 == 1
	jz	L(b2)			C n mod 4 == 2

C n mod 4 == 3: first product goes to r10/r11, second product is formed here,
C then the loop is entered at L3.
L(b3):	mov	%rax, %r10
	mov	%rdx, %r11
	mov	8(up), %rax
	mul	v0
	lea	(up,n,8), up		C up += 8*n
	not	n			C n = -n - 1
	jmp	L(L3)

C n mod 4 == 0: first product goes to r9/r10, second limb is loaded, then the
C loop is entered at L0.
L(b0):	mov	%rax, %r9
	mov	%rdx, %r10
	mov	8(up), %rax
	lea	(up,n,8), up		C up += 8*n
	neg	n
	jmp	L(L0)

C n mod 4 == 1: n == 1 is finished directly at n1, otherwise result limb 0 is
C stored here and the loop is entered at L1.
L(b1):	mov	%rax, %r8
	cmp	$1, n
	jz	L(n1)			C single limb: store r8, return rdx
	mov	%rdx, %r9
	lea	(up,n,8), up		C up += 8*n
	neg	n
	mov	%r8, 16(rp,n,8)		C result limb 0
	inc	n
	jmp	L(L1)

C n mod 4 == 2: first product goes to r11/r8, second limb is loaded; n == 2
C skips the loop, otherwise execution falls through to top.
L(b2):	mov	%rax, %r11
	mov	%rdx, %r8
	mov	8(up), %rax
	lea	(up,n,8), up		C up += 8*n
	neg	n
	add	$2, n
	jns	L(end)			C index reached zero, only the tail remains

C Main loop, four limbs per pass.  L1, L0 and L3 are the feed-in entry points
C used by the blocks above.
	ALIGN(16)
L(top):	mul	v0
	mov	%rdx, %r9
	add	%rax, %r8
	adc	$0, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r11, (rp,n,8)
L(L1):	mov	(up,n,8), %rax
	mul	v0
	add	%rax, %r9
	mov	%rdx, %r10
	mov	8(up,n,8), %rax
	adc	$0, %r10
L(L0):	mul	v0
	add	%rax, %r10
	mov	%rdx, %r11
	mov	16(up,n,8), %rax
	adc	$0, %r11
	mul	v0
	mov	%r9, 16(rp,n,8)
L(L3):	add	%rax, %r11
	mov	%r10, 24(rp,n,8)
	mov	%rdx, %r8
	adc	$0, %r8
	add	$4, n
	mov	-8(up,n,8), %rax	C mov keeps the sign flag set by the add
	js	L(top)			C repeat while the index is negative

C Tail: multiply the last limb and store the final two result limbs.
L(end):	mul	v0
	add	%rax, %r8
	adc	$0, %rdx
	mov	%r11, (rp)
L(n1):	mov	%r8, 8(rp)		C last result limb
	mov	%rdx, %rax		C return value: carry-out limb

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()
ASM_END()