dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.

dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.

dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb	     cycles/limb	      good
C	  16-byte aligned	16-byte unaligned	     for cpu?
C AMD K8,K9	 ?			 ?
C AMD K10	1.85  (1.635)		1.9   (1.67)		Y
C AMD bd1	1.82  (1.75)		1.82  (1.75)		Y
C AMD bobcat	4.5			4.5
C Intel P4	3.6   (3.125)		3.6   (3.125)		Y
C Intel core2	2.05  (1.67)		2.55  (1.75)
C Intel NHM	2.05  (1.875)		2.6   (2.25)
C Intel SBR	1.55  (1.44)		2     (1.57)		Y
C Intel atom	 ?			 ?
C VIA nano	2.5   (2.5)		2.5   (2.5)		Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We always write using
C 16-byte operations, we read with both 8-byte and 16-byte operations.
C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes too many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.

C Contract (as the code demonstrates; SysV AMD64, FUNC_ENTRY(4) = 4 args):
C   mp_limb_t mpn_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C Shifts the n-limb number {ap,n} left by cnt bits, complements every result
C limb (the pxor with the all-ones mask in xmm2), and stores it at {rp,n}.
C Returns in %rax the cnt high bits shifted out of the top limb,
C ap[n-1] >> (64-cnt); note the return value is NOT complemented.
C NOTE(review): assumes 1 <= cnt <= 63 and n >= 1, the usual mpn shift
C preconditions -- confirm against the mpn_lshiftc callers/gmp-impl.h.
C
C Register roles throughout:
C   xmm4 = cnt        (left-shift count for the limb itself)
C   xmm5 = 64-cnt     (right-shift count for the bits carried in from below)
C   xmm2 = all ones   (complement mask)
C   n    counts limbs still to be produced; used as the scaled index.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

C Compute the return value before anything is overwritten: shr by %cl uses
C only the low 6 bits of -cnt, i.e. shifts by 64-cnt, leaving the out-shifted
C high bits of the top limb in %rax.
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax	C rax = ap[n-1]
	shr	R8(%rcx), %rax		C rax = ap[n-1] >> (64-cnt)

	pcmpeqb	%xmm2, %xmm2		C set to 111...111

	cmp	$2, n			C flags also reused at L(le2) below
	jle	L(le2)

C All movdqa stores below require rp+8n to be 16-byte aligned; only the low
C address bits matter for the test, hence the 32-bit lea.
	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	je	L(rp_aligned)

C Do one initial limb in order to make rp aligned
C   rp[n-1] = ~((ap[n-1] << cnt) | (ap[n-2] >> (64-cnt)))
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
C Pick the inner loop: if ap+8n is also 16-byte aligned the movdqa reads in
C L(atop) are legal; otherwise use the unaligned loop built on 8-byte reads.
	lea	(ap,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	je	L(aent)
	jmp	L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).
C Each iteration produces the two limbs rp[n],rp[n+1] with one aligned
C 16-byte store; sources are gathered with two 8-byte loads plus one 16-byte
C load (which IS aligned here, since ap+8n = 8 (mod 16) in this path).

	ALIGN(16)
L(utop):movq	(ap,n,8), %xmm1
	punpcklqdq  8(ap,n,8), %xmm1	C xmm1 = B*ap[n+1] + ap[n]
	movdqa	-8(ap,n,8), %xmm0	C xmm0 = B*ap[n] + ap[n-1]
	psllq	%xmm4, %xmm1		C limbs shifted left
	psrlq	%xmm5, %xmm0		C carry bits from the limbs below
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movdqa	%xmm0, (rp,n,8)
L(uent):sub	$2, n
	ja	L(utop)			C more than 2 limbs still to produce

	jne	L(end8)			C n was odd: one final limb via L(end8)

C n was even: produce the bottom two limbs.  The zeroed low half of xmm0
C makes ap[0]'s carry-in zero, as befits the least significant limb.
	movq	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0	C xmm0 = B*ap[0] + 0
	punpcklqdq  8(ap), %xmm1	C xmm1 = B*ap[1] + ap[0]
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).
C Both the 16-byte read and the 16-byte write are aligned here, so each
C iteration is one movdqa load, one 8-byte load, and one movdqa store.

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movdqa	%xmm0, (rp,n,8)
L(aent):sub	$2, n
	ja	L(atop)			C more than 2 limbs still to produce

	jne	L(end8)			C n was odd: one final limb via L(end8)

C n was even: bottom two limbs, with zero carried into ap[0].
	movdqa	(ap), %xmm0		C xmm0 = B*ap[1] + ap[0]
	pxor	%xmm1, %xmm1
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[0] + 0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

	ALIGN(16)
C n <= 2.  Flags are still those of the `cmp $2, n' above: jne means n != 2,
C i.e. n = 1, which only needs the single-limb tail at L(end8).
L(le2):	jne	L(end8)

C n = 2: rp[1] = ~((ap[1] << cnt) | (ap[0] >> (64-cnt))), then fall through
C to L(end8) for rp[0].
	movq	8(ap), %xmm0
	movq	(ap), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movq	%xmm0, 8(rp)

C Bottom limb: nothing shifts in from below, so just shift, complement, store.
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm2, %xmm0		C complement
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()