dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb     cycles/limb     cycles/limb    good
C                  aligned         unaligned       best seen      for cpu?
C AMD K8,K9         3               3               2.35          no, use shl/shr
C AMD K10           1.5-1.8         1.5-1.8         1.33          yes
C AMD bd1           1.7-1.9         1.7-1.9         1.33          yes
C AMD bobcat        3.17            3.17                          yes, bad for n < 20
C Intel P4          4.67            4.67            2.7           no, slow movdqu
C Intel core2       2.15            2.15            1.25          no, use shld/shrd
C Intel NHM         1.66            1.66            1.25          no, use shld/shrd
C Intel SBR         1.3             1.3             1.25          yes, bad for n = 4-6
C Intel atom       11.7            11.7             4.5           no
C VIA nano          5.7             5.95            2.0           no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, and discrete integer
C    shifts for the other affected CPUs.
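
C For reference, a minimal C sketch of the semantics this routine implements
C (an illustration only, assuming 64-bit limbs and 1 <= cnt <= 63; the name
C ref_lshift is hypothetical, and this sketch is not part of the build):
C
C   #include <gmp.h>
C
C   mp_limb_t
C   ref_lshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);  /* bits shifted out */
C     mp_size_t i;
C     for (i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
C
C Working from the top limb downwards lets the shift run in place (rp == ap),
C an order the assembly below preserves.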

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshift)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4        C xmm4 = cnt
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5        C xmm5 = 64-cnt

C Compute the return value: the bits shifted out at the top
        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)
        jz      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
C Dispatch on n mod 8 into the four entry points of the unrolled loop
        lea     1(n), %r8d

        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70):
        movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56):
        movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34):
        movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

C Wind down: one or two limbs remain at the bottom
L(end): test    $1, R8(n)
        jnz     L(end8)

C Bottom two limbs; punpcklqdq shifts a zero limb in below ap[0]
        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase, n <= 3
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)

C Bottom limb: a plain shift, no bits come in from below
L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()
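
C The 2-limb step used throughout the loop above corresponds to the following
C SSE2 intrinsics sketch (an illustration only; the helper name lshift2 and
C its setup are hypothetical, not GMP API):
C
C   #include <emmintrin.h>
C   #include <stdint.h>
C
C   /* Produce rp[i] and rp[i+1], where rp[i] = ap[i]<<cnt | ap[i-1]>>(64-cnt).
C      lsh = _mm_cvtsi32_si128 (cnt); rsh = _mm_cvtsi32_si128 (64 - cnt);
C      rp must be 16-byte aligned, matching the movdqa stores.  */
C   static inline void
C   lshift2 (uint64_t *rp, const uint64_t *ap, __m128i lsh, __m128i rsh)
C   {
C     __m128i hi = _mm_loadu_si128 ((const __m128i *) ap);        /* ap[i], ap[i+1] */
C     __m128i lo = _mm_loadu_si128 ((const __m128i *) (ap - 1));  /* ap[i-1], ap[i] */
C     hi = _mm_sll_epi64 (hi, lsh);                               /* psllq %xmm4 */
C     lo = _mm_srl_epi64 (lo, rsh);                               /* psrlq %xmm5 */
C     _mm_store_si128 ((__m128i *) rp, _mm_or_si128 (hi, lo));    /* por; movdqa */
C   }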