dnl  x86-64 mpn_rshift optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.5
C AMD K10	 ?
C Intel P4	 3.29
C Intel core2	 2.1  (fluctuates, presumably cache related)
C Intel corei	 ?
C Intel atom	14.3
C VIA nano	 ?

C mpn_rshift -- shift an n-limb operand right by cnt bits with 64-bit MMX
C shifts, walking the limbs from the lowest address upwards.
C
C Syntax: AT&T operand order, preprocessed by m4 (C and dnl start comments).
C
C INPUT PARAMETERS
C   rp   (%rdi)  destination limb pointer
C   up   (%rsi)  source limb pointer
C   n    (%rdx)  limb count; (up) is read unconditionally, so n >= 1 is
C                assumed -- TODO confirm against the caller contract
C   cnt  (%cl)   right shift count; presumably 1..63, since a count of 0 makes
C                the complementary left shift count 0 as well -- TODO confirm
C
C RETURN VALUE
C   %rax = lowest source limb shifted left by (-cnt mod 64), i.e. the bits
C          shifted out at the low end, left-justified.
C
C REGISTER ROLES
C   %mm4       right shift count (cnt)
C   %mm5       left shift count ((-cnt) mod 64)
C   %r8d       (n + 1) mod 4, used only to dispatch on n mod 4
C   %mm0-%mm3  scratch limbs
C   After setup, up and rp point at the LAST limb of each operand and n holds
C   the negated limb count, so 8(up,n,8) addresses the next unprocessed limb
C   and n counts upwards towards zero.
C
C Each result limb is (up[i] >> cnt) | (up[i+1] << lsh); the top limb is just
C up[n-1] >> cnt.  The head of the function peels 0..3 limbs so that the
C remaining count is 3 mod 4, the main loop then does 4 limbs per pass, and
C the tail at L(end)/L(ast) finishes the final 3 limbs.
C
C NOTE(review): FUNC_ENTRY/FUNC_EXIT and ABI_SUPPORT come from config.m4;
C presumably they adapt the DOS64 argument registers -- confirm there.

define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	mov	(up), %rax		C lowest limb, becomes the return value
	movd	R32(%rcx), %mm4		C mm4 = right shift count
	neg	R32(%rcx)		C put lsh count in cl
	and	$63, R32(%rcx)
	movd	R32(%rcx), %mm5		C mm5 = left shift count

	lea	-8(up,n,8), up		C up -> last source limb
	lea	-8(rp,n,8), rp		C rp -> last destination limb
	lea	1(n), R32(%r8)		C r8d = n + 1, for the mod 4 dispatch
	neg	n			C n = -(limbs remaining)

	shl	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
C	Peel one limb so that 3 mod 4 limbs remain.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C	Peel one limb, then fall into the n = 1 mod 4 handling.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n			C exactly one limb left?
	je	L(ast)			C yes, only the top limb remains
C	Peel two limbs so that 3 mod 4 limbs remain.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

C	Here the remaining limb count is 3 mod 4.  Start the right-shifted low
C	parts of the next two result limbs in mm2 and mm3.
L(rol):	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C carry set when only 3 limbs remained
	jb	L(end)
	ALIGN(32)
L(top):
C	Complete the two pending limbs: OR in the left-shifted high parts
C	taken from the next source limbs, then store them.
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
C	Left-shifted high parts for the third and fourth limbs of this pass.
	psllq	%mm5, %mm0
	psllq	%mm5, %mm1

C	Complete the third and fourth limbs with their right-shifted low
C	parts, then store them.
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
C	Start the right-shifted low parts of the next two result limbs.
	add	$4, n
	psrlq	%mm4, %mm2
	psrlq	%mm4, %mm3

	jae	L(top)			C loop until the add carries
L(end):
C	Three limbs remain; mm2 and mm3 already hold the low parts of the
C	first two.  Addresses are relative to the last limb from here on.
	movq	-8(up), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -16(rp)
	movq	%mm3, -8(rp)

C	Top limb: nothing is shifted in from above.
L(ast):	movq	(up), %mm2
	psrlq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C leave MMX state before returning
	FUNC_EXIT()
	ret
EPILOGUE()