dnl  Intel Atom mpn_lshift -- mpn left shift.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C			unsigned cnt);
C
C Shift the size-limb operand at src left by cnt bits and store the size-limb
C result at dst.  The bits shifted out of the most significant limb are
C returned in eax.  Limbs are 32 bits wide, every limb index is scaled by 4.
C
C Two code paths:
C   normal   any cnt.  Walks from the most significant limb downwards with
C	     shl/shr/or and produces two result limbs per loop pass.
C   special  cnt == 1 only.  Walks from the least significant limb upwards
C	     with an add/adc carry chain, four limbs per loop pass.  Taken
C	     when src >= dst or when dst lies at or beyond the end of src,
C	     as decided by the two pointer tests right after the prologue.
C
C The first limb is loaded before any test of size, so size >= 1 is assumed.
C NOTE(review): the shl/shr pairs rely on cnt and its negation modulo 32, so
C cnt is presumably required to lie in 1..31 -- confirm against the callers.
C
C Each of esi, edi, ebx, ebp that a path modifies is saved and restored by
C that path.  The incoming parameter slots double as the save area.

C			      cycles/limb
C			    cnt!=1   cnt==1
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom		      5	      2.5
C AMD K6
C AMD K7
C AMD K8
C AMD K10

defframe(PARAM_CNT, 16)
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-use parameter space
dnl  Each slot is overwritten only after its parameter has been read.
define(SAVE_UP,`PARAM_CNT')
define(VAR_COUNT,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_EBP,`PARAM_DST')

dnl  Register roles: rp is the result pointer, up the source pointer and cnt
dnl  the shift count, kept in ecx so that cl can drive the variable shifts.
define(`rp', `%edi')
define(`up', `%esi')
define(`cnt', `%ecx')

ASM_START()
	TEXT
	ALIGN(8)
deflit(`FRAME',0)
PROLOGUE(mpn_lshift)
	mov	PARAM_CNT, cnt
	mov	PARAM_SIZE, %edx
	mov	up, SAVE_UP		C cnt already loaded, its slot now keeps esi
	mov	PARAM_SRC, up
	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

C We can use faster code for shift-by-1 under certain conditions.
C The special path runs upwards, so it must not overwrite source limbs that
C have not been read yet.
	cmp	$1,cnt
	jne	L(normal)
	cmpl	rp, up
	jnc	L(special)		C jump if s_ptr >= res_ptr
	leal	(up,%edx,4),%eax
	cmpl	%eax,rp
	jnc	L(special)		C jump if res_ptr >= s_ptr + size

L(normal):
	lea	-4(up,%edx,4), up	C up = most significant source limb
	mov	%ebx, SAVE_EBX
	lea	-4(rp,%edx,4), rp	C rp = most significant result limb

	shr	%edx			C edx = size / 2, carry set if size is odd
	mov	(up), %eax
	mov	%edx, VAR_COUNT		C mov leaves the carry from shr intact
	jnc	L(evn)

C Odd size: peel off the most significant limb.
	mov	%eax, %ebx
	shl	%cl, %ebx		C ebx = top limb << cnt
	neg	cnt			C cl = -cnt, shifts by cl now use 32-cnt
	shr	%cl, %eax		C eax = bits shifted out = return value
	test	%edx, %edx
	jnz	L(gt1)
	mov	%ebx, (rp)		C size == 1, nothing below to merge in
	jmp	L(quit)

C Odd size >= 3: enter the loop at its second half.
C There is no FRAME_pushl on the push below.  NOTE(review): FRAME appears to
C be tracked in textual order, the evn code that follows still runs with only
C rp pushed, and the push on the evn path carries the adjustment that covers
C the shared loop -- confirm against the frame macros.
L(gt1):	mov	%ebp, SAVE_EBP
	push	%eax			C park return value, the loop reuses eax
	mov	-4(up), %eax
	mov	%eax, %ebp
	shr	%cl, %eax
	jmp	L(lo1)

C Even size: prime the pipeline with the two most significant limbs.
L(evn):	mov	%ebp, SAVE_EBP
	neg	cnt			C cl = -cnt for the right shifts
	mov	%eax, %ebp
	mov	-4(up), %edx
	shr	%cl, %eax		C eax = bits shifted out = return value
	mov	%edx, %ebx
	shr	%cl, %edx
	neg	cnt			C cl = cnt again for the left shifts
	decl	VAR_COUNT
	lea	4(rp), rp		C lea keeps the flags set by decl
	lea	-4(up), up
	jz	L(end)			C size == 2, skip the loop
	push	%eax			FRAME_pushl()

C Main loop, two result limbs per pass.  ebp and ebx hold unshifted limbs
C waiting for their left shift, edx and eax hold the right-shifted low parts.
C cnt is negated twice per pass so that cl alternates between cnt and -cnt.
	ALIGN(8)
L(top):	shl	%cl, %ebp
	or	%ebp, %edx
	shl	%cl, %ebx
	neg	cnt
	mov	-4(up), %eax
	mov	%eax, %ebp
	mov	%edx, -4(rp)
	shr	%cl, %eax
	lea	-8(rp), rp
L(lo1):	mov	-8(up), %edx
	or	%ebx, %eax
	mov	%edx, %ebx
	shr	%cl, %edx
	lea	-8(up), up
	neg	cnt
	mov	%eax, (rp)
	decl	VAR_COUNT
	jg	L(top)

	pop	%eax			FRAME_popl()
C Wind down: emit the two least significant result limbs.
L(end):
	shl	%cl, %ebp
	shl	%cl, %ebx
	or	%ebp, %edx
	mov	SAVE_EBP, %ebp
	mov	%edx, -4(rp)
	mov	%ebx, -8(rp)		C lowest limb, shl filled it with zero bits

L(quit):
	mov	SAVE_UP, up
	mov	SAVE_EBX, %ebx
	pop	rp			FRAME_popl()
	ret

C Shift-by-1 path.  Each limb is doubled with adc so that the bit leaving one
C limb enters the next one through the carry flag.  The loop body handles
C four limbs, and size mod 4 selects where the body is entered.  Only mov,
C lea and dec/decl sit between the adc instructions, none of which changes
C the carry flag.  eax counts loop passes, edx and ecx carry limbs.
L(special):
deflit(`FRAME',4)
	lea	3(%edx), %eax		C size + 3
	dec	%edx			C size - 1
	mov	(up), %ecx
	shr	$2, %eax		C (size + 3) / 4
	and	$3, %edx		C (size - 1) % 4, also clears the carry
	jz	L(goloop)		C jump if size == 1 (mod 4)
	shr	%edx			C carry = low bit of (size - 1) % 4
	jnc	L(odd)			C jump if size == 3 (mod 4)

C size == 0 or 2 (mod 4): shift one limb up front, add starts the carry chain.
	add	%ecx, %ecx
	lea	4(up), up
	mov	%ecx, (rp)
	mov	(up), %ecx
	lea	4(rp), rp

	dec	%edx			C edx was 0 for size == 2, 1 for size == 0
	jnz	L(goloop)		C jump if size == 2 (mod 4)
L(odd):	lea	-8(up), up		C bias pointers for the mid-loop entry
	lea	-8(rp), rp
	jmp	L(sentry)		C reached if size == 0 or 3 (mod 4)

L(sloop):
	adc	%ecx, %ecx
	mov	4(up), %edx
	mov	%ecx, (rp)
	adc	%edx, %edx
	mov	8(up), %ecx
	mov	%edx, 4(rp)
L(sentry):
	adc	%ecx, %ecx
	mov	12(up), %edx
	mov	%ecx, 8(rp)
	adc	%edx, %edx
	lea	16(up), up
	mov	%edx, 12(rp)
	lea	16(rp), rp
	mov	(up), %ecx
L(goloop):
	decl	%eax			C pass counter, carry stays untouched
	jnz	L(sloop)

C Last limb.  eax is zero here, so the final adc turns it into the carry out
C of the most significant limb, which is the return value.
L(squit):
	adc	%ecx, %ecx
	mov	%ecx, (rp)
	adc	%eax, %eax

	mov	SAVE_UP, up
	pop	rp			FRAME_popl()
	ret
EPILOGUE()
ASM_END()