Home | History | Annotate | Line # | Download | only in atom
      1 dnl  Intel Atom mpn_lshift -- mpn left shift.
      2 
      3 dnl  Copyright 2011 Free Software Foundation, Inc.
      4 
      5 dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
     36 C			unsigned cnt);
     37 
     38 C				  cycles/limb
     39 C				cnt!=1	cnt==1
     40 C P5
     41 C P6 model 0-8,10-12
     42 C P6 model 9  (Banias)
     43 C P6 model 13 (Dothan)
     44 C P4 model 0  (Willamette)
     45 C P4 model 1  (?)
     46 C P4 model 2  (Northwood)
     47 C P4 model 3  (Prescott)
     48 C P4 model 4  (Nocona)
     49 C Intel Atom			 5	 2.5
     50 C AMD K6
     51 C AMD K7
     52 C AMD K8
     53 C AMD K10
     54 
     55 defframe(PARAM_CNT, 16)
     56 defframe(PARAM_SIZE,12)
     57 defframe(PARAM_SRC,  8)
     58 defframe(PARAM_DST,  4)
     59 
     60 dnl  re-use parameter space: every argument is loaded into a register at
        dnl  function entry before its stack slot is written, so the four slots
        dnl  double as scratch storage.
        dnl    SAVE_UP    caller value of %esi            (slot of cnt)
        dnl    VAR_COUNT  loop counter of the general path  (slot of size)
        dnl    SAVE_EBX   caller value of %ebx            (slot of src)
        dnl    SAVE_EBP   caller value of %ebp            (slot of dst)
     61 define(SAVE_UP,`PARAM_CNT')
     62 define(VAR_COUNT,`PARAM_SIZE')
     63 define(SAVE_EBX,`PARAM_SRC')
     64 define(SAVE_EBP,`PARAM_DST')
     65 
        dnl  register aliases used throughout the routine
     66 define(`rp',  `%edi')
     67 define(`up',  `%esi')
     68 define(`cnt',  `%ecx')
     69 
     70 ASM_START()
     71 	TEXT
     72 	ALIGN(8)
     73 deflit(`FRAME',0)
        C mpn_lshift: shift the size-limb operand at src left by cnt bits and store
        C size limbs at dst.  The bits shifted out of the most significant limb are
        C in %eax at each ret.
        C
        C Register use:
        C   rp  (%edi)  destination pointer; caller value is pushed on the stack
        C   up  (%esi)  source pointer; caller value is kept in SAVE_UP
        C   cnt (%ecx)  shift count; the general path negates it back and forth so
        C               that %cl alternately drives the shl and the shr steps
        C   %ebx, %ebp  limb temporaries of the general path; caller values are
        C               kept in SAVE_EBX and SAVE_EBP
        C   %edx        size on entry, afterwards a temporary
        C
        C Code paths:
        C   L(normal)   any cnt; walks from the most significant limb downwards,
        C               two limbs per loop iteration
        C   L(special)  cnt == 1 only; walks from the least significant limb
        C               upwards with an add/adc carry chain, four limbs per loop
        C               iteration
     74 PROLOGUE(mpn_lshift)
     75 	mov	PARAM_CNT, cnt		C fetch cnt before its slot becomes SAVE_UP
     76 	mov	PARAM_SIZE, %edx	C edx = size
     77 	mov	up, SAVE_UP		C save caller esi
     78 	mov	PARAM_SRC, up
     79 	push	rp			FRAME_pushl()
     80 	mov	PARAM_DST, rp
     81 
     82 C We can use faster code for shift-by-1 under certain conditions.
        C The fast path runs low-to-high and stores a limb before loading the next
        C one, so it is taken only if dst does not lie inside (src, src+size).
     83 	cmp	$1,cnt
     84 	jne	L(normal)
     85 	cmpl	rp, up
     86 	jnc	L(special)		C jump if s_ptr >= res_ptr
     87 	leal	(up,%edx,4),%eax	C eax = s_ptr + size (limbs)
     88 	cmpl	%eax,rp
     89 	jnc	L(special)		C jump if res_ptr >= s_ptr + size
     90 
     91 L(normal):
     92 	lea	-4(up,%edx,4), up	C up = address of the top source limb
     93 	mov	%ebx, SAVE_EBX		C save caller ebx
     94 	lea	-4(rp,%edx,4), rp	C rp = address of the top result limb
     95 
     96 	shr	%edx			C edx = size / 2, CF = size is odd
     97 	mov	(up), %eax		C eax = top source limb
     98 	mov	%edx, VAR_COUNT
     99 	jnc	L(evn)			C jump if size is even (mov keeps CF)
    100 
    101 	mov	%eax, %ebx
    102 	shl	%cl, %ebx		C ebx = top limb << cnt
    103 	neg	cnt			C low 5 bits of cl = 32-cnt (1 <= cnt <= 31)
    104 	shr	%cl, %eax		C eax = bits shifted out = return value
    105 	test	%edx, %edx
    106 	jnz	L(gt1)			C jump if size > 1
    107 	mov	%ebx, (rp)		C size == 1: the only result limb
    108 	jmp	L(quit)
    109 
    110 L(gt1):	mov	%ebp, SAVE_EBP		C save caller ebp
    111 	push	%eax			C park return value, eax is a loop temporary
    112 	mov	-4(up), %eax		C eax = second highest source limb
    113 	mov	%eax, %ebp
    114 	shr	%cl, %eax		C its high bits feed the top result limb
    115 	jmp	L(lo1)			C enter the loop in the middle
    116 
    117 L(evn):	mov	%ebp, SAVE_EBP		C save caller ebp
    118 	neg	cnt			C cl -> 32-cnt
    119 	mov	%eax, %ebp		C ebp = top source limb
    120 	mov	-4(up), %edx		C edx = second highest source limb
    121 	shr	%cl, %eax		C eax = bits shifted out = return value
    122 	mov	%edx, %ebx
    123 	shr	%cl, %edx
    124 	neg	cnt			C cl -> cnt
    125 	decl	VAR_COUNT
    126 	lea	4(rp), rp		C lea leaves the flags of decl intact
    127 	lea	-4(up), up
    128 	jz	L(end)			C jump if size == 2
    129 	push	%eax			FRAME_pushl()
    130 
    131 	ALIGN(8)
    132 L(top):	shl	%cl, %ebp		C cl = cnt here
    133 	or	%ebp, %edx		C edx = finished result limb
    134 	shl	%cl, %ebx
    135 	neg	cnt			C cl -> 32-cnt
    136 	mov	-4(up), %eax
    137 	mov	%eax, %ebp
    138 	mov	%edx, -4(rp)		C store result limb
    139 	shr	%cl, %eax
    140 	lea	-8(rp), rp
    141 L(lo1):	mov	-8(up), %edx		C cl = 32-cnt here
    142 	or	%ebx, %eax		C eax = finished result limb
    143 	mov	%edx, %ebx
    144 	shr	%cl, %edx
    145 	lea	-8(up), up
    146 	neg	cnt			C cl -> cnt
    147 	mov	%eax, (rp)		C store result limb
    148 	decl	VAR_COUNT
    149 	jg	L(top)
    150 
    151 	pop	%eax			FRAME_popl()
    152 L(end):
    153 	shl	%cl, %ebp
    154 	shl	%cl, %ebx		C lowest limb, zeros are shifted in
    155 	or	%ebp, %edx
    156 	mov	SAVE_EBP, %ebp		C restore caller ebp
    157 	mov	%edx, -4(rp)
    158 	mov	%ebx, -8(rp)
    159 
    160 L(quit):
    161 	mov	SAVE_UP, up		C restore caller esi
    162 	mov	SAVE_EBX, %ebx		C restore caller ebx
    163 	pop	rp			FRAME_popl()
    164 	ret
    165 
    166 L(special):
    167 deflit(`FRAME',4)
    168 	lea	3(%edx), %eax		C size + 3
    169 	dec	%edx			C size - 1
    170 	mov	(up), %ecx		C ecx = lowest source limb (cnt is 1 here)
    171 	shr	$2, %eax		C (size + 3) / 4
    172 	and	$3, %edx		C (size - 1) % 4, also clears CF
    173 	jz	L(goloop)		C jmp if  size == 1 (mod 4)
    174 	shr	%edx
    175 	jnc	L(odd)			C jump if  size == 3 (mod 4)
    176 
    177 	add	%ecx, %ecx		C first limb, starts the carry chain
    178 	lea	4(up), up
    179 	mov	%ecx, (rp)
    180 	mov	(up), %ecx
    181 	lea	4(rp), rp
    182 
    183 	dec	%edx			C dec keeps CF from the add above
    184 	jnz	L(goloop)		C jump if  size == 2 (mod 4)
    185 L(odd):	lea	-8(up), up		C bias pointers for the mid-loop entry
    186 	lea	-8(rp), rp
    187 	jmp	L(sentry)		C reached if size == 0 or 3 (mod 4)
    188 
    189 L(sloop):
    190 	adc	%ecx, %ecx		C limb*2 + carry from the limb below
    191 	mov	4(up), %edx
    192 	mov	%ecx, (rp)
    193 	adc	%edx, %edx
    194 	mov	8(up), %ecx
    195 	mov	%edx, 4(rp)
    196 L(sentry):
    197 	adc	%ecx, %ecx
    198 	mov	12(up), %edx
    199 	mov	%ecx, 8(rp)
    200 	adc	%edx, %edx
    201 	lea	16(up), up
    202 	mov	%edx, 12(rp)
    203 	lea	16(rp), rp
    204 	mov	(up), %ecx		C next limb, consumed by L(sloop) or L(squit)
    205 L(goloop):
    206 	decl	%eax			C decl keeps CF alive for the adc chain
    207 	jnz	L(sloop)
    208 
    209 L(squit):
    210 	adc	%ecx, %ecx		C most significant limb
    211 	mov	%ecx, (rp)
    212 	adc	%eax, %eax		C eax was 0, so eax = carry out = return value
    213 
    214 	mov	SAVE_UP, up		C restore caller esi
    215 	pop	rp			FRAME_popl()
    216 	ret
    217 EPILOGUE()
    218 ASM_END()
    219