Home | History | Annotate | Line # | Download | only in pentium4
      1      1.1  mrg dnl  x86-64 mpn_rshift optimized for Pentium 4.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.3  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21  1.1.1.3  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26  1.1.1.3  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C	     cycles/limb
     35  1.1.1.2  mrg C AMD K8,K9	 2.5
     36  1.1.1.2  mrg C AMD K10	 ?
     37  1.1.1.2  mrg C Intel P4	 3.29
     38  1.1.1.2  mrg C Intel core2	 2.1 (fluctuates, presumably cache related)
     39  1.1.1.2  mrg C Intel corei	 ?
     40  1.1.1.2  mrg C Intel atom	14.3
     41  1.1.1.2  mrg C VIA nano	 ?
     42      1.1  mrg 
     43      1.1  mrg C INPUT PARAMETERS
     44      1.1  mrg define(`rp',`%rdi')		C result pointer; every limb store goes through it
     45      1.1  mrg define(`up',`%rsi')		C source pointer; every limb load goes through it
     46      1.1  mrg define(`n',`%rdx')		C limb count; negated and reused as the running index
     47      1.1  mrg define(`cnt',`%cl')		C shift count in bits; the function body names %rcx via R32/R8 instead
     48      1.1  mrg 
     49  1.1.1.2  mrg ABI_SUPPORT(DOS64)		C presumably marks the file as usable under both 64-bit ABIs;
     50  1.1.1.2  mrg ABI_SUPPORT(STD64)		C the macro is defined outside this file -- confirm there
     51  1.1.1.2  mrg 
     52      1.1  mrg ASM_START()
     53      1.1  mrg 	TEXT
     54      1.1  mrg 	ALIGN(32)			C 32-byte alignment request for the entry point that follows
     55      1.1  mrg PROLOGUE(mpn_rshift)
                      C Shift the n-limb operand at up right by the bit count in %cl and store
                      C the n result limbs at rp.  Limbs are 64 bits and are produced from the
                      C least significant limb upwards, each as
                      C   rp[i] = (up[i] >> cnt) | (up[i+1] << lcnt),   lcnt = -cnt mod 64
                      C with the top limb being just up[n-1] >> cnt.  %rax returns up[0] << lcnt,
                      C i.e. the bits shifted out at the low end, placed in the high bits.
                      C Nothing here checks the arguments: limb 0 is read unconditionally and
                      C cnt = 0 would give lcnt = 0.  Assumes n >= 1 and 1 <= cnt <= 63 -- TODO
                      C confirm against the documented mpn_rshift contract.
                      C Register use: %mm4 = cnt, %mm5 = lcnt, %mm0-%mm3 = limb temporaries,
                      C %r8 = entry-path selector.  Four limbs are handled per loop iteration.
     56  1.1.1.2  mrg 	FUNC_ENTRY(4)			C presumably the DOS64 argument remap for 4 args -- confirm in the macro defs
     57      1.1  mrg 	mov	(up), %rax		C rax = up[0], source of the return value
     58  1.1.1.2  mrg 	movd	R32(%rcx), %mm4		C mm4 = cnt, the right shift count
     59  1.1.1.2  mrg 	neg	R32(%rcx)			C put lsh count in cl
     60  1.1.1.2  mrg 	and	$63, R32(%rcx)		C ecx = -cnt mod 64, which is 64-cnt for cnt in 1..63
     61  1.1.1.2  mrg 	movd	R32(%rcx), %mm5		C mm5 = lcnt, the left shift count
     62      1.1  mrg 
     63      1.1  mrg 	lea	-8(up,n,8), up		C up -> most significant source limb
     64      1.1  mrg 	lea	-8(rp,n,8), rp		C rp -> most significant result limb
     65  1.1.1.2  mrg 	lea	1(n), R32(%r8)		C r8d = n+1, reduced mod 4 below to pick the entry path
     66      1.1  mrg 	neg	n			C n = -n, so 8(up,n,8) is limb 0 and n counts up towards 0
     67      1.1  mrg 
     68  1.1.1.2  mrg 	shl	R8(%rcx), %rax		C function return value
     69      1.1  mrg 
     70  1.1.1.2  mrg 	and	$3, R32(%r8)		C r8d = (n+1) mod 4
     71      1.1  mrg 	je	L(rol)			C jump for n = 3, 7, 11, ...
     72      1.1  mrg 
     73  1.1.1.2  mrg 	dec	R32(%r8)
     74      1.1  mrg 	jne	L(1)
     75      1.1  mrg C	n = 4, 8, 12, ...: peel one limb so that 3 mod 4 limbs remain for L(rol)
     76      1.1  mrg 	movq	8(up,n,8), %mm2
     77      1.1  mrg 	psrlq	%mm4, %mm2
     78      1.1  mrg 	movq	16(up,n,8), %mm0
     79      1.1  mrg 	psllq	%mm5, %mm0
     80      1.1  mrg 	por	%mm0, %mm2		C mm2 = (up[0] >> cnt) | (up[1] << lcnt)
     81      1.1  mrg 	movq	%mm2, 8(rp,n,8)
     82      1.1  mrg 	inc	n
     83      1.1  mrg 	jmp	L(rol)
     84      1.1  mrg 
     85  1.1.1.2  mrg L(1):	dec	R32(%r8)
     86      1.1  mrg 	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
     87      1.1  mrg C	n = 2, 6, 10, 14, ...: peel one limb, then share the n = 1 mod 4 path
     88      1.1  mrg 	movq	8(up,n,8), %mm2
     89      1.1  mrg 	psrlq	%mm4, %mm2
     90      1.1  mrg 	movq	16(up,n,8), %mm0
     91      1.1  mrg 	psllq	%mm5, %mm0
     92      1.1  mrg 	por	%mm0, %mm2
     93      1.1  mrg 	movq	%mm2, 8(rp,n,8)
     94      1.1  mrg 	inc	n
     95      1.1  mrg L(1x):
     96      1.1  mrg 	cmp	$-1, n			C only the top limb left?
     97      1.1  mrg 	je	L(ast)
     98      1.1  mrg 	movq	8(up,n,8), %mm2		C otherwise peel two limbs, leaving 3 mod 4 for L(rol)
     99      1.1  mrg 	psrlq	%mm4, %mm2
    100      1.1  mrg 	movq	16(up,n,8), %mm3
    101      1.1  mrg 	psrlq	%mm4, %mm3
    102      1.1  mrg 	movq	16(up,n,8), %mm0
    103      1.1  mrg 	movq	24(up,n,8), %mm1
    104      1.1  mrg 	psllq	%mm5, %mm0
    105      1.1  mrg 	por	%mm0, %mm2
    106      1.1  mrg 	psllq	%mm5, %mm1
    107      1.1  mrg 	por	%mm1, %mm3
    108      1.1  mrg 	movq	%mm2, 8(rp,n,8)
    109      1.1  mrg 	movq	%mm3, 16(rp,n,8)
    110      1.1  mrg 	add	$2, n
    111      1.1  mrg 
    112      1.1  mrg L(rol):	movq	8(up,n,8), %mm2		C preload the rsh halves of the next two limbs
    113      1.1  mrg 	psrlq	%mm4, %mm2
    114      1.1  mrg 	movq	16(up,n,8), %mm3
    115      1.1  mrg 	psrlq	%mm4, %mm3
    116      1.1  mrg 
    117      1.1  mrg 	add	$4, n			C				      4
    118      1.1  mrg 	jb	L(end)			C				      2
    119      1.1  mrg 	ALIGN(32)
    120      1.1  mrg L(top):
    121      1.1  mrg 	C finish the two limbs whose rsh halves are already in mm2/mm3
    122      1.1  mrg 	movq	-16(up,n,8), %mm0
    123      1.1  mrg 	movq	-8(up,n,8), %mm1
    124      1.1  mrg 	psllq	%mm5, %mm0
    125      1.1  mrg 	por	%mm0, %mm2
    126      1.1  mrg 	psllq	%mm5, %mm1
    127      1.1  mrg 	movq	(up,n,8), %mm0
    128      1.1  mrg 	por	%mm1, %mm3
    129      1.1  mrg 	movq	8(up,n,8), %mm1
    130      1.1  mrg 	movq	%mm2, -24(rp,n,8)
    131      1.1  mrg 	movq	%mm3, -16(rp,n,8)
    132      1.1  mrg 	C start two new lsh
    133      1.1  mrg 	psllq	%mm5, %mm0
    134      1.1  mrg 	psllq	%mm5, %mm1
    135      1.1  mrg 
    136      1.1  mrg 	C finish those two limbs by merging in their rsh halves
    137      1.1  mrg 	movq	-8(up,n,8), %mm2
    138      1.1  mrg 	movq	(up,n,8), %mm3
    139      1.1  mrg 	psrlq	%mm4, %mm2
    140      1.1  mrg 	por	%mm2, %mm0
    141      1.1  mrg 	psrlq	%mm4, %mm3
    142      1.1  mrg 	movq	8(up,n,8), %mm2
    143      1.1  mrg 	por	%mm3, %mm1
    144      1.1  mrg 	movq	16(up,n,8), %mm3
    145      1.1  mrg 	movq	%mm0, -8(rp,n,8)
    146      1.1  mrg 	movq	%mm1, (rp,n,8)
    147      1.1  mrg 	C start two new rsh for the next round; the add sets the carry tested by jae
    148      1.1  mrg 	add	$4, n
    149      1.1  mrg 	psrlq	%mm4, %mm2
    150      1.1  mrg 	psrlq	%mm4, %mm3
    151      1.1  mrg 
    152      1.1  mrg 	jae	L(top)			C				      2
    153      1.1  mrg L(end):
    154  1.1.1.2  mrg 	movq	-8(up), %mm0		C finish the last two preloaded limbs: rp[n-3] and rp[n-2]
    155      1.1  mrg 	psllq	%mm5, %mm0
    156      1.1  mrg 	por	%mm0, %mm2
    157  1.1.1.2  mrg 	movq	(up), %mm1
    158      1.1  mrg 	psllq	%mm5, %mm1
    159      1.1  mrg 	por	%mm1, %mm3
    160  1.1.1.2  mrg 	movq	%mm2, -16(rp)
    161  1.1.1.2  mrg 	movq	%mm3, -8(rp)
    162      1.1  mrg 
    163      1.1  mrg L(ast):	movq	(up), %mm2		C top limb: right shift only, nothing above to merge in
    164      1.1  mrg 	psrlq	%mm4, %mm2
    165      1.1  mrg 	movq	%mm2, (rp)
    166      1.1  mrg 	emms				C clear the MMX state before returning
    167  1.1.1.2  mrg 	FUNC_EXIT()			C presumably the counterpart of FUNC_ENTRY -- confirm in the macro defs
    168      1.1  mrg 	ret
    169      1.1  mrg EPILOGUE()
    170