Home | History | Annotate | Line # | Download | only in p6
dnl  Intel P6 mpn_lshsub_n -- mpn papillon support.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2006 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
     34      1.1  mrg 
C (1) The loop is not scheduled in any way, and scheduling attempts have not
C     improved speed on P6/13.  Presumably the K7 will want scheduling, if it
C     wants to use MMX at all.
C (2) We could save a register by not alternating between eax and edx in the
C     loop.
     40      1.1  mrg 
     41      1.1  mrg define(`rp',	`%edi')
     42      1.1  mrg define(`up',	`%esi')
     43      1.1  mrg define(`vp',	`%ebx')
     44      1.1  mrg define(`n',	`%ecx')
     45      1.1  mrg define(`cnt',	`%mm7')
     46      1.1  mrg 
     47      1.1  mrg ASM_START()
     48      1.1  mrg 
     49      1.1  mrg 	TEXT
     50      1.1  mrg 	ALIGN(16)
     51      1.1  mrg 
     52      1.1  mrg PROLOGUE(mpn_lshsub_n)
     53      1.1  mrg 	push	%edi
     54      1.1  mrg 	push	%esi
     55      1.1  mrg 	push	%ebx
     56      1.1  mrg 
     57      1.1  mrg 	mov	16(%esp), rp
     58      1.1  mrg 	mov	20(%esp), up
     59      1.1  mrg 	mov	24(%esp), vp
     60      1.1  mrg 	mov	28(%esp), n
     61      1.1  mrg 	mov	$32, %eax
     62      1.1  mrg 	sub	32(%esp), %eax
     63      1.1  mrg 	movd	%eax, cnt
     64      1.1  mrg 
     65      1.1  mrg 	lea	(up,n,4), up
     66      1.1  mrg 	lea	(vp,n,4), vp
     67      1.1  mrg 	lea	(rp,n,4), rp
     68      1.1  mrg 
     69      1.1  mrg 	neg	n
     70      1.1  mrg 	mov	n, %eax
     71      1.1  mrg 	and	$-8, n
     72      1.1  mrg 	and	$7, %eax
     73      1.1  mrg 	shl	%eax				C eax = 2x
     74      1.1  mrg 	lea	(%eax,%eax,4), %edx		C edx = 10x
     75      1.1  mrg ifdef(`PIC',`
     76      1.1  mrg 	call	L(pic_calc)
     77      1.1  mrg L(here):
     78      1.1  mrg ',`
     79      1.1  mrg 	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
     80      1.1  mrg ')
     81      1.1  mrg 
     82      1.1  mrg 	pxor	%mm1, %mm1
     83      1.1  mrg 	pxor	%mm0, %mm0
     84      1.1  mrg 
     85      1.1  mrg 	jmp	*%eax
     86      1.1  mrg 
     87      1.1  mrg ifdef(`PIC',`
     88      1.1  mrg L(pic_calc):
     89      1.1  mrg 	C See mpn/x86/README about old gas bugs
     90      1.1  mrg 	lea	(%eax,%edx,2), %eax
     91      1.1  mrg 	add	$L(ent)-L(here), %eax
     92      1.1  mrg 	add	(%esp), %eax
     93      1.1  mrg 	ret_internal
     94      1.1  mrg ')
     95      1.1  mrg 
     96      1.1  mrg L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
     97      1.1  mrg 	sbb	%eax, %eax
     98      1.1  mrg 	neg	%eax
     99      1.1  mrg 	mov	32(%esp), %ecx
    100      1.1  mrg 	shld	%cl, %edx, %eax
    101      1.1  mrg 
    102      1.1  mrg 	emms
    103      1.1  mrg 
    104      1.1  mrg 	pop	%ebx
    105      1.1  mrg 	pop	%esi
    106      1.1  mrg 	pop	%edi
    107      1.1  mrg 	ret
    108      1.1  mrg 	ALIGN(16)
    109      1.1  mrg L(top):	jecxz	L(end)
    110      1.1  mrg L(ent):	mov	   0(up,n,4), %eax
    111      1.1  mrg 	sbb	   0(vp,n,4), %eax
    112      1.1  mrg 	movd	   %eax, %mm0
    113      1.1  mrg 	punpckldq  %mm0, %mm1
    114      1.1  mrg 	psrlq	   %mm7, %mm1
    115      1.1  mrg 	movd	   %mm1, 0(rp,n,4)
    116      1.1  mrg 
    117      1.1  mrg 	mov	   4(up,n,4), %edx
    118      1.1  mrg 	sbb	   4(vp,n,4), %edx
    119      1.1  mrg 	movd	   %edx, %mm1
    120      1.1  mrg 	punpckldq  %mm1, %mm0
    121      1.1  mrg 	psrlq	   %mm7, %mm0
    122      1.1  mrg 	movd	   %mm0, 4(rp,n,4)
    123      1.1  mrg 
    124      1.1  mrg 	mov	   8(up,n,4), %eax
    125      1.1  mrg 	sbb	   8(vp,n,4), %eax
    126      1.1  mrg 	movd	   %eax, %mm0
    127      1.1  mrg 	punpckldq  %mm0, %mm1
    128      1.1  mrg 	psrlq	   %mm7, %mm1
    129      1.1  mrg 	movd	   %mm1, 8(rp,n,4)
    130      1.1  mrg 
    131      1.1  mrg 	mov	   12(up,n,4), %edx
    132      1.1  mrg 	sbb	   12(vp,n,4), %edx
    133      1.1  mrg 	movd	   %edx, %mm1
    134      1.1  mrg 	punpckldq  %mm1, %mm0
    135      1.1  mrg 	psrlq	   %mm7, %mm0
    136      1.1  mrg 	movd	   %mm0, 12(rp,n,4)
    137      1.1  mrg 
    138      1.1  mrg 	mov	   16(up,n,4), %eax
    139      1.1  mrg 	sbb	   16(vp,n,4), %eax
    140      1.1  mrg 	movd	   %eax, %mm0
    141      1.1  mrg 	punpckldq  %mm0, %mm1
    142      1.1  mrg 	psrlq	   %mm7, %mm1
    143      1.1  mrg 	movd	   %mm1, 16(rp,n,4)
    144      1.1  mrg 
    145      1.1  mrg 	mov	   20(up,n,4), %edx
    146      1.1  mrg 	sbb	   20(vp,n,4), %edx
    147      1.1  mrg 	movd	   %edx, %mm1
    148      1.1  mrg 	punpckldq  %mm1, %mm0
    149      1.1  mrg 	psrlq	   %mm7, %mm0
    150      1.1  mrg 	movd	   %mm0, 20(rp,n,4)
    151      1.1  mrg 
    152      1.1  mrg 	mov	   24(up,n,4), %eax
    153      1.1  mrg 	sbb	   24(vp,n,4), %eax
    154      1.1  mrg 	movd	   %eax, %mm0
    155      1.1  mrg 	punpckldq  %mm0, %mm1
    156      1.1  mrg 	psrlq	   %mm7, %mm1
    157      1.1  mrg 	movd	   %mm1, 24(rp,n,4)
    158      1.1  mrg 
    159      1.1  mrg 	mov	   28(up,n,4), %edx
    160      1.1  mrg 	sbb	   28(vp,n,4), %edx
    161      1.1  mrg 	movd	   %edx, %mm1
    162      1.1  mrg 	punpckldq  %mm1, %mm0
    163      1.1  mrg 	psrlq	   %mm7, %mm0
    164      1.1  mrg 	movd	   %mm0, 28(rp,n,4)
    165      1.1  mrg 
    166      1.1  mrg 	lea	   8(n), n
    167      1.1  mrg 	jmp	   L(top)
    168      1.1  mrg 
    169      1.1  mrg EPILOGUE()
    170