Home | History | Annotate | Line # | Download | only in x86_64
      1      1.1  mrg dnl  AMD64 mpn_mod_1s_2p
      2      1.1  mrg 
      3      1.1  mrg dnl  Contributed to the GNU project by Torbjorn Granlund.
      4      1.1  mrg 
      5  1.1.1.2  mrg dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
      6      1.1  mrg 
      7      1.1  mrg dnl  This file is part of the GNU MP Library.
      8  1.1.1.2  mrg dnl
      9      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1.1.2  mrg dnl  it under the terms of either:
     11  1.1.1.2  mrg dnl
     12  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1.1.2  mrg dnl      option) any later version.
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl  or
     17  1.1.1.2  mrg dnl
     18  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1.1.2  mrg dnl      later version.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  or both in parallel, as here.
     23  1.1.1.2  mrg dnl
     24      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1.1.2  mrg dnl  for more details.
     28  1.1.1.2  mrg dnl
     29  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     32      1.1  mrg 
     33      1.1  mrg include(`../config.m4')
     34      1.1  mrg 
     35      1.1  mrg C	     cycles/limb
     36      1.1  mrg C AMD K8,K9	 4
     37      1.1  mrg C AMD K10	 4
     38      1.1  mrg C Intel P4	19
     39      1.1  mrg C Intel core2	 8
     40      1.1  mrg C Intel NHM	 6.5
     41      1.1  mrg C Intel SBR	 4.5
     42      1.1  mrg C Intel atom	28
     43      1.1  mrg C VIA nano	 8
     44      1.1  mrg 
     45      1.1  mrg ABI_SUPPORT(DOS64)	C macro defined outside this file; presumably declares that this file handles the DOS64 ABI (see the IFDOS lines below) -- confirm
     46      1.1  mrg ABI_SUPPORT(STD64)	C likewise for the STD64 ABI (see the IFSTD line below)
     47      1.1  mrg 
     48      1.1  mrg ASM_START()
     49      1.1  mrg 	TEXT		C macro defined outside this file; presumably selects the code section
     50      1.1  mrg 	ALIGN(16)	C presumably 16-byte alignment for the entry point that follows
     51      1.1  mrg PROLOGUE(mpn_mod_1s_2p)
                      C Reduce an n-limb operand to a single-limb remainder, folding in two
                      C limbs per step with the constants stored by mpn_mod_1s_2p_cps.
                      C
                      C Arguments as used after FUNC_ENTRY(4):
                      C   rdi  ap, pointer to the least significant limb
                      C   rsi  n, the limb count; n >= 1 is assumed (n = 0 would take the even
                      C        path and read below ap)
                      C   rdx  the divisor, kept in r14.  NOTE(review): the result is shifted
                      C        right by cnt at the end, which suggests the caller passes the
                      C        divisor already shifted left by cnt -- confirm with callers.
                      C   rcx  pointer to the precomputed table, kept in r13:
                      C          0: bi   8: cnt   16: B1modb   24: B2modb   32: B3modb
                      C Result: rax.
                      C
                      C Register roles in the loop:
                      C   r10 = B1modb, rbx = B2modb, rbp = B3modb
                      C   r8:r9 and r12:r11 take turns as the two-limb (high:low) accumulator
                      C   rdx:rax holds the product (old high limb * B3modb) that has been
                      C   computed but not yet added when control reaches L(top), L(ed0), L(ed1)
                      C Several adc instructions consume a carry produced a few lines earlier;
                      C the mov and lea instructions in between do not modify the flags.
     52      1.1  mrg 	FUNC_ENTRY(4)	C macro defined outside this file; 4 is the argument count (presumably remaps DOS64 argument registers -- confirm)
     53      1.1  mrg 	push	%r14
     54      1.1  mrg 	test	$1, R8(%rsi)	C ZF = 1 when n is even; used by je L(b0) below (mov and push leave flags alone)
     55      1.1  mrg 	mov	%rdx, %r14	C r14 = divisor argument
     56      1.1  mrg 	push	%r13
     57      1.1  mrg 	mov	%rcx, %r13	C r13 = table pointer
     58      1.1  mrg 	push	%r12
     59      1.1  mrg 	push	%rbp
     60      1.1  mrg 	push	%rbx
     61      1.1  mrg 	mov	16(%rcx), %r10	C r10 = B1modb
     62      1.1  mrg 	mov	24(%rcx), %rbx	C rbx = B2modb
     63      1.1  mrg 	mov	32(%rcx), %rbp	C rbp = B3modb
     64      1.1  mrg 	je	L(b0)		C n even: start from the top two limbs
     65      1.1  mrg 	dec	%rsi		C n odd: rsi = n-1
     66      1.1  mrg 	je	L(one)		C n = 1: single limb, skip straight to the final reduction
     67      1.1  mrg 	mov	-8(%rdi,%rsi,8), %rax	C rax = ap[n-2]
     68      1.1  mrg 	mul	%r10		C rdx:rax = ap[n-2] * B1modb
     69      1.1  mrg 	mov	%rax, %r9
     70      1.1  mrg 	mov	%rdx, %r8	C r8:r9 = that product
     71      1.1  mrg 	mov	(%rdi,%rsi,8), %rax	C rax = ap[n-1], the most significant limb
     72      1.1  mrg 	add	-16(%rdi,%rsi,8), %r9	C r8:r9 += ap[n-3]
     73      1.1  mrg 	adc	$0, %r8
     74      1.1  mrg 	mul	%rbx		C rdx:rax = ap[n-1] * B2modb
     75      1.1  mrg 	add	%rax, %r9
     76      1.1  mrg 	adc	%rdx, %r8	C r8:r9 = ap[n-3] + ap[n-2]*B1modb + ap[n-1]*B2modb
     77      1.1  mrg 	jmp	L(11)
     78      1.1  mrg 
     79      1.1  mrg L(b0):	mov	-8(%rdi,%rsi,8), %r8	C r8 = ap[n-1] (high limb of accumulator)
     80      1.1  mrg 	mov	-16(%rdi,%rsi,8), %r9	C r9 = ap[n-2] (low limb of accumulator)
     81      1.1  mrg 
     82      1.1  mrg L(11):	sub	$4, %rsi	C rsi is n (even path) or n-1 (odd path); borrow means no limb pair is left
     83      1.1  mrg 	jb	L(ed2)
     84      1.1  mrg 	lea	40(%rdi,%rsi,8), %rdi	C rdi = ap + 8*rsi + 40, so the next pair sits at -40 and -32
     85      1.1  mrg 	mov	-40(%rdi), %r11	C r11 = low limb of the next pair
     86      1.1  mrg 	mov	-32(%rdi), %rax	C rax = high limb of the next pair
     87      1.1  mrg 	jmp	L(m0)
     88      1.1  mrg 
     89      1.1  mrg 	ALIGN(16)
     90      1.1  mrg L(top):	mov	-24(%rdi), %r9	C first half: accumulator is r12:r11, new one is built in r8:r9; r9 = low limb of pair
     91      1.1  mrg 	add	%rax, %r11	C add the pending product r8*B3modb into r12:r11
     92      1.1  mrg 	mov	-16(%rdi), %rax	C rax = high limb of pair
     93      1.1  mrg 	adc	%rdx, %r12
     94      1.1  mrg 	mul	%r10		C rdx:rax = high limb * B1modb
     95      1.1  mrg 	add	%rax, %r9
     96      1.1  mrg 	mov	%r11, %rax
     97      1.1  mrg 	mov	%rdx, %r8
     98      1.1  mrg 	adc	$0, %r8		C carry from the add three lines up
     99      1.1  mrg 	mul	%rbx		C rdx:rax = r11 * B2modb
    100      1.1  mrg 	add	%rax, %r9
    101      1.1  mrg 	mov	%r12, %rax
    102      1.1  mrg 	adc	%rdx, %r8
    103      1.1  mrg 	mul	%rbp		C rdx:rax = r12 * B3modb, left pending
    104      1.1  mrg 	sub	$2, %rsi
    105      1.1  mrg 	jb	L(ed1)		C no more limbs: r8:r9 plus the pending product is the result so far
    106      1.1  mrg 	mov	-40(%rdi), %r11	C second half: r11 = low limb of next pair
    107      1.1  mrg 	add	%rax, %r9	C add the pending product into r8:r9
    108      1.1  mrg 	mov	-32(%rdi), %rax	C rax = high limb of next pair
    109      1.1  mrg 	adc	%rdx, %r8
    110      1.1  mrg L(m0):	mul	%r10		C rdx:rax = high limb * B1modb
    111      1.1  mrg 	add	%rax, %r11
    112      1.1  mrg 	mov	%r9, %rax
    113      1.1  mrg 	mov	%rdx, %r12
    114      1.1  mrg 	adc	$0, %r12	C carry from the add three lines up
    115      1.1  mrg 	mul	%rbx		C rdx:rax = r9 * B2modb
    116      1.1  mrg 	add	%rax, %r11
    117      1.1  mrg 	lea	-32(%rdi), %rdi		C ap -= 4; lea keeps the carry from the add for the adc below
    118      1.1  mrg 	mov	%r8, %rax
    119      1.1  mrg 	adc	%rdx, %r12
    120      1.1  mrg 	mul	%rbp		C rdx:rax = r8 * B3modb, left pending
    121      1.1  mrg 	sub	$2, %rsi
    122      1.1  mrg 	jae	L(top)
    123      1.1  mrg 
    124      1.1  mrg L(ed0):	mov	%r11, %r9	C loop left after the second half: move accumulator r12:r11 into r8:r9
    125      1.1  mrg 	mov	%r12, %r8
    126      1.1  mrg L(ed1):	add	%rax, %r9	C add the pending B3modb product
    127      1.1  mrg 	adc	%rdx, %r8
    128      1.1  mrg L(ed2):	mov	8(%r13), R32(%rdi)		C cnt
    129      1.1  mrg 	mov	%r8, %rax
    130      1.1  mrg 	mov	%r9, %r8
    131      1.1  mrg 	mul	%r10		C fold the high limb once more: rdx:rax = high * B1modb
    132      1.1  mrg 	add	%rax, %r8
    133      1.1  mrg 	adc	$0, %rdx	C rdx:r8 = low + high*B1modb
    134      1.1  mrg L(1):	xor	R32(%rcx), R32(%rcx)	C final step: shift rdx:r8 left by cnt, then divide by r14 using bi
    135      1.1  mrg 	mov	%r8, %r9
    136      1.1  mrg 	sub	R32(%rdi), R32(%rcx)	C ecx = -cnt
    137      1.1  mrg 	shr	R8(%rcx), %r9	C r9 = r8 >> (64-cnt); the 64-bit shift uses cl modulo 64
    138      1.1  mrg 	mov	R32(%rdi), R32(%rcx)	C ecx = cnt
    139      1.1  mrg 	sal	R8(%rcx), %rdx
    140      1.1  mrg 	or	%rdx, %r9	C r9 = high limb of (rdx:r8) << cnt
    141      1.1  mrg 	sal	R8(%rcx), %r8	C r8 = low limb of (rdx:r8) << cnt
    142      1.1  mrg 	mov	%r9, %rax
    143      1.1  mrg 	mulq	(%r13)		C rdx:rax = r9 * bi
    144      1.1  mrg 	mov	%rax, %rsi	C rsi = low half of the product
    145      1.1  mrg 	inc	%r9
    146      1.1  mrg 	add	%r8, %rsi
    147      1.1  mrg 	adc	%r9, %rdx	C rdx = quotient estimate
    148      1.1  mrg 	imul	%r14, %rdx
    149      1.1  mrg 	sub	%rdx, %r8	C r8 = candidate remainder
    150      1.1  mrg 	lea	(%r8,%r14), %rax	C rax = candidate + divisor
    151      1.1  mrg 	cmp	%r8, %rsi	C CF = 1 when rsi < r8 (unsigned)
    152      1.1  mrg 	cmovc	%rax, %r8	C first adjustment: add the divisor back
    153      1.1  mrg 	mov	%r8, %rax
    154      1.1  mrg 	sub	%r14, %rax
    155      1.1  mrg 	cmovc	%r8, %rax	C second adjustment: keep r8 when r8 - divisor borrowed
    156      1.1  mrg 	mov	R32(%rdi), R32(%rcx)
    157      1.1  mrg 	shr	R8(%rcx), %rax	C undo the shift by cnt; rax is the return value
    158      1.1  mrg 	pop	%rbx
    159      1.1  mrg 	pop	%rbp
    160      1.1  mrg 	pop	%r12
    161      1.1  mrg 	pop	%r13
    162      1.1  mrg 	pop	%r14
    163      1.1  mrg 	FUNC_EXIT()
    164      1.1  mrg 	ret
    165      1.1  mrg L(one):
    166      1.1  mrg 	mov	(%rdi), %r8	C n = 1: low limb = ap[0]
    167      1.1  mrg 	mov	8(%rcx), R32(%rdi)	C edi = cnt (rcx still holds the table pointer here)
    168      1.1  mrg 	xor	%rdx, %rdx	C high limb = 0
    169      1.1  mrg 	jmp	L(1)
    170      1.1  mrg EPILOGUE()
    171      1.1  mrg 
    172      1.1  mrg 	ALIGN(16)
    173      1.1  mrg PROLOGUE(mpn_mod_1s_2p_cps)
                      C Precompute the five-limb table read by mpn_mod_1s_2p.
                      C
                      C Arguments as used after FUNC_ENTRY(2):
                      C   rdi  pointer to the table to fill, kept in rbx
                      C   rsi  the divisor b; b != 0 is assumed (bsr leaves its destination
                      C        undefined for a zero source)
                      C Table written:
                      C   0: bi, the value returned by mpn_invert_limb(b << cnt)
                      C   8: cnt, the number of leading zero bits of b
                      C  16: B1modb   24: B2modb   32: B3modb, each stored shifted right by cnt
                      C rbx, rbp and r12 are saved because the values placed in them are
                      C needed again after the call.
    174      1.1  mrg 	FUNC_ENTRY(2)	C macro defined outside this file; 2 is the argument count
    175      1.1  mrg 	push	%rbp
    176      1.1  mrg 	bsr	%rsi, %rcx	C rcx = index of the highest set bit of b
    177      1.1  mrg 	push	%rbx
    178      1.1  mrg 	mov	%rdi, %rbx	C rbx = table pointer
    179      1.1  mrg 	push	%r12
    180      1.1  mrg 	xor	$63, R32(%rcx)	C ecx = 63 - index = cnt
    181      1.1  mrg 	mov	%rsi, %r12
    182      1.1  mrg 	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
    183      1.1  mrg 	sal	R8(%rcx), %r12		C b << cnt
    184      1.1  mrg IFSTD(`	mov	%r12, %rdi	')	C pass parameter
    185      1.1  mrg IFDOS(`	mov	%r12, %rcx	')	C pass parameter
    186  1.1.1.3  mrg IFDOS(`	sub	$32, %rsp	')	C DOS64 only: reserve 32 bytes of stack for the callee
    187  1.1.1.2  mrg 	ASSERT(nz, `test $15, %rsp')	C stack alignment check; ASSERT is defined outside this file, exact expansion not visible here
    188      1.1  mrg 	CALL(	mpn_invert_limb)	C rax = bi
    189  1.1.1.3  mrg IFDOS(`	add	$32, %rsp	')	C DOS64 only: release the 32 bytes again
    190      1.1  mrg 	mov	%r12, %r8
    191      1.1  mrg 	mov	%rax, %r11	C keep a copy of bi; rax is overwritten by the mul instructions
    192      1.1  mrg 	mov	%rax, (%rbx)		C store bi
    193      1.1  mrg 	mov	%rbp, 8(%rbx)		C store cnt
    194      1.1  mrg 	neg	%r8		C r8 = -(b << cnt)
    195      1.1  mrg 	mov	R32(%rbp), R32(%rcx)	C ecx = cnt for the shifts below
    196      1.1  mrg 	mov	$1, R32(%rsi)	C rsi = 1 (32-bit write clears the upper half)
    197      1.1  mrg ifdef(`SHLD_SLOW',`
    198      1.1  mrg 	shl	R8(%rcx), %rsi	C rsi = 1 << cnt
    199      1.1  mrg 	neg	R32(%rcx)	C cl = -cnt, used as shift count 64-cnt
    200      1.1  mrg 	mov	%rax, %rbp	C save bi in rbp; cnt is already stored and also in ecx
    201      1.1  mrg 	shr	R8(%rcx), %rax	C rax = bi >> (64-cnt)
    202      1.1  mrg 	or	%rax, %rsi	C rsi = (1 << cnt) | (bi >> (64-cnt))
    203      1.1  mrg 	mov	%rbp, %rax	C restore bi
    204      1.1  mrg 	neg	R32(%rcx)	C ecx = cnt again
    205      1.1  mrg ',`
    206      1.1  mrg 	shld	R8(%rcx), %rax, %rsi	C rsi = (1 << cnt) | (bi >> (64-cnt)).  FIXME: Slow on Atom and Nano
    207      1.1  mrg ')
    208      1.1  mrg 	imul	%r8, %rsi	C rsi = -(b << cnt) * rsi, low 64 bits: B1modb before the final shift
    209      1.1  mrg 	mul	%rsi		C rdx:rax = bi * rsi (rax still holds bi)
    210      1.1  mrg 
    211      1.1  mrg 	add	%rsi, %rdx	C rdx = high half + rsi
    212      1.1  mrg 	shr	R8(%rcx), %rsi
    213      1.1  mrg 	mov	%rsi, 16(%rbx)		C store B1modb
    214      1.1  mrg 
    215      1.1  mrg 	not	%rdx
    216      1.1  mrg 	imul	%r12, %rdx	C rdx = ~rdx * (b << cnt), low 64 bits
    217      1.1  mrg 	lea	(%rdx,%r12), %rsi	C rsi = rdx + (b << cnt)
    218      1.1  mrg 	cmp	%rdx, %rax	C CF = 1 when rax < rdx (unsigned); rax is the low half of the product
    219      1.1  mrg 	cmovnc	%rdx, %rsi	C no borrow: use rdx without the added b << cnt
    220      1.1  mrg 	mov	%r11, %rax	C rax = bi
    221      1.1  mrg 	mul	%rsi		C rdx:rax = bi * rsi, same step again for the next constant
    222      1.1  mrg 
    223      1.1  mrg 	add	%rsi, %rdx
    224      1.1  mrg 	shr	R8(%rcx), %rsi
    225      1.1  mrg 	mov	%rsi, 24(%rbx)		C store B2modb
    226      1.1  mrg 
    227      1.1  mrg 	not	%rdx
    228      1.1  mrg 	imul	%r12, %rdx
    229      1.1  mrg 	add	%rdx, %r12	C r12 = rdx + (b << cnt); b << cnt is not needed after this point
    230      1.1  mrg 	cmp	%rdx, %rax
    231      1.1  mrg 	cmovnc	%rdx, %r12
    232      1.1  mrg 
    233      1.1  mrg 	shr	R8(%rcx), %r12
    234      1.1  mrg 	mov	%r12, 32(%rbx)		C store B3modb
    235      1.1  mrg 
    236      1.1  mrg 	pop	%r12
    237      1.1  mrg 	pop	%rbx
    238      1.1  mrg 	pop	%rbp
    239      1.1  mrg 	FUNC_EXIT()
    240      1.1  mrg 	ret
    241      1.1  mrg EPILOGUE()
    242