Home | History | Annotate | Line # | Download | only in adx
      1 dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.
      2 
      3 dnl  Contributed to the GNU project by Torbjörn Granlund.
      4 
      5 dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C	     cycles/limb
     36 C AMD K8,K9	 -
     37 C AMD K10	 -
     38 C AMD bd1	 -
     39 C AMD bd2	 -
     40 C AMD bd3	 -
     41 C AMD bd4	 -
     42 C AMD zen	 ?
     43 C AMD bt1	 -
     44 C AMD bt2	 -
     45 C Intel P4	 -
     46 C Intel PNR	 -
     47 C Intel NHM	 -
     48 C Intel SBR	 -
     49 C Intel IBR	 -
     50 C Intel HWL	 -
     51 C Intel BWL	 ?
     52 C Intel SKL	 ?
     53 C Intel atom	 -
     54 C Intel SLM	 -
     55 C VIA nano	 -
     56 
     57 define(`rp',      `%rdi')	dnl rcx; arg 1, limb area that is read and written
     58 define(`up',      `%rsi')	dnl rdx; arg 2, limb area that is only read
     59 define(`n_param', `%rdx')	dnl r8;  arg 3, limb count as passed in
     60 define(`v0_param',`%rcx')	dnl r9;  arg 4, multiplier limb as passed in
     61 
        C The leading rcx, rdx, r8, r9 in the notes above are the Microsoft x64
        C argument registers in the same order.  NOTE(review): the visible code
        C shows no DOS64 entry handling - confirm whether that ABI is meant to
        C be covered.
        C
        C n and v0 name the same two values after the prologue has swapped rcx
        C and rdx: the count then sits in rcx, which jrcxz tests, and the
        C multiplier in rdx, which mulx reads implicitly.
     62 define(`n',       `%rcx')	dnl limb index used by the code body
     63 define(`v0',      `%rdx')	dnl multiplier used by the code body
     64 
     65 C Testing mechanism for running this on older AMD64 processors.
        C When FAKE_MULXADX is 1, missing-call.m4 is included from the x86_64
        C directory under CONFIG_TOP_SRCDIR instead of the definitions below.
        C Its contents are not visible here; presumably it supplies adox, adcx
        C and mulx as emulation sequences - confirm.
        C Otherwise the three macros just emit the native instruction with the
        C operands they were given, so the code body can write every use as a
        C macro call in either configuration.
     66 ifelse(FAKE_MULXADX,1,`
     67   include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
     68 ',`
     69   define(`adox',	``adox'	$1, $2')
     70   define(`adcx',	``adcx'	$1, $2')
     71   define(`mulx',	``mulx'	$1, $2, $3')
     72 ')
     73 
     74 ASM_START()
     75 	TEXT
     76 	ALIGN(16)
        C ---------------------------------------------------------------------
        C mpn_addmul_1(rp, up, n, v0)
        C
        C Multiplies each of the n limbs at up by the limb v0, adds the products
        C into the n limbs at rp in place, and returns the final carry limb in
        C rax.
        C
        C Entry registers, per the defines earlier in this file:
        C   rdi = rp, rsi = up, rdx = n_param (limb count), rcx = v0_param
        C There is no path for a zero limb count; assumes n >= 1 - TODO confirm
        C against callers.
        C
        C Saved and restored: rbx, r12, r13.  Also written: rax, rcx, rdx, rsi,
        C rdi, r8-r11 and the flags.
        C
        C Scheme: mulx leaves the flags alone, so two carry chains run side by
        C side through the whole operand:
        C   CF chain (adcx): high half of one product + low half of the next
        C   OF chain (adox): that sum + the existing limb loaded from rp
        C The loop is unrolled four ways with the (low, high) product pairs
        C rotating through (r9,r8), (r11,r10), (r13,r12), (rbx,rax).  Everything
        C between the chain links (mov, lea, jrcxz, jmp) is flag-neutral.
        C ---------------------------------------------------------------------
     77 PROLOGUE(mpn_addmul_1)
        C NOTE(review): r8 is overwritten by a mulx before it is read on every
        C path below, so this load only touches up[0]; presumably a cache
        C warm-up - confirm before removing.
     78 	mov	(up), %r8
     79 
        C rbx, r12 and r13 hold product halves below and are callee-saved under
        C the System V ABI.
     80 	push	%rbx
     81 	push	%r12
     82 	push	%r13
     83 
        C Bias both pointers so that a negative, upward-counting index can be
        C used: up ends one past the last source limb, rp at the second-to-last
        C destination limb.  R32 and R8 are defined outside this file;
        C presumably they select the 32-bit and 8-bit views of a register.
     84 	lea	(up,n_param,8), up
     85 	lea	-16(rp,n_param,8), rp
     86 	mov	R32(n_param), R32(%rax)	C copy of the count for the mod 4 test
     87 	xchg	v0_param, v0		C FIXME: is this insn fast?
        C After the xchg: rcx = limb count (n), rdx = multiplier (v0).
     88 
     89 	neg	n			C n = -count; (up,n,8) is the first limb
     90 
        C Dispatch on count mod 4 into the matching slot of the unrolled loop.
        C The and clears CF and OF; the cmp that follows rewrites them, which
        C each feed-in path below accounts for.
     91 	and	$3, R8(%rax)
     92 	jz	L(b0)
     93 	cmp	$2, R8(%rax)
     94 	jl	L(b1)
     95 	jz	L(b2)
     96 
        C count mod 4 == 3: the cmp computed 3 - 2, leaving CF = OF = 0.  Three
        C products are started, then n is stepped back by one so that
        C 24(rp,n,8) at L(lo3) addresses the first destination limb.  dec does
        C not write CF; it rewrites OF, to 0 unless n were the most negative
        C 64-bit value.
     97 L(b3):	mulx(	(up,n,8), %r11, %r10)
     98 	mulx(	8(up,n,8), %r13, %r12)
     99 	mulx(	16(up,n,8), %rbx, %rax)
     100 	dec	n
     101 	jmp	L(lo3)
     102 
        C count mod 4 == 0: the flags are still those of the and (CF = OF = 0).
     103 L(b0):	mulx(	(up,n,8), %r9, %r8)
     104 	mulx(	8(up,n,8), %r11, %r10)
     105 	mulx(	16(up,n,8), %r13, %r12)
     106 	jmp	L(lo0)
     107 
        C count mod 4 == 2: the cmp found equality, so CF = OF = 0.  n is
        C advanced by two with lea (flag-neutral); if that reaches zero the
        C count was exactly 2 and only the wind-down is needed.
     108 L(b2):	mulx(	(up,n,8), %r13, %r12)
     109 	mulx(	8(up,n,8), %rbx, %rax)
     110 	lea	2(n), n
     111 	jrcxz	L(wd2)
     112 L(gt2):	mulx(	(up,n,8), %r9, %r8)
     113 	jmp	L(lo2)
     114 
        C count mod 4 == 1: the cmp computed 1 - 2 and set CF, so al is anded
        C with itself to clear CF and OF again before the chains start.  A
        C count of exactly 1 goes straight to the last wind-down step.
     115 L(b1):	and	R8(%rax), R8(%rax)
     116 	mulx(	(up,n,8), %rbx, %rax)
     117 	lea	1(n), n
     118 	jrcxz	L(wd1)
     119 	mulx(	(up,n,8), %r9, %r8)
     120 	mulx(	8(up,n,8), %r11, %r10)
     121 	jmp	L(lo1)
     122 
        C Wind-down.  rcx is zero on every way in (each arrives through a taken
        C jrcxz or by falling through from one that did), so -8(rp), (rp) and
        C 8(rp) are the last three destination limbs and rcx doubles as a zero
        C operand for folding the two carries into the result.
     123 L(end):	adcx(	%r10, %r13)
     124 	mov	%r11, -8(rp)
     125 L(wd2):	adox(	(rp), %r13)
     126 	adcx(	%r12, %rbx)
     127 	mov	%r13, (rp)
     128 L(wd1):	adox(	8(rp), %rbx)
     129 	adcx(	%rcx, %rax)		C rax = top product half + CF
     130 	adox(	%rcx, %rax)		C ... + OF; rax is the value returned
     131 	mov	%rbx, 8(rp)
     132 	pop	%r13
     133 	pop	%r12
     134 	pop	%rbx
     135 	ret
     136 
        C Main loop, four limbs per pass.  It is entered at L(lo0)..L(lo3) from
        C the feed-in code; n climbs by 4 toward zero and the jrcxz at L(top)
        C leaves when it gets there.  Each slot does the same four things with
        C rotated registers: adox the rp limb into a finished low half, start
        C the product three limbs ahead, adcx the previous high half into the
        C next low half, and store the finished limb.  The slot that begins at
        C L(lo3) is completed after the index update, at L(top).
     137 L(top):	jrcxz	L(end)
     138 	mulx(	(up,n,8), %r9, %r8)
     139 	adcx(	%r10, %r13)
     140 	mov	%r11, -8(rp,n,8)
     141 L(lo2):	adox(	(rp,n,8), %r13)
     142 	mulx(	8(up,n,8), %r11, %r10)
     143 	adcx(	%r12, %rbx)
     144 	mov	%r13, (rp,n,8)
     145 L(lo1):	adox(	8(rp,n,8), %rbx)
     146 	mulx(	16(up,n,8), %r13, %r12)
     147 	adcx(	%rax, %r9)
     148 	mov	%rbx, 8(rp,n,8)
     149 L(lo0):	adox(	16(rp,n,8), %r9)
     150 	mulx(	24(up,n,8), %rbx, %rax)
     151 	adcx(	%r8, %r11)
     152 	mov	%r9, 16(rp,n,8)
     153 L(lo3):	adox(	24(rp,n,8), %r11)
     154 	lea	4(n), n			C lea, not add: CF and OF stay live
     155 	jmp	L(top)
     156 EPILOGUE()
     157 ASM_END()
    158