dnl
      1 dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
      2 
      3 dnl  Copyright 2011-2013 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C AMD K8,K9	 2
     35 C AMD K10	 2
     36 C AMD bd1	 2.32
     37 C AMD bobcat	 3
     38 C Intel P4	13
     39 C Intel core2	 2.9
     40 C Intel NHM	 2.8
     41 C Intel SBR	 2.4
     42 C Intel atom	 5.33
     43 C VIA nano	 3
     44 
     45 C NOTES
     46 C  * It might seem natural to use the cmov insn here, but since this function
     47 C    is supposed to have the exact same execution pattern for cnd true and
     48 C    false, and since cmov's documentation is not clear about whether it
     49 C    actually reads both source operands and writes the register for a false
     50 C    condition, we cannot use it.
     51 C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
     52 C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
     53 C    ADCSBB-to-memory, again saving 1 insn/limb.
     54 C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
     55 C    for any other processor.
     56 
     57 C INPUT PARAMETERS
     58 define(`cnd',	`%rdi')	dnl rcx
     59 define(`rp',	`%rsi')	dnl rdx
     60 define(`up',	`%rdx')	dnl r8
     61 define(`vp',	`%rcx')	dnl r9
     62 define(`n',	`%r8')	dnl rsp+40
     63 
     64 ifdef(`OPERATION_cnd_add_n', `
     65 	define(ADDSUB,	      add)
     66 	define(ADCSBB,	      adc)
     67 	define(func,	      mpn_cnd_add_n)')
     68 ifdef(`OPERATION_cnd_sub_n', `
     69 	define(ADDSUB,	      sub)
     70 	define(ADCSBB,	      sbb)
     71 	define(func,	      mpn_cnd_sub_n)')
     72 
     73 MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
     74 
     75 ABI_SUPPORT(DOS64)
     76 ABI_SUPPORT(STD64)
     77 
     78 ASM_START()
     79 	TEXT
     80 	ALIGN(16)
     81 PROLOGUE(func)
     82 	FUNC_ENTRY(4)
     83 IFDOS(`	mov	56(%rsp), R32(%r8)')
     84 	push	%rbx
     85 	push	%rbp
     86 	push	%r12
     87 	push	%r13
     88 	push	%r14
     89 
     90 	neg	cnd
     91 	sbb	cnd, cnd		C make cnd mask
     92 
     93 	lea	(vp,n,8), vp
     94 	lea	(up,n,8), up
     95 	lea	(rp,n,8), rp
     96 
     97 	mov	R32(n), R32(%rax)
     98 	neg	n
     99 	and	$3, R32(%rax)
    100 	jz	L(top)			C carry-save reg rax = 0 in this arc
    101 	cmp	$2, R32(%rax)
    102 	jc	L(b1)
    103 	jz	L(b2)
    104 
    105 L(b3):	mov	(vp,n,8), %r12
    106 	mov	8(vp,n,8), %r13
    107 	mov	16(vp,n,8), %r14
    108 	and	cnd, %r12
    109 	mov	(up,n,8), %r10
    110 	and	cnd, %r13
    111 	mov	8(up,n,8), %rbx
    112 	and	cnd, %r14
    113 	mov	16(up,n,8), %rbp
    114 	ADDSUB	%r12, %r10
    115 	mov	%r10, (rp,n,8)
    116 	ADCSBB	%r13, %rbx
    117 	mov	%rbx, 8(rp,n,8)
    118 	ADCSBB	%r14, %rbp
    119 	mov	%rbp, 16(rp,n,8)
    120 	sbb	R32(%rax), R32(%rax)	C save carry
    121 	add	$3, n
    122 	js	L(top)
    123 	jmp	L(end)
    124 
    125 L(b2):	mov	(vp,n,8), %r12
    126 	mov	8(vp,n,8), %r13
    127 	mov	(up,n,8), %r10
    128 	and	cnd, %r12
    129 	mov	8(up,n,8), %rbx
    130 	and	cnd, %r13
    131 	ADDSUB	%r12, %r10
    132 	mov	%r10, (rp,n,8)
    133 	ADCSBB	%r13, %rbx
    134 	mov	%rbx, 8(rp,n,8)
    135 	sbb	R32(%rax), R32(%rax)	C save carry
    136 	add	$2, n
    137 	js	L(top)
    138 	jmp	L(end)
    139 
    140 L(b1):	mov	(vp,n,8), %r12
    141 	mov	(up,n,8), %r10
    142 	and	cnd, %r12
    143 	ADDSUB	%r12, %r10
    144 	mov	%r10, (rp,n,8)
    145 	sbb	R32(%rax), R32(%rax)	C save carry
    146 	add	$1, n
    147 	jns	L(end)
    148 
    149 	ALIGN(16)
    150 L(top):	mov	(vp,n,8), %r12
    151 	mov	8(vp,n,8), %r13
    152 	mov	16(vp,n,8), %r14
    153 	mov	24(vp,n,8), %r11
    154 	and	cnd, %r12
    155 	mov	(up,n,8), %r10
    156 	and	cnd, %r13
    157 	mov	8(up,n,8), %rbx
    158 	and	cnd, %r14
    159 	mov	16(up,n,8), %rbp
    160 	and	cnd, %r11
    161 	mov	24(up,n,8), %r9
    162 	add	R32(%rax), R32(%rax)	C restore carry
    163 	ADCSBB	%r12, %r10
    164 	mov	%r10, (rp,n,8)
    165 	ADCSBB	%r13, %rbx
    166 	mov	%rbx, 8(rp,n,8)
    167 	ADCSBB	%r14, %rbp
    168 	mov	%rbp, 16(rp,n,8)
    169 	ADCSBB	%r11, %r9
    170 	mov	%r9, 24(rp,n,8)
    171 	sbb	R32(%rax), R32(%rax)	C save carry
    172 	add	$4, n
    173 	js	L(top)
    174 
    175 L(end):	neg	R32(%rax)
    176 	pop	%r14
    177 	pop	%r13
    178 	pop	%r12
    179 	pop	%rbp
    180 	pop	%rbx
    181 	FUNC_EXIT()
    182 	ret
    183 EPILOGUE()
    184