Home | History | Annotate | Line # | Download | only in zen
      1 dnl  AMD64 mpn_mul_1 for CPUs with mulx.
      2 
      3 dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C AMD K8,K9	 -
     35 C AMD K10	 -
     36 C AMD bd1	 -
     37 C AMD bd2	 -
     38 C AMD bd3	 -
     39 C AMD bd4	 4.4
     40 C AMD zen	 2
     41 C AMD bobcat	 -
     42 C AMD jaguar	 -
     43 C Intel P4	 -
     44 C Intel PNR	 -
     45 C Intel NHM	 -
     46 C Intel SBR	 -
     47 C Intel IBR	 -
     48 C Intel HWL	 ?
     49 C Intel BWL	 ?
     50 C Intel SKL	 ?
     51 C Intel atom	 -
     52 C Intel SLM      -
     53 C VIA nano	 -
     54 
     55 define(`rp',      `%rdi')   C destination limb pointer, written by the stores below (DOS64 arg reg: rcx)
     56 define(`up',      `%rsi')   C source limb pointer, read by the mulx loads below (DOS64 arg reg: rdx)
     57 define(`n_param', `%rdx')   C limb count as passed in (DOS64 arg reg: r8)
     58 define(`v0_param',`%rcx')   C multiplier limb as passed in (DOS64 arg reg: r9)
     59 
     60 define(`n',       `%rcx')   C working count: negated on entry, then stepped up towards zero
     61 define(`v0',      `%rdx')   C multiplier kept in rdx, the implicit source operand of mulx
     62 
     63 ABI_SUPPORT(DOS64)
     64 ABI_SUPPORT(STD64)
     65 
     66 ASM_START()
     67 	TEXT
     68 	ALIGN(16)
     69 PROLOGUE(mpn_mul_1c)		C entry point with a caller-supplied carry-in limb, expected in r8 at L(ent)
     70 	FUNC_ENTRY(4)
     71 IFDOS(` mov	56(%rsp), %r8	')	C DOS64 only: load the carry-in limb from the stack into r8
     72 	jmp	L(ent)			C join the mpn_mul_1 body past its zeroing of r8
     73 EPILOGUE()
     74 	ALIGN(16)
     75 PROLOGUE(mpn_mul_1)		C multiply the n limbs at up by v0, store n limbs at rp, leave the top limb in rax
     76 	FUNC_ENTRY(4)
     77 	xor	R32(%r8), R32(%r8)	C carry-in limb is zero on this entry point
     78 L(ent):	mov	(up), %r9		C r9 = up[0]; mpn_mul_1c joins here with its carry-in in r8
     79 C Save the callee-saved registers that serve as product temporaries below.
     80 	push	%rbx
     81 	push	%r12
     82 	push	%r13
     83 C Point up and rp near the ends of the operands so a negative index can count up to zero.
     84 	lea	(up,n_param,8), up	C up = address just past the last source limb
     85 	lea	-32(rp,n_param,8), rp	C rp = 4 limbs before the end of the destination
     86 	mov	R32(n_param), R32(%rax)	C copy of the count; only its low 2 bits are used
     87 	xchg	v0_param, v0		C rcx = count, rdx = multiplier (implicit mulx source).  FIXME: is this insn fast?
     88 
     89 	neg	n			C n = -count
     90 C Dispatch on count mod 4 to pick the entry phase of the 4-way unrolled loop.
     91 	and	$3, R8(%rax)
     92 	jz	L(b0)
     93 	cmp	$2, R8(%rax)
     94 	jz	L(b2)
     95 	jg	L(b3)			C al = 3; otherwise al = 1 and execution falls into b1
     96 C count mod 4 = 1: one product is formed here before joining the loop at lo1.
     97 L(b1):	mov	%r8, %r12		C carry-in limb, moved aside because r8 is overwritten below
     98 	mulx(	%r9, %rbx, %rax)	C rbx = low half, rax = high half of up[0] * v0
     99 	sub	$-1, n			C n += 1
    100 	jz	L(wd1)			C single-limb operand; the zero result leaves the carry flag clear for wd1
    101 	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
    102 	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
    103 	add	%r12, %rbx		C add the carry-in limb; starts the carry chain consumed at lo1
    104 	jmp	L(lo1)
    105 C count mod 4 = 3: three products are started here; the carry-in limb stays in r8.
    106 L(b3):	mulx(	%r9, %r11, %r10)	C r11 = low half, r10 = high half of up[0] * v0
    107 	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x08	C mulx 8(up,n,8), %r13, %r12
    108 	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10	C mulx 16(up,n,8), %rbx, %rax
    109 	sub	$-3, n			C n += 3
    110 	jz	L(wd3)			C three-limb operand; wd3 adds r8 (carry-in) with the carry flag clear
    111 	add	%r8, %r11		C add the carry-in limb before the mulx at lo3 overwrites r8
    112 	jmp	L(lo3)
    113 C count mod 4 = 2: two products are started here.
    114 L(b2):	mov	%r8, %r10		C carry-in limb, moved aside because r8 is overwritten below
    115 	mulx(	%r9, %r13, %r12)	C r13 = low half, r12 = high half of up[0] * v0
    116 	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08	C mulx 8(up,n,8), %rbx, %rax
    117 	sub	$-2, n			C n += 2
    118 	jz	L(wd2)			C two-limb operand; wd2 adds r10 (carry-in) with the carry flag clear
    119 	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
    120 	add	%r10, %r13		C add the carry-in limb; starts the carry chain consumed at lo2
    121 	jmp	L(lo2)
    122 C count mod 4 = 0: three products are started here, the fourth follows at lo0.
    123 L(b0):	mov	%r8, %rax		C carry-in limb, moved aside because the next mulx overwrites r8
    124 	mulx(	%r9, %r9, %r8)		C r9 = low half, r8 = high half of up[0] * v0
    125 	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
    126 	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
    127 	add	%rax, %r9		C add the carry-in limb; starts the carry chain consumed at top
    128 	jmp	L(lo0)
    129 C Main loop, 4 limbs per pass: each low product half gets the previous high half added through one adc chain, then is stored.
    130 L(top):	jrcxz	L(end)			C leave once n has reached zero; jrcxz does not touch the carry flag
    131 	adc	%r8, %r11
    132 	mov	%r9, (rp,n,8)
    133 L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
    134 	adc	%r10, %r13
    135 	mov	%r11, 8(rp,n,8)
    136 L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
    137 	adc	%r12, %rbx
    138 	mov	%r13, 16(rp,n,8)
    139 L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
    140 	adc	%rax, %r9
    141 	mov	%rbx, 24(rp,n,8)
    142 L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
    143 	lea	4(n), n			C advance 4 limbs; lea is used so the pending carry survives
    144 	jmp	L(top)
    145 C Wind-down: store the products still held in registers.  wd3, wd2 and wd1 are also the direct exits for 3, 2 and 1 limb operands.
    146 L(end):	mov	%r9, (rp)
    147 L(wd3):	adc	%r8, %r11
    148 	mov	%r11, 8(rp)
    149 L(wd2):	adc	%r10, %r13
    150 	mov	%r13, 16(rp)
    151 L(wd1):	adc	%r12, %rbx
    152 	adc	$0, %rax		C fold the final carry into the high limb left in rax
    153 	mov	%rbx, 24(rp)		C last stored limb, at the end of the destination
    154 C Restore the callee-saved registers in reverse push order.
    155 	pop	%r13
    156 	pop	%r12
    157 	pop	%rbx
    158 	FUNC_EXIT()
    159 	ret
    160 EPILOGUE()
    161 ASM_END()
    162