Home | History | Annotate | Line # | Download | only in bd1
      1      1.1  mrg dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.2  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26  1.1.1.2  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg C	     cycles/limb
     34  1.1.1.3  mrg C AMD K8,K9      3.65
     35  1.1.1.3  mrg C AMD K10        3.30    3.68
     36  1.1.1.3  mrg C AMD bull       4.04    4.29
     37  1.1.1.3  mrg C AMD pile       4.33
     38  1.1.1.3  mrg C AMD steam
     39  1.1.1.3  mrg C AMD excavator
     40  1.1.1.3  mrg C AMD bobcat     5.73
     41  1.1.1.3  mrg C AMD jaguar     5.87
     42  1.1.1.3  mrg C Intel P4      12.5
     43  1.1.1.3  mrg C Intel core2    4.38
     44  1.1.1.3  mrg C Intel NHM      4.28
     45  1.1.1.3  mrg C Intel SBR      2.69
     46  1.1.1.3  mrg C Intel IBR      2.55
     47  1.1.1.3  mrg C Intel HWL      2.41
     48  1.1.1.3  mrg C Intel BWL      2.49
     49  1.1.1.3  mrg C Intel SKL      2.50
     50  1.1.1.3  mrg C Intel atom    20.3
     51  1.1.1.3  mrg C Intel SLM      7.8
     52  1.1.1.3  mrg C VIA nano       4.25
     53      1.1  mrg 
     54      1.1  mrg C The loop of this code is the result of running a code generation and
     55      1.1  mrg C optimisation tool suite written by David Harvey and Torbjorn Granlund.
     56      1.1  mrg 
     57      1.1  mrg C TODO
     58      1.1  mrg C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
     59      1.1  mrg 
                      C Argument registers as named for the standard ABI.  The trailing comment
                      C on each line gives the register that carries the same argument under
                      C the DOS64 ABI.
     60      1.1  mrg define(`rp',      `%rdi')   C rcx
     61      1.1  mrg define(`up',      `%rsi')   C rdx
     62      1.1  mrg define(`n_param', `%rdx')   C r8
     63      1.1  mrg define(`v0',      `%rcx')   C r9
     64      1.1  mrg 
     65      1.1  mrg define(`n',       `%rbx')   C limb count, later the loop index (standard ABI)
     66      1.1  mrg 
                      C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
                      C declares the ABIs this file can be assembled for -- confirm.
     67      1.1  mrg ABI_SUPPORT(DOS64)
     68      1.1  mrg ABI_SUPPORT(STD64)
     69      1.1  mrg 
                      C DOS64 overrides.  up stays %rsi (both prologues copy %rdx into %rsi),
                      C rp becomes %rcx, v0 becomes %r9 and n becomes %r8.  The bare words r9
                      C and r8 are then remapped to rdi and rbx; presumably this moves the
                      C scratch uses of %r8 and %r9 in the function bodies away from n and v0,
                      C while the doubled quotes keep the alias expansions themselves from
                      C being remapped -- confirm against the m4 quoting rules.
     70      1.1  mrg IFDOS(`	define(`up', ``%rsi'')	') dnl
     71      1.1  mrg IFDOS(`	define(`rp', ``%rcx'')	') dnl
     72      1.1  mrg IFDOS(`	define(`v0', ``%r9'')	') dnl
     73      1.1  mrg IFDOS(`	define(`r9', ``rdi'')	') dnl
     74      1.1  mrg IFDOS(`	define(`n',  ``%r8'')	') dnl
     75      1.1  mrg IFDOS(`	define(`r8', ``rbx'')	') dnl
     76      1.1  mrg 
     77      1.1  mrg ASM_START()
     78      1.1  mrg 	TEXT
     79      1.1  mrg 	ALIGN(16)
                      C mpn_mul_1c -- entry point that takes a fifth argument in addition to
                      C rp, up, n_param and v0.
                      C
                      C The fifth argument is read from %r8 under the standard ABI and from the
                      C stack under DOS64.  This entry forms the first product up[0] * v0, adds
                      C the fifth argument to its low limb, propagates the carry into the high
                      C limb, and then jumps to L(common), a label located inside mpn_mul_1
                      C further down in this file.  The register and stack state built here
                      C therefore has to match what mpn_mul_1 has set up when it reaches
                      C L(common).
     80      1.1  mrg PROLOGUE(mpn_mul_1c)
     81      1.1  mrg IFDOS(``push	%rsi		'')	C %rsi and %rdi are callee-saved under DOS64
     82      1.1  mrg IFDOS(``push	%rdi		'')
     83      1.1  mrg IFDOS(``mov	%rdx, %rsi	'')	C the up argument arrives in %rdx under DOS64
     84      1.1  mrg 
     85      1.1  mrg 	mov	(up), %rax		C read first u limb early
     86      1.1  mrg 	push	%rbx			C callee-saved, popped again before the ret in mpn_mul_1
     87      1.1  mrg IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     88      1.1  mrg IFDOS(`	mov	n, %r11		')	C copy of n for the residue test at L(common)
     89      1.1  mrg 	mul	v0			C rdx:rax = up[0] * v0
     90      1.1  mrg 
     91      1.1  mrg IFSTD(` add	%r8, %rax	')	C add the fifth argument to the low limb
     92      1.1  mrg IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
     93      1.1  mrg 	adc	$0, %rdx		C carry from the add goes into the high limb
     94      1.1  mrg 	jmp	L(common)		C continue in the body of mpn_mul_1
     95      1.1  mrg 
     96      1.1  mrg EPILOGUE()
     97      1.1  mrg 
     98      1.1  mrg 	ALIGN(16)
                      C mpn_mul_1 -- multiply the n-limb operand at up by the single limb v0.
                      C
                      C The n low limbs of the product are stored at rp and the most significant
                      C limb is returned in %rax.  The first limb of up is loaded unconditionally,
                      C so n is assumed to be at least 1.
                      C
                      C Structure:
                      C   * The prologue forms the first product and keeps a copy of n in %r11.
                      C   * L(common) dispatches on n mod 4 to one of the feed-in blocks
                      C     L(b0), L(b1), L(b2), L(b3).  Each block parks the first product in
                      C     the registers its loop entry point expects, advances up past the
                      C     last source limb, and turns n into a negative index.
                      C   * The loop at L(top) is unrolled 4 ways and is entered at L(L1),
                      C     L(L0), L(L3) or at the top.  The index counts up by 4 and the loop
                      C     runs while it is negative.
                      C   * rp is biased by 8*n - 16 at L(common), so (rp) and 8(rp) address the
                      C     last two result limbs once the loop is done.
                      C   * %r8, %r9, %r10 and %r11 rotate between holding a low limb that is
                      C     waiting to be stored and a high limb that still has to receive the
                      C     low limb of the next product.
                      C
                      C Several add/adc pairs are separated by mov and lea instructions; those
                      C do not modify the flags, so the carry survives in between.
     99      1.1  mrg PROLOGUE(mpn_mul_1)
    100      1.1  mrg IFDOS(``push	%rsi		'')	C %rsi and %rdi are callee-saved under DOS64
    101      1.1  mrg IFDOS(``push	%rdi		'')
    102      1.1  mrg IFDOS(``mov	%rdx, %rsi	'')	C the up argument arrives in %rdx under DOS64
    103      1.1  mrg 
    104      1.1  mrg 	mov	(up), %rax		C read first u limb early
    105      1.1  mrg 	push	%rbx			C callee-saved, restored before the ret
    106      1.1  mrg IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
    107      1.1  mrg IFDOS(`	mov	n, %r11		')	C copy of n for the residue test below
    108      1.1  mrg 	mul	v0			C rdx:rax = up[0] * v0
    109      1.1  mrg 
    110      1.1  mrg L(common):				C also entered from mpn_mul_1c; rdx:rax = first product, %r11 = n
    111      1.1  mrg IFSTD(`	mov	%r11, n		')	C standard ABI: n lives in its own register from here on
    112      1.1  mrg 
    113      1.1  mrg 	and	$3, R32(%r11)		C %r11 = n mod 4, ZF set when that is 0
    114      1.1  mrg 	lea	-16(rp,n,8), rp		C rp += 8*n - 16; lea leaves the flags from the and intact
    115      1.1  mrg 	jz	L(b0)			C n mod 4 = 0
    116      1.1  mrg 	cmp	$2, R32(%r11)
    117      1.1  mrg 	jb	L(b1)			C n mod 4 = 1
    118      1.1  mrg 	jz	L(b2)			C n mod 4 = 2, otherwise fall through with n mod 4 = 3
    119      1.1  mrg 
    120      1.1  mrg L(b3):	mov	%rax, %r10		C low limb of first product, stored at L(L3)
    121      1.1  mrg 	mov	%rdx, %r11		C high limb of first product
    122      1.1  mrg 	mov	8(up), %rax
    123      1.1  mrg 	mul	v0			C second product, consumed at L(L3)
    124      1.1  mrg 	lea	(up,n,8), up		C up now points just past the last source limb
    125      1.1  mrg 	not	n			C n = -n - 1
    126      1.1  mrg 	jmp	L(L3)
    127      1.1  mrg 
    128      1.1  mrg L(b0):	mov	%rax, %r9		C low limb of first product, stored after L(L0)
    129      1.1  mrg 	mov	%rdx, %r10		C high limb of first product
    130      1.1  mrg 	mov	8(up), %rax		C second source limb, multiplied at L(L0)
    131      1.1  mrg 	lea	(up,n,8), up		C up now points just past the last source limb
    132      1.1  mrg 	neg	n			C n = -n
    133      1.1  mrg 	jmp	L(L0)
    134      1.1  mrg 
    135      1.1  mrg L(b1):	mov	%rax, %r8		C low limb of first product
    136      1.1  mrg 	cmp	$1, n
    137      1.1  mrg 	jz	L(n1)			C single limb: store it and return the high limb
    138      1.1  mrg 	mov	%rdx, %r9		C high limb of first product
    139      1.1  mrg 	lea	(up,n,8), up		C up now points just past the last source limb
    140      1.1  mrg 	neg	n			C n = -n
    141      1.1  mrg 	mov	%r8, 16(rp,n,8)		C with the biased rp this is the first result limb
    142      1.1  mrg 	inc	n			C first limb is done, index the second one
    143      1.1  mrg 	jmp	L(L1)
    144      1.1  mrg 
    145      1.1  mrg L(b2):	mov	%rax, %r11		C low limb of first product
    146      1.1  mrg 	mov	%rdx, %r8		C high limb of first product
    147      1.1  mrg 	mov	8(up), %rax		C second source limb, multiplied at L(top) or L(end)
    148      1.1  mrg 	lea	(up,n,8), up		C up now points just past the last source limb
    149      1.1  mrg 	neg	n
    150      1.1  mrg 	add	$2, n			C n = 2 - n
    151      1.1  mrg 	jns	L(end)			C index already 0 when n was 2: skip the loop
    152      1.1  mrg 
    153      1.1  mrg 	ALIGN(16)
    154      1.1  mrg L(top):	mul	v0			C product of the limb loaded at the end of the previous pass
    155      1.1  mrg 	mov	%rdx, %r9
    156      1.1  mrg 	add	%rax, %r8		C complete the limb pending in %r8
    157      1.1  mrg 	adc	$0, %r9
    158      1.1  mrg 	mov	%r8, 8(rp,n,8)		C store two finished limbs
    159      1.1  mrg 	mov	%r11, (rp,n,8)
    160      1.1  mrg L(L1):	mov	(up,n,8), %rax
    161      1.1  mrg 	mul	v0
    162      1.1  mrg 	add	%rax, %r9		C complete the limb pending in %r9
    163      1.1  mrg 	mov	%rdx, %r10
    164      1.1  mrg 	mov	8(up,n,8), %rax		C mov keeps CF alive for the adc below
    165      1.1  mrg 	adc	$0, %r10
    166      1.1  mrg L(L0):	mul	v0
    167      1.1  mrg 	add	%rax, %r10		C complete the limb pending in %r10
    168      1.1  mrg 	mov	%rdx, %r11
    169      1.1  mrg 	mov	16(up,n,8), %rax	C mov keeps CF alive for the adc below
    170      1.1  mrg 	adc	$0, %r11
    171      1.1  mrg 	mul	v0
    172      1.1  mrg 	mov	%r9, 16(rp,n,8)		C store the limb completed at L(L1)
    173      1.1  mrg L(L3):	add	%rax, %r11		C complete the limb pending in %r11
    174      1.1  mrg 	mov	%r10, 24(rp,n,8)	C store the limb completed at L(L0)
    175      1.1  mrg 	mov	%rdx, %r8
    176      1.1  mrg 	adc	$0, %r8
    177      1.1  mrg 	add	$4, n			C advance the index by one unrolled pass
    178      1.1  mrg 	mov	-8(up,n,8), %rax	C next source limb; mov leaves the flags of the add intact
    179      1.1  mrg 	js	L(top)			C loop while the index is still negative
    180      1.1  mrg 
    181      1.1  mrg L(end):	mul	v0			C last product
    182      1.1  mrg 	add	%rax, %r8
    183      1.1  mrg 	adc	$0, %rdx		C %rdx becomes the most significant limb
    184      1.1  mrg 	mov	%r11, (rp)		C second to last result limb
    185      1.1  mrg L(n1):	mov	%r8, 8(rp)		C last result limb
    186      1.1  mrg 	mov	%rdx, %rax		C return the most significant limb
    187      1.1  mrg 
    188      1.1  mrg 	pop	%rbx			C undo the pushes in reverse order
    189      1.1  mrg IFDOS(``pop	%rdi		'')
    190      1.1  mrg IFDOS(``pop	%rsi		'')
    191      1.1  mrg 	ret
    192      1.1  mrg EPILOGUE()
    193      1.1  mrg ASM_END()
    194