Home | History | Annotate | Line # | Download | only in arm64
      1  1.1.1.2  mrg dnl  ARM64 mpn_addmul_1 and mpn_submul_1
      2      1.1  mrg 
      3      1.1  mrg dnl  Contributed to the GNU project by Torbjörn Granlund.
      4      1.1  mrg 
      5  1.1.1.2  mrg dnl  Copyright 2013, 2015, 2017 Free Software Foundation, Inc.
      6      1.1  mrg 
      7      1.1  mrg dnl  This file is part of the GNU MP Library.
      8      1.1  mrg dnl
      9      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10      1.1  mrg dnl  it under the terms of either:
     11      1.1  mrg dnl
     12      1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13      1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14      1.1  mrg dnl      option) any later version.
     15      1.1  mrg dnl
     16      1.1  mrg dnl  or
     17      1.1  mrg dnl
     18      1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     19      1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20      1.1  mrg dnl      later version.
     21      1.1  mrg dnl
     22      1.1  mrg dnl  or both in parallel, as here.
     23      1.1  mrg dnl
     24      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26      1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27      1.1  mrg dnl  for more details.
     28      1.1  mrg dnl
     29      1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     30      1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31      1.1  mrg dnl  see https://www.gnu.org/licenses/.
     32      1.1  mrg 
     33      1.1  mrg include(`../config.m4')
     34      1.1  mrg 
     35      1.1  mrg C	     cycles/limb
     36  1.1.1.2  mrg C Cortex-A53	9.3-9.8
     37  1.1.1.2  mrg C Cortex-A57	 7.0
     38  1.1.1.2  mrg C X-Gene	 5.0
     39  1.1.1.2  mrg 
     40  1.1.1.2  mrg C NOTES
     41  1.1.1.2  mrg C  * It is possible to keep the carry chain alive between the addition blocks
     42  1.1.1.2  mrg C    and thus avoid csinc, but only for addmul_1.  Since that saves no time
     43  1.1.1.2  mrg C    on the tested pipelines, we keep addmul_1 and submul_1 similar.
     44  1.1.1.2  mrg C  * We could separate feed-in into 4 blocks, one for each residue (mod 4).
     45  1.1.1.2  mrg C    That is likely to save a few cycles.
     46      1.1  mrg 
     47  1.1.1.2  mrg changecom(blah)
     48      1.1  mrg C The changecom above moves the m4 comment delimiter away from the default
                      C "#".  The code below writes immediates as #0, #8, ... and then uses
                      C macros later on the same line (for example L(1) in "tbz n, #0, L(1)"),
                      C which m4 would leave unexpanded if "#" still started a comment.
                      C
                      C Register aliases for the four incoming arguments.
     49      1.1  mrg define(`rp', `x0')	C limb array that is read and updated in place
     50      1.1  mrg define(`up', `x1')	C limb array that is only read
     51      1.1  mrg define(`n',  `x2')	C limb count
     52      1.1  mrg define(`v0', `x3')	C the single multiplier limb
     53      1.1  mrg C Select the instructions for the variant being assembled.  ADDSUB starts
                      C the chain that combines the product with the rp limbs and ADDSUBC
                      C continues it.  COND is the condition that holds when that chain ended
                      C with no carry (after adds/adcs: cc) or no borrow (after subs/sbcs: cs);
                      C the code uses it with csinc to add 1 to the carry limb otherwise.
                      C NOTE(review): OPERATION_addmul_1 / OPERATION_submul_1 are not defined
                      C in this file; presumably the build defines exactly one of them.
     54      1.1  mrg ifdef(`OPERATION_addmul_1', `
     55      1.1  mrg   define(`ADDSUB',	adds)
     56      1.1  mrg   define(`ADDSUBC',	adcs)
     57      1.1  mrg   define(`COND',	`cc')
     58      1.1  mrg   define(`func',	mpn_addmul_1)')
     59      1.1  mrg ifdef(`OPERATION_submul_1', `
     60      1.1  mrg   define(`ADDSUB',	subs)
     61      1.1  mrg   define(`ADDSUBC',	sbcs)
     62      1.1  mrg   define(`COND',	`cs')
     63      1.1  mrg   define(`func',	mpn_submul_1)')
     64      1.1  mrg C NOTE(review): MULFUNC_PROLOGUE is not defined in this file; it presumably
                      C announces the entry points this two-variant file can provide.  Confirm
                      C against the m4 definitions pulled in by the include at the top.
     65      1.1  mrg MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
     66      1.1  mrg C func(rp, up, n, v0)
                      C
                      C Multiplies the n 64-bit limbs at up by the single limb v0 and adds the
                      C product to (addmul_1), or subtracts it from (submul_1), the n limbs at
                      C rp, storing the result back at rp.  The carry-out (or borrow-out) limb
                      C is returned in x0.
                      C
                      C Limbs are handled from the lowest address upwards: one limb if bit 0 of
                      C n is set, then two limbs if bit 1 is set, then n/4 rounds of four limbs.
                      C
                      C Register use:
                      C   x15       carry limb handed from one block or round to the next
                      C   x4-x7     limbs loaded from up (x4,x5 also hold rp limbs in the two
                      C             feed-in blocks)
                      C   x8-x11    low product halves, which then become the result limbs
                      C   x12-x14   high product halves (x12,x13 are reused for rp limbs in
                      C             the four-limb loop)
                      C   x16,x17   rp limbs in the four-limb loop
                      C Only x0-x17 and the flags are written, and the stack is not used.
     67      1.1  mrg PROLOGUE(func)
     68  1.1.1.2  mrg 	adds	x15, xzr, xzr		C carry limb = 0; 0+0 also clears the carry flag
     69      1.1  mrg 
     70      1.1  mrg 	tbz	n, #0, L(1)		C bit 0 of n clear: skip the one-limb block
     71      1.1  mrg C One limb: rp[0] = rp[0] +- up[0]*v0.
     72      1.1  mrg 	ldr	x4, [up],#8
     73      1.1  mrg 	mul	x8, x4, v0		C low half of the product
     74      1.1  mrg 	umulh	x12, x4, v0		C high half of the product
     75  1.1.1.2  mrg 	ldr	x4, [rp]		C x4 is reused for the rp limb
     76      1.1  mrg 	ADDSUB	x8, x4, x8
     77      1.1  mrg 	csinc	x15, x12, x12, COND	C carry limb = high half, plus 1 on carry/borrow
     78      1.1  mrg 	str	x8, [rp],#8
     79      1.1  mrg 
     80      1.1  mrg L(1):	tbz	n, #1, L(2)		C bit 1 of n clear: skip the two-limb block
     81      1.1  mrg C Two limbs.  The products are first chained together with the incoming
                      C carry limb, then the chained value is added to/subtracted from rp.
     82      1.1  mrg 	ldp	x4, x5, [up],#16
     83      1.1  mrg 	mul	x8, x4, v0
     84      1.1  mrg 	umulh	x12, x4, v0
     85      1.1  mrg 	mul	x9, x5, v0
     86      1.1  mrg 	umulh	x13, x5, v0
     87      1.1  mrg 	adds	x8, x8, x15		C low half 0 + incoming carry limb
     88      1.1  mrg 	adcs	x9, x9, x12		C low half 1 + high half 0 + carry
     89  1.1.1.2  mrg 	ldp	x4, x5, [rp]		C loads leave the flags alone, so the chain continues
     90      1.1  mrg 	adc	x15, x13, xzr		C new carry limb = high half 1 + carry
     91      1.1  mrg 	ADDSUB	x8, x4, x8
     92      1.1  mrg 	ADDSUBC	x9, x5, x9
     93      1.1  mrg 	csinc	x15, x15, x15, COND	C plus 1 if the rp update carried/borrowed
     94      1.1  mrg 	stp	x8, x9, [rp],#16
     95      1.1  mrg 
     96      1.1  mrg L(2):	lsr	n, n, #2		C n = number of four-limb rounds
     97  1.1.1.2  mrg 	cbz	n, L(le3)		C none left: return the carry limb
     98  1.1.1.2  mrg 	ldp	x4, x5, [up],#32	C load the first four up limbs ...
     99  1.1.1.2  mrg 	ldp	x6, x7, [up,#-16]
    100  1.1.1.2  mrg 	b	L(mid)			C ... and enter the loop at its multiply stage
    101  1.1.1.2  mrg L(le3):	mov	x0, x15
    102  1.1.1.2  mrg 	ret
    103      1.1  mrg 
                      C Four-limb loop.  Each pass through L(mid) computes four chained products
                      C and loads the four matching rp limbs.  The rp update for those limbs is
                      C done at the start of the next L(top) pass, after the next four up limbs
                      C have been loaded, or in the tail code once n reaches zero.
    104  1.1.1.2  mrg 	ALIGN(16)
    105  1.1.1.2  mrg L(top):	ldp	x4, x5, [up],#32
    106  1.1.1.2  mrg 	ldp	x6, x7, [up,#-16]
    107  1.1.1.2  mrg 	ADDSUB	x8, x16, x8
    108  1.1.1.2  mrg 	ADDSUBC	x9, x17, x9
    109  1.1.1.2  mrg 	stp	x8, x9, [rp],#32	C rp advances by four limbs here
    110  1.1.1.2  mrg 	ADDSUBC	x10, x12, x10
    111  1.1.1.2  mrg 	ADDSUBC	x11, x13, x11
    112  1.1.1.2  mrg 	stp	x10, x11, [rp,#-16]	C upper two limbs of the group just finished
    113  1.1.1.2  mrg 	csinc	x15, x15, x15, COND	C fold the carry/borrow of the rp update into x15
    114  1.1.1.2  mrg L(mid):	sub	n, n, #1		C non-flag-setting form of sub
    115      1.1  mrg 	mul	x8, x4, v0
    116      1.1  mrg 	umulh	x12, x4, v0
    117      1.1  mrg 	mul	x9, x5, v0
    118      1.1  mrg 	umulh	x13, x5, v0
    119      1.1  mrg 	adds	x8, x8, x15		C low half 0 + incoming carry limb (last read of old x15)
    120      1.1  mrg 	mul	x10, x6, v0
    121      1.1  mrg 	umulh	x14, x6, v0
    122      1.1  mrg 	adcs	x9, x9, x12		C low half 1 + high half 0 + carry
    123      1.1  mrg 	mul	x11, x7, v0
    124      1.1  mrg 	umulh	x15, x7, v0		C x15 now holds high half 3
    125      1.1  mrg 	adcs	x10, x10, x13		C low half 2 + high half 1 + carry
    126  1.1.1.2  mrg 	ldp	x16, x17, [rp]
    127      1.1  mrg 	adcs	x11, x11, x14		C low half 3 + high half 2 + carry
    128  1.1.1.2  mrg 	ldp	x12, x13, [rp,#16]	C x12,x13 were consumed by the adcs above, so reuse them
    129      1.1  mrg 	adc	x15, x15, xzr		C new carry limb = high half 3 + carry
    130      1.1  mrg 	cbnz	n, L(top)
    131      1.1  mrg C Tail: apply the last four products to rp and return the final carry limb.
    132  1.1.1.2  mrg 	ADDSUB	x8, x16, x8
    133  1.1.1.2  mrg 	ADDSUBC	x9, x17, x9
    134  1.1.1.2  mrg 	ADDSUBC	x10, x12, x10
    135  1.1.1.2  mrg 	ADDSUBC	x11, x13, x11
    136  1.1.1.2  mrg 	stp	x8, x9, [rp]
    137  1.1.1.2  mrg 	stp	x10, x11, [rp,#16]
    138  1.1.1.2  mrg 	csinc	x0, x15, x15, COND	C return carry limb, plus 1 on carry/borrow
    139      1.1  mrg 	ret
    140      1.1  mrg EPILOGUE()
    141