Home | History | Annotate | Line # | Download | only in v5
      1 dnl  ARM mpn_mod_1s_2p
      2 
      3 dnl  Contributed to the GNU project by Torbjörn Granlund.
      4 
      5 dnl  Copyright 2012 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C	     cycles/limb
     36 C StrongARM	 -
     37 C XScale	 ?
     38 C Cortex-A7	 ?
     39 C Cortex-A8	 ?
     40 C Cortex-A9	 4.25
     41 C Cortex-A15	 3
     42 
     43 define(`ap', `r0')	C r0: pointer to the limbs; moved to the operand end on entry
     44 define(`n',  `r1')	C r1: limb count, counted down while limbs are folded
     45 define(`d',  `r2')	C r2: divisor; the code below refers to it as r2 directly
     46 define(`cps',`r3')	C r3: precomputed table; the code below uses r3 directly
     47 
     48 ASM_START()
        C mpn_mod_1s_2p: fold the n 32-bit limbs ending at ap + 4*n, two limbs per
        C step, into a two-limb accumulator using the multipliers B1, B2, B3 read
        C from the table at r3, then reduce the accumulator by the divisor in r2.
        C
        C In:   r0 = ap, r1 = n, r2 = divisor, r3 = table of five words:
        C         [0]    multiplier for the quotient estimate in the final step
        C         [1]    cnt, the shift count
        C         [2..4] B1, B2, B3
        C       This is the layout stored by mpn_mod_1s_2p_cps in this file.
        C Out:  r0 = final remainder shifted right by cnt.
        C Regs: r4-r10 are pushed on entry and popped on exit; r1, r12 and the
        C       condition flags are overwritten.
        C NOTE(review): the closing right shift by cnt suggests r2 is expected to
        C be the divisor already shifted left by cnt -- confirm against callers.
        C NOTE(review): n = 0 is not handled (the even path loads two limbs
        C unconditionally); callers presumably guarantee n >= 1.
     49 PROLOGUE(mpn_mod_1s_2p)
     50 	push	{r4-r10}		C work registers, restored before return
     51 	tst	n, #1			C Z set when n is even; used by the beq below
     52 	add	r7, r3, #8		C r7 = address of table word 2
     53 	ldmia	r7, {r7, r8, r12}	C r7 = B1, r8 = B2, r12 = B3
     54 	add	ap, ap, n, lsl #2	C put ap at operand end (flags untouched)
     55 	beq	L(evn)			C even n
     56 
     57 L(odd):	subs	n, n, #1		C odd n: n is even from here on
     58 	beq	L(1)			C n was 1: single-limb case
     59 	ldmdb	ap!, {r4,r6,r9}		C top three limbs: r4 lowest, r9 highest
     60 	mov	r10, #0
     61 	umlal	r4, r10, r6, r7		C r10:r4 = r4 + r6*B1
     62 	umlal	r4, r10, r9, r8		C r10:r4 += r9*B2
     63 	b	L(com)
     64 
     65 L(evn):	ldmdb	ap!, {r4,r10}		C top two limbs: r4 low, r10 high
     66 L(com):	subs	n, n, #2		C limbs still to be folded
     67 	ble	L(end)			C none left: r10:r4 is the accumulator
     68 	ldmdb	ap!, {r5,r6}		C preload next pair: r5 low, r6 high
     69 	b	L(mid)
     70 
        C Main loop, unrolled twice.  The accumulator alternates between r9:r5 and
        C r10:r4.  Each half computes
        C   new = low limb + high limb*B1 + old low*B2 + old high*B3
        C and loads the limb pair for the following half.  n is lowered by 4 per
        C pass at L(mid); umlal and ldmdb leave the flags alone, so the ble in the
        C middle of the loop still tests the result of that subs.
     71 L(top):	mov	r9, #0
     72 	umlal	r5, r9, r6, r7		C B1
     73 	umlal	r5, r9, r4, r8		C B2
     74 	ldmdb	ap!, {r4,r6}		C next pair; old r4 and r6 already consumed
     75 	umlal	r5, r9, r10, r12	C B3
     76 	ble	L(xit)			C n == 0: the pair just loaded is the last
     77 	mov	r10, #0
     78 	umlal	r4, r10, r6, r7		C B1
     79 	umlal	r4, r10, r5, r8		C B2
     80 	ldmdb	ap!, {r5,r6}		C next pair; old r5 and r6 already consumed
     81 	umlal	r4, r10, r9, r12	C B3
     82 L(mid):	subs	n, n, #4
     83 	bge	L(top)			C at least one more pair remains to load
     84 
        C Loop exit with the last pair in r5, r6: fold it into r9:r5 and move the
        C low word to r4.  Flags are LT here, so the movge at L(end) is skipped.
     85 	mov	r9, #0
     86 	umlal	r5, r9, r6, r7		C B1
     87 	umlal	r5, r9, r4, r8		C B2
     88 	umlal	r5, r9, r10, r12	C B3
     89 	mov	r4, r5
     90 
     91 L(end):	movge	   r9, r10		C runs when flags are GE: arrival from L(xit) or from the ble at L(com) with n == 0; high word is then in r10
     92 	ldr	r6, [r3, #4]		C cps[1] = cnt
     93 	mov	r5, #0
     94 	umlal	r4, r5, r9, r7		C r5:r4 = low + high*B1
     95 	mov	r7, r5, lsl r6		C r7 = r5 << cnt
     96 L(x):	rsb	r1, r6, #32		C r1 = 32 - cnt
     97 	orr	r8, r7, r4, lsr r1	C r8 = r7 | (r4 >> (32 - cnt)): high word after the shift
     98 	mov	r9, r4, lsl r6		C r9 = r4 << cnt: low word after the shift
     99 	ldr	r5, [r3, #0]		C table word 0
    100 	add	r0, r8, #1
    101 	umull	r12, r1, r8, r5		C r1:r12 = r8 * table word 0
    102 	adds	r4, r12, r9		C r4 = r12 + r9, carry feeds the adc
    103 	adc	r1, r1, r0		C r1 = quotient estimate = r1 + r8 + 1 + carry
    104 	mul	r5, r2, r1		C low word of divisor * estimate
    105 	sub	r9, r9, r5		C candidate remainder
    106 	cmp	r9, r4
    107 	addhi	r9, r9, r2		C if r9 > r4 (unsigned): add the divisor back
    108 	cmp	r2, r9
    109 	subls	r9, r9, r2		C if divisor <= r9 (unsigned): subtract it once
    110 	mov	r0, r9, lsr r6		C result = r9 >> cnt
    111 	pop	{r4-r10}
    112 	bx	r14
    113 
        C Last pair is in r4, r6 and the accumulator in r9:r5: fold into r10:r4.
        C The branch keeps the flags (EQ), so the movge at L(end) copies r10 to r9.
    114 L(xit):	mov	r10, #0
    115 	umlal	r4, r10, r6, r7		C B1
    116 	umlal	r4, r10, r5, r8		C B2
    117 	umlal	r4, r10, r9, r12	C B3
    118 	b	L(end)
    119 
        C n == 1: no folding, enter the final reduction with a zero high part.
    120 L(1):	ldr	r6, [r3, #4]		C cps[1] = cnt
    121 	ldr	r4, [ap, #-4]		C ap[0], the only limb (ap is at operand end)
    122 	mov	r7, #0			C high part is zero
    123 	b	L(x)
    124 EPILOGUE()
    125 
    126 PROLOGUE(mpn_mod_1s_2p_cps)
        C Fill the five-word table at r0 for the divisor passed in r1:
        C   [0]    value returned by mpn_invert_limb for the divisor shifted left
        C          by cnt
        C   [1]    cnt, the leading-zero count of the divisor
        C   [2..4] three successively derived values, each shifted right by cnt
        C          (loaded as B1, B2, B3 by mpn_mod_1s_2p in this file)
        C In:   r0 = table pointer, r1 = divisor.
        C Regs: r4-r8 and r14 are pushed and restored; r1-r3, r12 and the flags
        C       are overwritten here, plus whatever mpn_invert_limb overwrites.
        C NOTE(review): r4-r6 are kept live across the call, so this relies on
        C mpn_invert_limb preserving them -- confirm.
        C NOTE(review): a zero divisor is presumably excluded by callers.
    127 	push	{r4-r8, r14}		C r14 is saved because of the bl below
    128 	clz	r4, r1			C r4 = cnt
    129 	mov	r5, r1, lsl r4		C b <<= cnt
    130 	mov	r6, r0			C r6 = cps
    131 	mov	r0, r5			C argument for mpn_invert_limb
    132 	bl	mpn_invert_limb		C r0 = returned value, stored as cps[0] below
    133 	rsb	r3, r4, #32		C r3 = 32 - cnt
    134 	mov	r3, r0, lsr r3		C r3 = r0 >> (32 - cnt)
    135 	mov	r2, #1
    136 	orr	r3, r3, r2, lsl r4	C r3 |= 1 << cnt
    137 	rsb	r1, r5, #0		C r1 = -b
    138 	mul	r2, r1, r3		C r2 = low word of -b * r3: B1 before the final shift
    139 	umull	r3, r12, r2, r0		C r12:r3 = r2 * r0
    140 	add	r12, r2, r12
    141 	mvn	r12, r12		C r12 = ~(r2 + high word)
    142 	mul	r1, r5, r12		C r1 = low word of b * r12
    143 	cmp	r1, r3
    144 	addhi	r1, r1, r5		C if r1 > r3 (unsigned): r1 += b; B2 before the final shift
    145 	umull	r12, r7, r1, r0		C r7:r12 = r1 * r0
    146 	add	r7, r1, r7
    147 	mvn	r7, r7			C r7 = ~(r1 + high word)
    148 	mul	r3, r5, r7		C r3 = low word of b * r7
    149 	cmp	r3, r12
    150 	addhi	r3, r3, r5		C if r3 > r12 (unsigned): r3 += b; B3 before the final shift
    151 	mov	r5, r2, lsr r4		C r5 = B1 >> cnt
    152 	mov	r7, r1, lsr r4		C r7 = B2 >> cnt
    153 	mov	r8, r3, lsr r4		C r8 = B3 >> cnt
    154 	stmia	r6, {r0,r4,r5,r7,r8}	C fill cps: [0] = r0, [1] = cnt, [2..4] = shifted B1, B2, B3
    155 	pop	{r4-r8, pc}		C restore registers and return
    156 EPILOGUE()
    157