Home | History | Annotate | Line # | Download | only in mode64
mul_1.asm revision 1.1.1.3
      1 dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
      2 dnl  the result in a second limb vector.
      3 
      4 dnl  Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 C               cycles/limb
     35 C POWER3/PPC630     6-18
     36 C POWER4/PPC970     7.25?  not updated for last file revision
     37 C POWER5            7.25
     38 C POWER6           14
     39 C POWER7            2.9
     40 
     41 C TODO
     42 C  * Try to reduce the number of needed live registers (at least r5 and r10
     43 C    could be combined)
     44 C  * Optimize feed-in code, for speed and size.
     45 C  * Clean up r12/r7 usage in feed-in code.
     46 
     47 C INPUT PARAMETERS
     48 define(`rp', `r3')	C destination pointer: every std of a product limb goes through rp
     49 define(`up', `r4')	C source pointer: every ld of an input limb goes through up
     50 define(`n', `r5')	C limb count; only read while deriving the ctr value, after which r5 is reused as scratch
     51 define(`vl', `r6')	C multiplier limb; the code below names it as r6 directly
     52 
     53 ASM_START()
     54 PROLOGUE(mpn_mul_1c)	C entry point that takes a carry-in limb in r7
     55 	std	r27, -40(r1)	C save r27 and r26 at negative offsets from r1; L(ret) reloads them
     56 	std	r26, -48(r1)
     57 	mr	r12, r7		C r12 = carry-in limb (mpn_mul_1 loads 0 here instead)
     58 	b	L(ent)		C continue in the code shared with mpn_mul_1
     59 EPILOGUE()
     60 PROLOGUE(mpn_mul_1)	C multiply the n limbs at up by r6, store the low limbs at rp, leave the top limb in r3
     61 	std	r27, -40(r1)	C save r27 and r26 at negative offsets from r1; r1 itself is never moved
     62 	std	r26, -48(r1)
     63 	li	r12, 0		C cy_limb = 0
     64 L(ent):	ld	r26, 0(up)	C r26 = first source limb; mpn_mul_1c joins here with r12 = carry-in
     65 
     66 	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
     67 	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
     68 	addic	n, n, 3		C compute count... (addic also sets CA to the carry out of n+3; the b10 path reaches its first adde with that CA)
     69 	srdi	n, n, 2		C ...for ctr
     70 	mtctr	n		C copy count into ctr
     71 	beq	cr0, L(b00)	C n mod 4 = 0
     72 	blt	cr6, L(b01)	C n mod 4 = 1
     73 	beq	cr6, L(b10)	C n mod 4 = 2; otherwise fall through with n mod 4 = 3
     74 
     75 L(b11):	mr	r7, r12		C n mod 4 = 3: peel one limb; r7 = carry-in
     76 	mulld	r0, r26, r6	C r0 = low half of first limb * r6
     77 	mulhdu	r12, r26, r6	C r12 = high half, added into the next limb at L(top)/L(end)
     78 	addi	up, up, 8
     79 	addc	r0, r0, r7	C add carry-in; the resulting CA is consumed by the adde at L(top)/L(end)
     80 	std	r0, 0(rp)
     81 	addi	rp, rp, 8
     82 	b	L(fic)
     83 
     84 L(b00):	ld	r27, 8(up)	C n mod 4 = 0: peel two limbs
     85 	addi	up, up, 16
     86 	mulld	r0, r26, r6
     87 	mulhdu	r5, r26, r6
     88 	mulld	r7, r27, r6
     89 	mulhdu	r8, r27, r6
     90 	addc	r0, r0, r12	C first low half + carry-in
     91 	adde	r7, r7, r5	C second low half + first high half + CA
     92 	addze	r12, r8		C r12 = second high half + CA, handed to L(top)/L(end)
     93 	std	r0, 0(rp)
     94 	std	r7, 8(rp)
     95 	addi	rp, rp, 16
     96 	b	L(fic)
     97 
     98 	nop			C alignment
     99 L(b01):	bdnz	L(gt1)		C n mod 4 = 1: decrement ctr; nonzero means n > 1
    100 	mulld	r0, r26, r6	C n = 1: a single product
    101 	mulhdu	r8, r26, r6	C high half goes to r8, which L(ret) folds into r3
    102 	addc	r0, r0, r12	C low half + carry-in; CA is picked up by the addze at L(ret)
    103 	std	r0, 0(rp)
    104 	b	L(ret)
    105 L(gt1):	ld	r27, 8(up)	C n > 1: peel three limbs
    106 	nop
    107 	mulld	r0, r26, r6
    108 	mulhdu	r5, r26, r6
    109 	ld	r26, 16(up)	C r26 is reloaded with the third limb once the first has been multiplied
    110 	mulld	r7, r27, r6
    111 	mulhdu	r8, r27, r6
    112 	mulld	r9, r26, r6
    113 	mulhdu	r10, r26, r6
    114 	addc	r0, r0, r12	C first low half + carry-in
    115 	adde	r7, r7, r5	C each following limb adds the previous high half + CA
    116 	adde	r9, r9, r8
    117 	addze	r12, r10	C r12 = third high half + CA, handed to L(top)/L(end)
    118 	std	r0, 0(rp)
    119 	std	r7, 8(rp)
    120 	std	r9, 16(rp)
    121 	addi	up, up, 24
    122 	addi	rp, rp, 24
    123 	b	L(fic)
    124 
    125 	nop
    126 L(fic):	ld	r26, 0(up)	C feed-in done: preload the next two limbs for L(top)/L(end)
    127 L(b10):	ld	r27, 8(up)	C n mod 4 = 2 enters here; r26 was already loaded at L(ent)
    128 	addi	up, up, 16
    129 	bdz	L(end)		C decrement ctr; zero means only the two preloaded limbs remain
    130 
    131 L(top):	mulld	r0, r26, r6	C main loop: four limbs per pass, starting with the two preloaded ones
    132 	mulhdu	r5, r26, r6
    133 	mulld	r7, r27, r6
    134 	mulhdu	r8, r27, r6
    135 	ld	r26, 0(up)	C load the third and fourth limbs of this pass
    136 	ld	r27, 8(up)
    137 	adde	r0, r0, r12	C r12 = high half handed over from the preceding limb; CA continues the chain
    138 	adde	r7, r7, r5
    139 	mulld	r9, r26, r6
    140 	mulhdu	r10, r26, r6
    141 	mulld	r11, r27, r6
    142 	mulhdu	r12, r27, r6	C r12 = high half handed to the next pass or to L(end)
    143 	ld	r26, 16(up)	C preload the two limbs for the next pass or L(end)
    144 	ld	r27, 24(up)
    145 	std	r0, 0(rp)
    146 	adde	r9, r9, r8
    147 	std	r7, 8(rp)
    148 	adde	r11, r11, r10	C CA from this add is consumed by the first adde of the next pass or of L(end)
    149 	std	r9, 16(rp)
    150 	addi	up, up, 32
    151 	std	r11, 24(rp)
    152 
    153 	addi	rp, rp, 32
    154 	bdnz	L(top)		C decrement ctr and repeat while it is nonzero
    155 
    156 L(end):	mulld	r0, r26, r6	C tail: the last two preloaded limbs
    157 	mulhdu	r5, r26, r6
    158 	mulld	r7, r27, r6
    159 	mulhdu	r8, r27, r6
    160 	adde	r0, r0, r12
    161 	adde	r7, r7, r5
    162 	std	r0, 0(rp)
    163 	std	r7, 8(rp)
    164 L(ret):	addze	r3, r8		C r3 = last high half + CA, i.e. the most significant product limb
    165 	ld	r27, -40(r1)	C reload the registers saved at entry
    166 	ld	r26, -48(r1)
    167 	blr
    168 EPILOGUE()
    169