Home | History | Annotate | Line # | Download | only in mode64
mul_1.asm revision 1.1
      1 dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
      2 dnl  the result in a second limb vector.
      3 
      4 dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
      5 dnl  Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of the GNU Lesser General Public License as published
     11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     12 dnl  your option) any later version.
     13 
     14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     17 dnl  License for more details.
     18 
     19 dnl  You should have received a copy of the GNU Lesser General Public License
     20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     21 
     22 include(`../config.m4')
     23 
     24 C		cycles/limb
     25 C POWER3/PPC630:     6-18
     26 C POWER4/PPC970:     7.25
     27 C POWER5:            7.75
     28 
     29 C TODO
     30 C  * Try to reduce the number of needed live registers (at least r5 and r10
     31 C    could be combined)
     32 C  * Optimize feed-in code, for speed and size.
     33 C  * Clean up r12/r7 usage in feed-in code.
     34 
     35 C INPUT PARAMETERS
     36 define(`rp', `r3')	C result pointer: product limbs are stored through it; r3 also carries the return value
     37 define(`up', `r4')	C source pointer: operand limbs are loaded through it
     38 define(`n', `r5')	C limb count; r5 is reused as scratch once the count is in ctr
     39 define(`vl', `r6')	C multiplier limb (the code below names r6 directly)
     40 
     41 ASM_START()
     42 PROLOGUE(mpn_mul_1c)		C same as mpn_mul_1 but with an initial carry limb supplied in r7
     43 	std	r27, -40(r1)	C save r27 (used as a limb buffer); reloaded at L(ret)
     44 	std	r26, -48(r1)	C save r26 likewise
     45 	mr	r12, r7		C r12 = carry-in limb, where mpn_mul_1 would load 0
     46 	b	L(ent)		C join the shared body inside mpn_mul_1
     47 EPILOGUE()
     48 PROLOGUE(mpn_mul_1)		C {rp,n} = {up,n} * r6 with zero carry-in; final carry limb returned in r3
     49 	std	r27, -40(r1)	C save r27 (used as a limb buffer); reloaded at L(ret)
     50 	std	r26, -48(r1)	C save r26 likewise
     51 	li	r12, 0		C cy_limb = 0
     52 L(ent):	ld	r26, 0(up)	C common entry, also reached from mpn_mul_1c; r26 = up[0]
     53 C Dispatch on n mod 4 to one of four feed-in blocks; ctr = (n + 3) / 4.
     54 	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
     55 	cmpdi	cr6, r0, 2	C cr6 = (n & 3) versus 2
     56 	addic	n, n, 3		C compute count... NOTE(review): addic writes CA and the b10 path has no other CA write before its first adde; presumably this clears CA - confirm
     57 	srdi	n, n, 2		C ...for ctr
     58 	mtctr	n		C copy count into ctr
     59 	beq	cr0, L(b00)	C n & 3 == 0
     60 	blt	cr6, L(b01)	C n & 3 == 1
     61 	beq	cr6, L(b10)	C n & 3 == 2; n & 3 == 3 falls through
     62 C b11: n & 3 == 3, one limb is handled before joining L(fic).
     63 L(b11):	mr	r7, r12		C r7 = carry-in limb, since r12 is overwritten below
     64 	mulld	r0, r26, r6	C r0 = low half of up[0] * r6
     65 	mulhdu	r12, r26, r6	C r12 = high half, consumed by the next adde r0,r0,r12
     66 	addi	up, up, 8
     67 	addc	r0, r0, r7	C add the carry-in limb, carry-out goes to CA
     68 	std	r0, 0(rp)
     69 	addi	rp, rp, 8
     70 	b	L(fic)
     71 C b00: n & 3 == 0, two limbs are handled before joining L(fic).
     72 L(b00):	ld	r27, 8(up)
     73 	addi	up, up, 16
     74 	mulld	r0, r26, r6
     75 	mulhdu	r5, r26, r6	C r5 is free as scratch now that ctr holds the count
     76 	mulld	r7, r27, r6
     77 	mulhdu	r8, r27, r6
     78 	addc	r0, r0, r12	C add the carry-in limb
     79 	adde	r7, r7, r5	C low half + previous high half + CA
     80 	addze	r12, r8		C r12 = last high half + CA, the carry limb handed on
     81 	std	r0, 0(rp)
     82 	std	r7, 8(rp)
     83 	addi	rp, rp, 16
     84 	b	L(fic)
     85 C b01: n & 3 == 1, either the single-limb case or three limbs before L(fic).
     86 	nop			C alignment
     87 L(b01):	bdnz	L(gt1)		C decrement ctr; nonzero means n > 1
     88 	mulld	r0, r26, r6	C n == 1: the only product
     89 	mulhdu	r8, r26, r6	C high half goes to r8, which L(ret) folds with CA
     90 	addc	r0, r0, r12	C add the carry-in limb
     91 	std	r0, 0(rp)
     92 	b	L(ret)
     93 L(gt1):	ld	r27, 8(up)
     94 	nop
     95 	mulld	r0, r26, r6
     96 	mulhdu	r5, r26, r6
     97 	ld	r26, 16(up)	C third limb reuses r26 after both products of the first limb
     98 	mulld	r7, r27, r6
     99 	mulhdu	r8, r27, r6
    100 	mulld	r9, r26, r6
    101 	mulhdu	r10, r26, r6
    102 	addc	r0, r0, r12	C add the carry-in limb
    103 	adde	r7, r7, r5
    104 	adde	r9, r9, r8
    105 	addze	r12, r10	C r12 = last high half + CA, the carry limb handed on
    106 	std	r0, 0(rp)
    107 	std	r7, 8(rp)
    108 	std	r9, 16(rp)
    109 	addi	up, up, 24
    110 	addi	rp, rp, 24
    111 	b	L(fic)
    112 C L(fic) loads the next pair of limbs; the b10 path enters at L(b10) with r26 = up[0] already loaded.
    113 	nop
    114 L(fic):	ld	r26, 0(up)
    115 L(b10):	ld	r27, 8(up)
    116 	addi	up, up, 16
    117 	bdz	L(end)		C decrement ctr; zero means only this pair remains
    118 C Main loop: four limbs per pass; the first two were loaded before the pass began.
    119 L(top):	mulld	r0, r26, r6
    120 	mulhdu	r5, r26, r6
    121 	ld	r26, 0(up)	C fetch the third limb of this pass
    122 	nop
    123 
    124 	mulld	r7, r27, r6
    125 	mulhdu	r8, r27, r6
    126 	ld	r27, 8(up)	C fetch the fourth limb of this pass
    127 	nop
    128 
    129 	adde	r0, r0, r12	C add the incoming carry limb and CA
    130 	adde	r7, r7, r5
    131 
    132 	mulld	r9, r26, r6
    133 	mulhdu	r10, r26, r6
    134 	ld	r26, 16(up)	C preload the first limb for the next pass or L(end)
    135 	nop
    136 
    137 	mulld	r11, r27, r6
    138 	mulhdu	r12, r27, r6	C r12 = carry limb handed to the next pass or L(end)
    139 	ld	r27, 24(up)	C preload the second limb likewise
    140 
    141 	std	r0, 0(rp)
    142 	adde	r9, r9, r8
    143 	std	r7, 8(rp)
    144 	adde	r11, r11, r10	C CA stays pending for the next adde
    145 	std	r9, 16(rp)
    146 	addi	up, up, 32
    147 	std	r11, 24(rp)
    148 
    149 	addi	rp, rp, 32
    150 	bdnz	L(top)		C decrement ctr; loop while nonzero
    151 C Wind-down: the last two limbs, already held in r26 and r27.
    152 L(end):	mulld	r0, r26, r6
    153 	mulhdu	r5, r26, r6
    154 
    155 	mulld	r7, r27, r6
    156 	mulhdu	r8, r27, r6
    157 
    158 	adde	r0, r0, r12	C add the incoming carry limb and CA
    159 	adde	r7, r7, r5
    160 
    161 	std	r0, 0(rp)
    162 	std	r7, 8(rp)
    163 L(ret):	addze	r3, r8		C return value: last high half + CA
    164 	ld	r27, -40(r1)	C restore the registers saved on entry
    165 	ld	r26, -48(r1)
    166 	blr
    167 EPILOGUE()
    168