Home | History | Annotate | Line # | Download | only in powerpc64
rshift.asm revision 1.1.1.1.2.1
      1 dnl  PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
      2 
      3 dnl  Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 C                   cycles/limb
     23 C POWER3/PPC630          ?
     24 C POWER4/PPC970          ?
     25 C POWER5                 2.25
     26 C POWER6                 9.75
     27 C POWER7                 2.15
     28 
     29 C TODO
     30 C  * Try to reduce the number of needed live registers
     31 C  * Micro-optimise header code
     32 C  * Keep in synch with lshift.asm and lshiftc.asm
     33 
     34 C INPUT PARAMETERS
     35 define(`rp',  `r3')	C result pointer
     36 define(`up',  `r4')	C source pointer
     37 define(`n',   `r5')	C number of limbs to shift
     38 define(`cnt', `r6')	C right shift count in bits
     39 
        C Working names used inside the routine.
     40 define(`tnc',`r0')	C set to 64 - cnt on entry
     41 define(`u0',`r30')	C source limb holder; r30 first serves as scratch for n & 3
     42 define(`u1',`r31')	C source limb holder; r31 first serves as scratch for the ctr count
     43 define(`retval',`r5')	C shares r5 with n; written only after the last read of n
     44 
     45 ASM_START()
     46 PROLOGUE(mpn_rshift)
        C Shift the n limbs at up right by cnt bits and store the n result limbs
        C at rp.  Result limb i is (up[i] >> cnt) | (up[i+1] << tnc) with
        C tnc = 64 - cnt; the top result limb is just the top source limb >> cnt.
        C The value returned is up[0] << tnc, that is the bits shifted out of the
        C lowest limb, placed at the high end of a limb.
        C
        C Structure: the main loop at top handles four limbs per pass and ctr
        C holds (n + 3) >> 2.  The value n mod 4 selects one of the entry blocks
        C b00 b01 b10 b11.  Each entry block preloads the software pipeline,
        C biases rp where needed so that the shared store offsets line up, and
        C then either enters the loop at L00 L11 L10 or, for short operands,
        C jumps straight into the tail at cj4 cj3 cj2.
        C
        C r30 and r31 are saved below r1 without moving r1 and are reloaded at
        C ret.
        C
        C NOTE(review): n = 0 would load ctr with 0 and take the b00 path;
        C presumably callers guarantee n >= 1 -- confirm.
     47 	std	r31, -8(r1)	C save r31 below the stack pointer
     48 	std	r30, -16(r1)	C save r30 below the stack pointer
     49 	subfic	tnc, cnt, 64	C tnc = 64 - cnt, the complementary left shift count
     50 C	sldi	r30, n, 3	C byte count corresponding to n
     51 C	add	rp, rp, r30	C rp = rp + n
     52 C	add	up, up, r30	C up = up + n
     53 	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
     54 	cmpdi	cr6, r30, 2	C cr6 = comparison of n & 3 against 2
     55 	addi	r31, n, 3	C compute count...
     56 	ld	r10, 0(up)	C load 1st limb for b00...b11
     57 	sld	retval, r10, tnc	C return value = up[0] << tnc; this overwrites n, not read again
        C Either variant below leaves r31 = (n + 3) >> 2, the number of loop
        C passes; the mode32 form takes only the low 32 bits of r31 into account.
     58 ifdef(`HAVE_ABI_mode32',
     59 `	rldicl	r31, r31, 62,34',	C ...branch count
     60 `	srdi	r31, r31, 2')	C ...for ctr
     61 	mtctr	r31		C copy count into ctr
     62 	beq	cr0, L(b00)	C n mod 4 = 0
     63 	blt	cr6, L(b01)	C n mod 4 = 1
     64 	ld	r11, 8(up)	C load 2nd limb for b10 and b11
     65 	beq	cr6, L(b10)	C n mod 4 = 2, otherwise fall into b11
     66 
     67 	ALIGN(16)
     68 L(b11):	srd	r8, r10, cnt	C n mod 4 = 3: r8 = up[0] >> cnt
     69 	sld	r9, r11, tnc	C r9 = up[1] << tnc
     70 	ld	u1, 16(up)	C u1 = up[2]
     71 	addi	up, up, 24	C advance up past the three limbs consumed so far
     72 	srd	r12, r11, cnt	C r12 = up[1] >> cnt
     73 	sld	r7, u1, tnc	C r7 = up[2] << tnc
     74 	addi	rp, rp, -16	C bias rp so that the store at 16(rp) hits result limb 0
     75 	bdnz	L(gt3)		C taken when n > 3
     76 
     77 	or	r11, r8, r9	C n = 3: result limb 0
     78 	srd	r8, u1, cnt	C top result limb = up[2] >> cnt
     79 	b	L(cj3)
     80 
     81 	ALIGN(16)
     82 L(gt3):	ld	u0, 0(up)	C n >= 7: keep filling the pipeline
     83 	or	r11, r8, r9	C result limb 0
     84 	srd	r8, u1, cnt
     85 	sld	r9, u0, tnc
     86 	ld	u1, 8(up)
     87 	or	r10, r12, r7	C result limb 1
     88 	b	L(L11)
     89 
     90 	ALIGN(32)
     91 L(b10):	srd	r12, r10, cnt	C n mod 4 = 2: r12 = up[0] >> cnt
     92 	addi	rp, rp, -24	C bias rp so that the store at 24(rp) hits result limb 0
     93 	sld	r7, r11, tnc	C r7 = up[1] << tnc
     94 	bdnz	L(gt2)		C taken when n > 2
     95 
     96 	srd	r8, r11, cnt	C n = 2: top result limb = up[1] >> cnt
     97 	or	r10, r12, r7	C result limb 0
     98 	b	L(cj2)
     99 
    100 L(gt2):	ld	u0, 16(up)	C n >= 6: keep filling the pipeline
    101 	srd	r8, r11, cnt
    102 	sld	r9, u0, tnc
    103 	ld	u1, 24(up)
    104 	or	r10, r12, r7	C result limb 0
    105 	srd	r12, u0, cnt
    106 	sld	r7, u1, tnc
    107 	ld	u0, 32(up)
    108 	or	r11, r8, r9	C result limb 1
    109 	addi	up, up, 16	C align up with the load offsets used from L10 on
    110 	b	L(L10)
    111 
    112 	ALIGN(16)
    113 L(b00):	ld	u1, 8(up)	C n mod 4 = 0: u1 = up[1]
    114 	srd	r12, r10, cnt	C r12 = up[0] >> cnt
    115 	sld	r7, u1, tnc
    116 	ld	u0, 16(up)	C u0 = up[2]
    117 	srd	r8, u1, cnt
    118 	sld	r9, u0, tnc
    119 	ld	u1, 24(up)	C u1 = up[3]
    120 	or	r10, r12, r7	C result limb 0
    121 	srd	r12, u0, cnt
    122 	sld	r7, u1, tnc
    123 	addi	rp, rp, -8	C bias rp so that the store at 8(rp) hits result limb 0
    124 	bdz	L(cj4)		C n = 4: go straight to the tail
    125 
    126 L(gt4):	addi	up, up, 32	C n >= 8: step past the four limbs already loaded
    127 	ld	u0, 0(up)
    128 	or	r11, r8, r9	C result limb 1
    129 	b	L(L00)
    130 
    131 	ALIGN(16)
    132 L(b01):	bdnz	L(gt1)		C n mod 4 = 1: taken when n > 1
    133 	srd	r8, r10, cnt	C n = 1: the only result limb is up[0] >> cnt
    134 	std	r8, 0(rp)
    135 	b	L(ret)
    136 
    137 L(gt1):	ld	u0, 8(up)	C n >= 5: u0 = up[1]
    138 	srd	r8, r10, cnt
    139 	sld	r9, u0, tnc
    140 	ld	u1, 16(up)	C u1 = up[2]
    141 	srd	r12, u0, cnt
    142 	sld	r7, u1, tnc
    143 	ld	u0, 24(up)	C u0 = up[3]
    144 	or	r11, r8, r9	C result limb 0
    145 	srd	r8, u1, cnt
    146 	sld	r9, u0, tnc
    147 	ld	u1, 32(up)	C u1 = up[4]
    148 	addi	up, up, 40	C step past the five limbs already loaded
    149 	or	r10, r12, r7	C result limb 1
    150 	bdz	L(end)		C n = 5: nothing left for the loop
    151 
        C Main loop: each full pass loads four source limbs and stores four
        C result limbs.  r11 and r10 alternate as the limb being assembled while
        C the r8/r9 and r12/r7 pairs hold the two halves of the next ones.
    152 	ALIGN(32)
    153 L(top):	srd	r12, u0, cnt
    154 	sld	r7, u1, tnc
    155 	ld	u0, 0(up)
    156 	std	r11, 0(rp)
    157 	or	r11, r8, r9
    158 L(L00):	srd	r8, u1, cnt	C loop entry used by the b00 path
    159 	sld	r9, u0, tnc
    160 	ld	u1, 8(up)
    161 	std	r10, 8(rp)
    162 	or	r10, r12, r7
    163 L(L11):	srd	r12, u0, cnt	C loop entry used by the b11 path
    164 	sld	r7, u1, tnc
    165 	ld	u0, 16(up)
    166 	std	r11, 16(rp)
    167 	or	r11, r8, r9
    168 L(L10):	srd	r8, u1, cnt	C loop entry used by the b10 path
    169 	sld	r9, u0, tnc
    170 	ld	u1, 24(up)
    171 	addi	up, up, 32	C advance source pointer by four limbs
    172 	std	r10, 24(rp)
    173 	addi	rp, rp, 32	C advance result pointer by four limbs
    174 	or	r10, r12, r7
    175 	bdnz	L(top)		C loop while ctr is not exhausted
    176 
        C Tail: no further loads, only the limbs still in registers are combined
        C and stored.  cj4 cj3 cj2 are the entry points for n = 4, 3 and 2.
    177 	ALIGN(32)
    178 L(end):	srd	r12, u0, cnt
    179 	sld	r7, u1, tnc
    180 	std	r11, 0(rp)
    181 L(cj4):	or	r11, r8, r9
    182 	srd	r8, u1, cnt	C top result limb = last source limb >> cnt
    183 	std	r10, 8(rp)
    184 L(cj3):	or	r10, r12, r7
    185 	std	r11, 16(rp)
    186 L(cj2):	std	r10, 24(rp)
    187 	std	r8, 32(rp)	C store the top result limb
    188 
    189 L(ret):	ld	r31, -8(r1)	C reload the r31 value saved on entry
    190 	ld	r30, -16(r1)	C reload the r30 value saved on entry
        C Hand back retval: in r3 normally, or split as r3 = upper 32 bits and
        C r4 = retval when HAVE_ABI_mode32 is defined.
    191 ifdef(`HAVE_ABI_mode32',
    192 `	srdi	r3, retval, 32
    193 	mr	r4, retval
    194 ',`	mr	r3, retval')
    195 	blr
    196 EPILOGUE()
    197