Home | History | Annotate | Line # | Download | only in mode64
      1      1.1  mrg dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by a single limb (normalized or unnormalized).
      2      1.1  mrg 
      3  1.1.1.3  mrg dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.3  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21  1.1.1.3  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26  1.1.1.3  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33  1.1.1.2  mrg C                           cycles/limb
     34  1.1.1.2  mrg C                       norm    unorm   frac
     35  1.1.1.2  mrg C POWER3/PPC630         16-34   16-34   ~11   outdated figures
     36  1.1.1.2  mrg C POWER4/PPC970          28      28      19
     37  1.1.1.2  mrg C POWER5                 29      29     ~19
     38  1.1.1.2  mrg C POWER6                 49      59     ~42
     39  1.1.1.2  mrg C POWER7                 24.5    23     ~14
     40      1.1  mrg 
     41      1.1  mrg C INPUT PARAMETERS
     42      1.1  mrg C qp  = r3
     43      1.1  mrg C fn  = r4
     44      1.1  mrg C up  = r5
     45      1.1  mrg C un  = r6
     46      1.1  mrg C d   = r7
     47      1.1  mrg 
     48      1.1  mrg C We use a not very predictable branch in the frac code, therefore the cycle
     49      1.1  mrg C count wobbles somewhat.  With the alternative branch-free code, things run
     50      1.1  mrg C considerably slower on POWER4/PPC970 and POWER5.
     51      1.1  mrg 
     52      1.1  mrg C TODO: Add preinv entry point.
     53      1.1  mrg 
     54      1.1  mrg 
     55      1.1  mrg ASM_START()
     56      1.1  mrg 
     57      1.1  mrg EXTERN_FUNC(mpn_invert_limb)
     58      1.1  mrg 
     59  1.1.1.3  mrg PROLOGUE(mpn_divrem_1,toc)
     60      1.1  mrg 
                      C Divide the un-limb number at up, followed by fn fraction limbs that are
                      C taken as zero, by the single limb d.  un+fn quotient limbs are stored at
                      C qp, from the most significant limb downwards, and the remainder is
                      C returned in r3.
                      C
                      C Register roles after the prologue:
                      C   r25 = fn                     r26 = up
                      C   r27 = normalization count    r28 = integer limbs still to divide
                      C   r29 = quotient store pointer, moving downwards
                      C   r30 = d, later normalized d  r31 = running remainder
                      C   r3  = result of mpn_invert_limb while the loops run
                      C
                      C Three division loops follow: uloop for integer limbs with an
                      C unnormalized d, nloop for integer limbs with a normalized d, and ufloop
                      C for the fraction limbs in both cases.
     61      1.1  mrg 	mfcr	r12	C r12 = CR; cr4 is written below and restored on exit
     62      1.1  mrg 	add.	r10, r6, r4	C r10 = un + fn; cr0 tests it against zero
     63      1.1  mrg 	std	r25, -56(r1)
     64      1.1  mrg 	mr	r25, r4	C r25 = fn
     65      1.1  mrg 	mflr	r0	C r0 = return address; saved since we call out
     66      1.1  mrg 	std	r26, -48(r1)
     67      1.1  mrg 	mr	r26, r5	C r26 = up
     68      1.1  mrg 	std	r28, -32(r1)
     69      1.1  mrg 	mr	r28, r6	C r28 = un
     70      1.1  mrg 	std	r29, -24(r1)
     71      1.1  mrg 	mr	r29, r3	C r29 = qp
     72      1.1  mrg 	li	r3, 0	C return value 0 if there is nothing to divide
     73      1.1  mrg 	std	r30, -16(r1)
     74      1.1  mrg 	mr	r30, r7	C r30 = d
     75      1.1  mrg 	std	r31, -8(r1)
     76      1.1  mrg 	li	r31, 0	C remainder starts at 0
     77      1.1  mrg 	std	r27, -40(r1)
     78      1.1  mrg 	std	r0, 16(r1)	C save LR
     79      1.1  mrg 	stw	r12, 8(r1)	C save CR
     80      1.1  mrg 	stdu	r1, -176(r1)	C allocate a 176-byte frame; the saves above lie inside it
     81      1.1  mrg 	beq-	cr0, L(1)	C un + fn == 0: return 0
     82      1.1  mrg 	cmpdi	cr7, r7, 0	C d negative as signed means its top bit is set
     83      1.1  mrg 	sldi	r0, r10, 3
     84      1.1  mrg 	add	r11, r0, r29	C r11 = qp + 8*(un+fn)
     85      1.1  mrg 	addi	r29, r11, -8	C r29 = address of the top quotient limb
     86      1.1  mrg 	blt-	cr7, L(162)	C d already normalized
     87      1.1  mrg 	cmpdi	cr4, r6, 0	C cr4.eq = no integer limbs; tested after the call
     88      1.1  mrg 	beq+	cr4, L(71)
     89      1.1  mrg L(163):
                      C Unnormalized d with un > 0.  If the top limb is below d then the top
                      C quotient limb is 0 and that limb becomes the initial remainder.
     90      1.1  mrg 	sldi	r9, r6, 3
     91      1.1  mrg 	add	r9, r9, r5
     92      1.1  mrg 	ld	r7, -8(r9)	C r7 = up[un-1]
     93      1.1  mrg 	cmpld	cr7, r7, r30
     94      1.1  mrg 	bge-	cr7, L(71)	C top limb >= d: no shortcut
     95      1.1  mrg 	cmpdi	cr7, r10, 1	C un + fn == 1 ?
     96      1.1  mrg 	li	r0, 0
     97      1.1  mrg 	mr	r31, r7	C remainder = top limb
     98      1.1  mrg 	std	r0, -8(r11)	C top quotient limb = 0
     99      1.1  mrg 	addi	r29, r29, -8
    100      1.1  mrg 	mr	r3, r7	C return value if that was the only limb
    101      1.1  mrg 	beq-	cr7, L(1)
    102      1.1  mrg 	addi	r28, r6, -1	C one integer limb consumed
    103      1.1  mrg 	cmpdi	cr4, r28, 0	C refresh cr4.eq for the test after the call
    104      1.1  mrg L(71):
    105      1.1  mrg 	cntlzd	r27, r30	C r27 = number of leading zero bits in d
    106      1.1  mrg 	sld	r30, r30, r27	C normalize d
    107      1.1  mrg 	sld	r31, r31, r27	C scale the remainder to match
    108      1.1  mrg 	mr	r3, r30
    109      1.1  mrg 	CALL(	mpn_invert_limb)	C r3 = inverse of normalized d - presumably its reciprocal, confirm in mpn_invert_limb
    110      1.1  mrg 	beq-	cr4, L(110)	C no integer limbs left: go to the fraction loop
    111      1.1  mrg 	sldi	r9, r28, 3
    112      1.1  mrg 	addic.	r6, r28, -2	C cr0.lt if only one integer limb remains
    113      1.1  mrg 	add	r9, r9, r26
    114      1.1  mrg 	subfic	r5, r27, 64	C r5 = 64 - r27
    115      1.1  mrg 	ld	r8, -8(r9)	C r8 = up[r28-1]
    116      1.1  mrg 	srd	r0, r8, r5
    117      1.1  mrg 	or	r31, r31, r0	C move the high r27 bits of the limb into the remainder
    118      1.1  mrg 	sld	r7, r8, r27	C r7 = rest of the limb, left-justified
    119      1.1  mrg 	blt-	cr0, L(154)
    120      1.1  mrg 	addi	r28, r28, -1
    121      1.1  mrg 	mtctr	r28	C loop count = r28 - 1; the last limb is done at L(154)
    122      1.1  mrg 	sldi	r6, r6, 3	C r6 = byte offset of the next lower limb
    123      1.1  mrg 	ALIGN(16)
    124      1.1  mrg L(uloop):
                      C One quotient limb per iteration, unnormalized d.  r9 is the numerator
                      C limb realigned to the normalized d, built from r7 and the next limb r8.
    125      1.1  mrg 	ldx	r8, r26, r6	C r8 = next lower numerator limb
    126  1.1.1.2  mrg 	nop
    127      1.1  mrg 	mulld	r0, r31, r3	C r0 = low half of r * inverse
    128      1.1  mrg 	mulhdu	r10, r31, r3	C r10 = high half of r * inverse
    129  1.1.1.2  mrg 	addi	r11, r31, 1	C r11 = r + 1
    130      1.1  mrg 	srd	r9, r8, r5
    131  1.1.1.2  mrg 	addi	r6, r6, -8
    132      1.1  mrg 	or	r9, r7, r9	C r9 = realigned numerator limb
    133      1.1  mrg 	addc	r0, r0, r9	C ql = low half + limb
    134      1.1  mrg 	adde	r10, r10, r11	C qh = high half + r + 1 + carry
    135      1.1  mrg 	mulld	r31, r10, r30
    136      1.1  mrg 	subf	r31, r31, r9	C r = limb - qh * d
    137  1.1.1.2  mrg 	subfc	r0, r31, r0	C carry = (r <= ql)
    138  1.1.1.2  mrg 	subfe	r0, r0, r0	C r0 = -(r > ql)
    139  1.1.1.2  mrg 	and	r9, r30, r0	C r9 = d if r > ql else 0
    140  1.1.1.2  mrg 	add	r31, r31, r9	C r += d when r > ql
    141  1.1.1.2  mrg 	add	r10, r0, r10	C qh -= (r > ql)
    142      1.1  mrg 	cmpld	cr7, r31, r30
    143      1.1  mrg 	bge-	cr7, L(164)	C rare fix-up when r >= d
    144      1.1  mrg L(123):
    145      1.1  mrg 	std	r10, 0(r29)	C store quotient limb
    146      1.1  mrg 	addi	r29, r29, -8
    147      1.1  mrg 	sld	r7, r8, r27	C carry the low bits of r8 into the next step
    148      1.1  mrg 	bdnz	L(uloop)
    149      1.1  mrg L(154):
                      C Last integer limb for unnormalized d: same step as uloop but with r7
                      C alone as the numerator limb, since there is no lower limb to merge in.
    150      1.1  mrg 	addi	r11, r31, 1
    151      1.1  mrg 	nop
    152      1.1  mrg 	mulld	r0, r31, r3
    153      1.1  mrg 	mulhdu	r8, r31, r3
    154      1.1  mrg 	addc	r0, r0, r7	C ql
    155      1.1  mrg 	adde	r8, r8, r11	C qh
    156      1.1  mrg 	mulld	r31, r8, r30
    157      1.1  mrg 	subf	r31, r31, r7	C r = limb - qh * d
    158      1.1  mrg 	subfc	r0, r0, r31	C carry = (r >= ql)
    159      1.1  mrg 	subfe	r0, r0, r0	C r0 = -(r < ql)
    160      1.1  mrg 	not	r7, r0	C r7 = -(r >= ql)
    161      1.1  mrg 	add	r8, r7, r8	C qh -= (r >= ql)
    162      1.1  mrg 	andc	r0, r30, r0	C r0 = d if r >= ql else 0
    163      1.1  mrg 	add	r31, r31, r0
    164      1.1  mrg 	cmpld	cr7, r31, r30
    165      1.1  mrg 	bge-	cr7, L(165)	C rare fix-up when r >= d
    166      1.1  mrg L(134):
    167      1.1  mrg 	std	r8, 0(r29)
    168      1.1  mrg 	addi	r29, r29, -8
    169      1.1  mrg L(110):
                      C Fraction limbs: fn further quotient limbs with a zero numerator limb.
    170      1.1  mrg 	addic.	r0, r25, -1	C cr0.lt if fn == 0
    171      1.1  mrg 	blt-	cr0, L(156)
    172      1.1  mrg 	mtctr	r25	C loop count = fn
    173      1.1  mrg 	neg	r9, r30	C r9 = -d so that r = -d * qh is one multiply
    174      1.1  mrg 	ALIGN(16)
    175      1.1  mrg L(ufloop):
    176      1.1  mrg 	addi	r11, r31, 1
    177      1.1  mrg 	nop
    178  1.1.1.2  mrg 	mulld	r0, r3, r31	C ql = low half of inverse * r
    179      1.1  mrg 	mulhdu	r10, r3, r31
    180      1.1  mrg 	add	r10, r10, r11	C qh = high half + r + 1
    181      1.1  mrg 	mulld	r31, r9, r10	C r = 0 - qh * d
                      C The m4 test below compares 0 with 1 and therefore always selects the
                      C second variant, the one with a branch.  The first is the branch-free
                      C alternative mentioned in the notes at the top of the file.
                      C NOTE(review): unlike the integer loops, neither variant has a fix-up
                      C for r >= d; presumably it cannot occur with a zero limb - confirm.
    182      1.1  mrg ifelse(0,1,`
    183  1.1.1.2  mrg 	subfc	r0, r0, r31
    184      1.1  mrg 	subfe	r0, r0, r0	C r0 = -(r < ql)
    185      1.1  mrg 	not	r7, r0
    186      1.1  mrg 	add	r10, r7, r10	C qh -= (r >= ql)
    187      1.1  mrg 	andc	r0, r30, r0
    188      1.1  mrg 	add	r31, r31, r0
    189      1.1  mrg ',`
    190  1.1.1.2  mrg 	cmpld	cr7, r31, r0	C compare r with ql
    191      1.1  mrg 	blt	cr7, L(29)	C r < ql: no adjustment
    192      1.1  mrg 	add	r31, r30, r31	C r += d
    193      1.1  mrg 	addi	r10, r10, -1	C qh -= 1
    194      1.1  mrg L(29):
    195      1.1  mrg ')
    196      1.1  mrg 	std	r10, 0(r29)
    197      1.1  mrg 	addi	r29, r29, -8
    198      1.1  mrg 	bdnz	L(ufloop)
    199      1.1  mrg L(156):
    200      1.1  mrg 	srd	r3, r31, r27	C return the remainder with the normalization undone
    201      1.1  mrg L(1):
    202      1.1  mrg 	addi	r1, r1, 176	C pop the frame
    203      1.1  mrg 	ld	r0, 16(r1)
    204      1.1  mrg 	lwz	r12, 8(r1)	C reload the saved CR
    205      1.1  mrg 	mtlr	r0
    206      1.1  mrg 	ld	r25, -56(r1)
    207      1.1  mrg 	ld	r26, -48(r1)
    208      1.1  mrg 	mtcrf	8, r12	C field mask 8 restores cr4 only
    209      1.1  mrg 	ld	r27, -40(r1)
    210      1.1  mrg 	ld	r28, -32(r1)
    211      1.1  mrg 	ld	r29, -24(r1)
    212      1.1  mrg 	ld	r30, -16(r1)
    213      1.1  mrg 	ld	r31, -8(r1)
    214      1.1  mrg 	blr
    215      1.1  mrg L(162):
                      C Normalized d.  With un > 0 the top quotient limb is 1 if the top
                      C numerator limb is >= d and 0 otherwise, computed without a branch.
    216      1.1  mrg 	cmpdi	cr7, r6, 0
    217      1.1  mrg 	beq-	cr7, L(8)	C un == 0: only fraction limbs to produce
    218      1.1  mrg 	sldi	r9, r6, 3
    219      1.1  mrg 	addi	r29, r29, -8
    220      1.1  mrg 	add	r9, r9, r5
    221      1.1  mrg 	addi	r28, r6, -1	C one integer limb consumed
    222      1.1  mrg 	ld	r31, -8(r9)	C r31 = up[un-1]
    223      1.1  mrg 	subfc	r9, r7, r31	C carry = (top limb >= d); the difference is discarded
    224      1.1  mrg 	li	r9, 0
    225      1.1  mrg 	adde	r9, r9, r9	C r9 = carry = top quotient limb
    226      1.1  mrg 	neg	r0, r9	C r0 = all ones if the quotient limb is 1
    227      1.1  mrg 	std	r9, -8(r11)	C store the top quotient limb
    228      1.1  mrg 	and	r0, r0, r7
    229      1.1  mrg 	subf	r31, r0, r31	C remainder = top limb - d when the quotient limb is 1
    230      1.1  mrg L(8):
    231      1.1  mrg 	mr	r3, r30
    232      1.1  mrg 	CALL(	mpn_invert_limb)	C r3 = inverse of d, used as the multiplier below
    233  1.1.1.2  mrg 	li	r27, 0	C no normalization count to undo at L(156)
    234      1.1  mrg 	addic.	r6, r28, -1	C cr0.lt if no integer limbs remain
    235  1.1.1.2  mrg 	blt-	cr0, L(110)
    236      1.1  mrg 	mtctr	r28	C loop count = remaining integer limbs
    237      1.1  mrg 	sldi	r6, r6, 3	C r6 = byte offset of the next limb to read
    238      1.1  mrg 	ALIGN(16)
    239      1.1  mrg L(nloop):
                      C One quotient limb per iteration, normalized d: same step as uloop
                      C with the numerator limb used as loaded.
    240      1.1  mrg 	addi	r11, r31, 1
    241      1.1  mrg 	ldx	r8, r26, r6	C r8 = next lower numerator limb
    242      1.1  mrg 	mulld	r0, r31, r3
    243      1.1  mrg 	mulhdu	r10, r31, r3
    244  1.1.1.2  mrg 	addi	r6, r6, -8
    245  1.1.1.2  mrg 	addc	r0, r0, r8	C ql
    246      1.1  mrg 	adde	r10, r10, r11	C qh
    247      1.1  mrg 	mulld	r31, r10, r30
    248      1.1  mrg 	subf	r31, r31, r8	C r = nl - qh * d
    249  1.1.1.2  mrg 	subfc	r0, r31, r0	C carry = (r <= ql)
    250  1.1.1.2  mrg 	subfe	r0, r0, r0	C r0 = -(r > ql)
    251  1.1.1.2  mrg 	and	r9, r30, r0
    252  1.1.1.2  mrg 	add	r31, r31, r9	C r += d when r > ql
    253  1.1.1.2  mrg 	add	r10, r0, r10	C qh -= (r > ql)
    254      1.1  mrg 	cmpld	cr7, r31, r30
    255      1.1  mrg 	bge-	cr7, L(167)	C rare fix-up when r >= d
    256      1.1  mrg L(51):
    257      1.1  mrg 	std	r10, 0(r29)
    258      1.1  mrg 	addi	r29, r29, -8
    259      1.1  mrg 	bdnz	L(nloop)
    260  1.1.1.2  mrg 	b	L(110)
    261      1.1  mrg 
                      C Out-of-line fix-ups for the three integer steps, taken when r >= d:
                      C subtract d from the remainder, add 1 to the quotient limb and rejoin.
    262      1.1  mrg L(164):
    263      1.1  mrg 	subf	r31, r30, r31
    264      1.1  mrg 	addi	r10, r10, 1
    265      1.1  mrg 	b	L(123)
    266      1.1  mrg L(167):
    267      1.1  mrg 	subf	r31, r30, r31
    268      1.1  mrg 	addi	r10, r10, 1
    269      1.1  mrg 	b	L(51)
    270      1.1  mrg L(165):
    271      1.1  mrg 	subf	r31, r30, r31
    272      1.1  mrg 	addi	r8, r8, 1
    273      1.1  mrg 	b	L(134)
    274      1.1  mrg EPILOGUE()
    275
    275