Home | History | Annotate | Line # | Download | only in alpha
      1      1.1  mrg dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C      cycles/limb
     35      1.1  mrg C EV4:    47
     36      1.1  mrg C EV5:    30
     37      1.1  mrg C EV6:    15
     38      1.1  mrg 
     39      1.1  mrg 
     40      1.1  mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
     41      1.1  mrg C                                mp_limb_t c)
     42      1.1  mrg C
     43      1.1  mrg C This code follows the "alternate" code in mpn/generic/mode1o.c,
     44      1.1  mrg C eliminating cbit+climb from the dependent chain.  This leaves,
     45      1.1  mrg C
     46      1.1  mrg C        ev4   ev5   ev6
     47      1.1  mrg C         1     3     1    subq   y = x - h
     48      1.1  mrg C        23    13     7    mulq   q = y * inverse
     49      1.1  mrg C        23    14     7    umulh  h = high (q * d)
     50      1.1  mrg C        --    --    --
     51      1.1  mrg C        47    30    15
     52      1.1  mrg C
     53      1.1  mrg C In each case, the load latency, loop control, and extra carry bit handling
     54      1.1  mrg C hide under the multiply latencies.  Those latencies are long enough that
     55      1.1  mrg C we don't need to worry about alignment or pairing to squeeze out
     56      1.1  mrg C performance.
     57      1.1  mrg C
     58      1.1  mrg C For the first limb, some of the loop code is broken out and scheduled back
     59      1.1  mrg C since it can be done earlier.
     60      1.1  mrg C
     61      1.1  mrg C   - The first ldq src[0] is near the start of the routine, for maximum
     62      1.1  mrg C     time from memory.
     63      1.1  mrg C
     64      1.1  mrg C   - The subq y=x-climb can be done without waiting for the inverse.
     65      1.1  mrg C
     66      1.1  mrg C   - The mulq y*inverse is replicated after the final subq for the inverse,
     67      1.1  mrg C     instead of branching to the mulq in the main loop.  On ev4 a branch
     68      1.1  mrg C     there would cost cycles, but we can hide them under the mulq latency.
     69      1.1  mrg C
     70      1.1  mrg C For the last limb, high<divisor is tested and if that's true a subtract
     71      1.1  mrg C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
     72      1.1  mrg C data-dependent branch, but we're waiting for umulh so any penalty should
     73      1.1  mrg C hide there.  The multiplies saved would be worth the cost anyway.
     74      1.1  mrg C
     75      1.1  mrg C Enhancements:
     76      1.1  mrg C
     77      1.1  mrg C For size==1, a plain division (done bitwise say) might be faster than
     78      1.1  mrg C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
     79      1.1  mrg C ev5.  A call to gcc __remqu might be a possibility.
     80      1.1  mrg 
      81      1.1  mrg ASM_START()
      82      1.1  mrg PROLOGUE(mpn_modexact_1c_odd,gp)
      83      1.1  mrg 
      84      1.1  mrg 	C r16	src
      85      1.1  mrg 	C r17	size
      86      1.1  mrg 	C r18	d
      87      1.1  mrg 	C r19	c
      88      1.1  mrg 
	C Fetch an 8-bit modular inverse of d from binvert_limb_table,
	C indexed by bits 1..7 of d (d is odd, so bit 0 is known to be 1).
      89      1.1  mrg 	LEA(r0, binvert_limb_table)
      90      1.1  mrg 	srl	r18, 1, r20		C d >> 1
      91      1.1  mrg 
      92      1.1  mrg 	and	r20, 127, r20		C idx = d>>1 & 0x7F
      93      1.1  mrg 
      94      1.1  mrg 	addq	r0, r20, r21		C table + idx
      95      1.1  mrg 
	C ldbu needs the BWX extension (ev56+); otherwise extract the byte
	C from an unaligned quadword load.
      96      1.1  mrg ifelse(bwx_available_p,1,
      97      1.1  mrg `	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
      98      1.1  mrg ',`
      99      1.1  mrg 	ldq_u	r20, 0(r21)		C table[idx] qword
     100      1.1  mrg 	extbl	r20, r21, r20		C table[idx], inverse 8 bits
     101      1.1  mrg ')
     102      1.1  mrg 
	C Newton iterations i' = 2*i - i*i*d, each doubling the number of
	C correct low bits of the inverse: 8 -> 16 -> 32 -> 64.  The first
	C two rounds use 32-bit mull; the last needs the full 64-bit mulq.
	C The src[0] load and initial cbit/size setup are interleaved here
	C to hide under the multiply latencies (see header comments).
     103      1.1  mrg 	mull	r20, r20, r7		C i*i
     104      1.1  mrg 	addq	r20, r20, r20		C 2*i
     105      1.1  mrg 
     106      1.1  mrg 	ldq	r2, 0(r16)		C x = s = src[0]
     107      1.1  mrg 	lda	r17, -1(r17)		C size--
     108      1.1  mrg 	clr	r0			C initial cbit=0
     109      1.1  mrg 
     110      1.1  mrg 	mull	r7, r18, r7		C i*i*d
     111      1.1  mrg 
     112      1.1  mrg 	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits
     113      1.1  mrg 
     114      1.1  mrg 	mull	r20, r20, r7		C i*i
     115      1.1  mrg 	addq	r20, r20, r20		C 2*i
     116      1.1  mrg 
     117      1.1  mrg 	mull	r7, r18, r7		C i*i*d
     118      1.1  mrg 
     119      1.1  mrg 	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits
     120      1.1  mrg 
     121      1.1  mrg 	mulq	r20, r20, r7		C i*i
     122      1.1  mrg 	addq	r20, r20, r20		C 2*i
     123      1.1  mrg 
     124      1.1  mrg 	mulq	r7, r18, r7		C i*i*d
     125      1.1  mrg 	subq	r2, r19, r3		C y = x - climb
     126      1.1  mrg 
     127      1.1  mrg 	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits
     128      1.1  mrg 
     129      1.1  mrg ASSERT(r7, C should have d*inv==1 mod 2^64
     130      1.1  mrg `	mulq	r18, r20, r7
     131      1.1  mrg 	cmpeq	r7, 1, r7')
     132      1.1  mrg 
     133      1.1  mrg 	mulq	r3, r20, r4		C first q = y * inv
     134      1.1  mrg 
     135      1.1  mrg 	beq	r17, L(one)		C if size==1
	C Enter the loop body below its load/subq prologue: the first q is
	C already in flight from the mulq just above.
     136      1.1  mrg 	br	L(entry)
     137      1.1  mrg 
     138      1.1  mrg 
     139      1.1  mrg L(top):
     140      1.1  mrg 	C r0	cbit
     141      1.1  mrg 	C r16	src, incrementing
     142      1.1  mrg 	C r17	size, decrementing
     143      1.1  mrg 	C r18	d
     144      1.1  mrg 	C r19	climb
     145      1.1  mrg 	C r20	inv
     146      1.1  mrg 
     147      1.1  mrg 	ldq	r1, 0(r16)		C s = src[i]
     148      1.1  mrg 	subq	r1, r0, r2		C x = s - cbit
     149      1.1  mrg 	cmpult	r1, r0, r0		C new cbit = s < cbit
     150      1.1  mrg 
     151      1.1  mrg 	subq	r2, r19, r3		C y = x - climb
     152      1.1  mrg 
     153      1.1  mrg 	mulq	r3, r20, r4		C q = y * inv
     154      1.1  mrg L(entry):
     155      1.1  mrg 	cmpult	r2, r19, r5		C cbit2 = x < climb
     156      1.1  mrg 	addq	r5, r0, r0		C cbit += cbit2
     157      1.1  mrg 	lda	r16, 8(r16)		C src++
     158      1.1  mrg 	lda	r17, -1(r17)		C size--
     159      1.1  mrg 
     160      1.1  mrg 	umulh	r4, r18, r19		C climb = q * d
     161      1.1  mrg 	bne	r17, L(top)		C while 2 or more limbs left
     162      1.1  mrg 
     163      1.1  mrg 
     164      1.1  mrg 
	C Last limb: if high>=divisor a full quotient step is still needed
	C (code repeated from the loop); if high<divisor, branch to the
	C cheaper subtract-and-addback at L(skip).
     165      1.1  mrg 	C r0	cbit
     166      1.1  mrg 	C r18	d
     167      1.1  mrg 	C r19	climb
     168      1.1  mrg 	C r20	inv
     169      1.1  mrg 
     170      1.1  mrg 	ldq	r1, 0(r16)		C s = src[size-1] high limb
     171      1.1  mrg 
     172      1.1  mrg 	cmpult	r1, r18, r2		C test high<divisor
     173      1.1  mrg 	bne	r2, L(skip)		C skip if so
     174      1.1  mrg 
     175      1.1  mrg 	C can't skip a division, repeat loop code
     176      1.1  mrg 
     177      1.1  mrg 	subq	r1, r0, r2		C x = s - cbit
     178      1.1  mrg 	cmpult	r1, r0, r0		C new cbit = s < cbit
     179      1.1  mrg 
     180      1.1  mrg 	subq	r2, r19, r3		C y = x - climb
     181      1.1  mrg 
     182      1.1  mrg 	mulq	r3, r20, r4		C q = y * inv
	C size==1 joins here from the beq above, with q, x (r2) and
	C climb=c (r19) already set up by the prologue code.
     183      1.1  mrg L(one):
     184      1.1  mrg 	cmpult	r2, r19, r5		C cbit2 = x < climb
     185      1.1  mrg 	addq	r5, r0, r0		C cbit += cbit2
     186      1.1  mrg 
     187      1.1  mrg 	umulh	r4, r18, r19		C climb = q * d
     188      1.1  mrg 
     189      1.1  mrg 	addq	r19, r0, r0		C return climb + cbit
     190      1.1  mrg 	ret	r31, (r26), 1
     191      1.1  mrg 
     192      1.1  mrg 
     193      1.1  mrg 	ALIGN(8)
     194      1.1  mrg L(skip):
     195      1.1  mrg 	C with high<divisor, the final step can be just (cbit+climb)-s and
     196      1.1  mrg 	C an addback of d if that underflows
     197      1.1  mrg 
     198      1.1  mrg 	addq	r19, r0, r19		C c = climb + cbit
     199      1.1  mrg 
     200      1.1  mrg 	subq	r19, r1, r2		C c - s
     201      1.1  mrg 	cmpult	r19, r1, r3		C c < s
     202      1.1  mrg 
     203      1.1  mrg 	addq	r2, r18, r0		C return c-s + divisor
     204      1.1  mrg 
     205      1.1  mrg 	cmoveq	r3, r2, r0		C return c-s if no underflow
     206      1.1  mrg 	ret	r31, (r26), 1
     207      1.1  mrg 
     208      1.1  mrg EPILOGUE()
     209      1.1  mrg ASM_END()
    210