dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4   ev5   ev6
C         1     3     1    subq   y = x - h
C        23    13     7    mulq   q = y * inverse
C        23    14     7    umulh  h = high (q * d)
C        --    --    --
C        47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise, say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc's __remqu might be a possibility.
ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C r16	src
	C r17	size
	C r18	d
	C r19	c

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`
	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

	ASSERT(r7,			C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = high (q * d)
	bne	r17, L(top)		C while 2 or more limbs left


	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = high (q * d)

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C With high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows.

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()
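The L(skip) path for the last limb, where high<divisor lets the final mulq/umulh pair be replaced by one subtract plus a conditional addback, can be sketched in C as follows. `modexact_final_skip` is an illustrative helper name, not a GMP function; the register comments map each line back to the asm above.

```c
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Model of the L(skip) path: with the high limb s below the divisor,
   the result is (climb + cbit) - s, with d added back on underflow. */
static mp_limb_t modexact_final_skip(mp_limb_t climb, mp_limb_t cbit,
                                     mp_limb_t s, mp_limb_t d)
{
    mp_limb_t c = climb + cbit;   /* addq   r19, r0, r19 */
    mp_limb_t diff = c - s;       /* subq   r19, r1, r2  */
    if (c < s)                    /* cmpult r19, r1, r3  */
        diff += d;                /* addback, the cmoveq picks this */
    return diff;
}
```

The branch-free asm computes both `c-s` and `c-s+d` unconditionally and selects with `cmoveq`, but the arithmetic is the same.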