dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4   ev5   ev6
C         1     3     1    subq   y = x - h
C        23    13     7    mulq   q = y * inverse
C        23    14     7    umulh  h = high (q * d)
C        --    --    --
C        47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc __remqu might be a possibility.

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C Entry registers (Alpha calling convention):
	C r16	src
	C r17	size
	C r18	d     (odd divisor)
	C r19	c     (initial carry limb)

	C Compute the 64-bit binary inverse of d by table lookup (8 bits)
	C followed by three Newton iterations, each doubling the precision.

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`
	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	C Scheduled early: first-limb load and loop setup hide under the
	C multiply latencies of the inverse calculation.
	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

	ASSERT(r7, C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = high (q * d)
	bne	r17, L(top)		C while 2 or more limbs left


	C Last limb: if high<divisor we can finish with a subtract and
	C conditional addback, saving the multiplies.

	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = high (q * d)

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()