/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.

Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
31 1.1 mrg
32 1.1 mrg #include "gmp.h"
33 1.1 mrg #include "gmp-impl.h"
34 1.1 mrg #include "longlong.h"
35 1.1 mrg
36 1.1 mrg #include "mpn/sparc64/sparc64.h"
37 1.1 mrg
38 1.1 mrg
/*                  64-bit divisor   32-bit divisor
                     cycles/limb      cycles/limb
                      (approx)         (approx)
   Ultrasparc 2i:       160              120
*/
44 1.1 mrg
45 1.1 mrg
/* 32-bit divisors are treated in special case code.  This requires 4 mulx
   per limb instead of 8 in the general case.

   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
   addressing, to get the two halves of each limb read in the correct order.
   This is kept in an adj variable.  Doing that measures about 6 c/l faster
   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
   3.2.1 at least).

   A simple udivx/umulx loop for the 32-bit case was attempted for small
   sizes, but at size==2 it was only about the same speed and at size==3 was
   slower.  */
59 1.1 mrg
/* Return the remainder of {src_limbptr, size_limbs} divided by d_limb, for
   any divisor, normalized or not.  Divisors fitting in 32 bits take a
   special half-limb path (4 mulx per limb instead of 8 -- see the comment
   block above).  */
static mp_limb_t
mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  int norm, norm_rshift;
  mp_limb_t src_high_limb;
  mp_size_t i;

  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);

  if (UNLIKELY (size_limbs == 0))
    return 0;

  src_high_limb = src_limbptr[size_limbs-1];

  /* udivx is good for size==1, and no need to bother checking limb<divisor,
     since if that's likely the caller should check */
  if (UNLIKELY (size_limbs == 1))
    return src_high_limb % d_limb;

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      /* 32-bit divisor: process the operand as 32-bit halfwords.  */
      unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask;
      mp_size_t size, adj;
      mp_limb_t dinv_limb;

      size = 2 * size_limbs;    /* halfwords */
      src = (unsigned *) src_limbptr;

      /* prospective initial remainder, if < d */
      r = src_high_limb >> 32;

      /* If the length of the source is uniformly distributed, then there's
         a 50% chance of the high 32-bits being zero, which we can skip. */
      if (r == 0)
        {
          r = (unsigned) src_high_limb;
          size--;
          ASSERT (size > 0);    /* because always even */
        }

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible. */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);    /* because size==1 handled above */
        }
      else
        r = 0;

      /* Leading zeros of d_limb within its low 32 bits.  NOTE(review):
         count_leading_zeros_32 (from sparc64.h) appears to count within the
         full 64-bit limb, hence the -32 adjustment -- confirm against the
         macro's definition.  */
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;          /* normalize so d_limb's bit 31 is set */

      /* When norm==0, norm_rshift would be 32 -- an undefined full-width
         shift on a 32-bit value.  The zero mask discards that shift's
         result instead of relying on it.  */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
      i = size-1;
      adj = HALF_ENDIAN_ADJ (i);
      n1 = src [i + adj];
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_half_limb (dinv_limb, d_limb);
      /* i decreases by one each iteration, so the big-endian halfword
         adjustment alternates sign; flipping adj tracks that without
         re-evaluating HALF_ENDIAN_ADJ in the loop (see comment above).  */
      adj = -adj;

      for (i--; i >= 0; i--)
        {
          n0 = src [i + adj];
          adj = -adj;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);

      /* The low norm bits of the normalized remainder are the zeros shifted
         in at the bottom; unshift to get the true remainder.  */
      ASSERT ((r & ((1 << norm) - 1)) == 0);
      return r >> norm;
    }
  else
    {
      /* Divisor wider than 32 bits: whole-limb loop, 8 mulx per limb.  */
      mp_srcptr src;
      mp_size_t size;
      mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask;

      src = src_limbptr;
      size = size_limbs;
      r = src_high_limb;  /* initial remainder */

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible. */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);    /* because size==1 handled above */
        }
      else
        r = 0;

      count_leading_zeros (norm, d_limb);
      d_limb <<= norm;          /* normalize so the high bit is set */

      /* Zero mask avoids an undefined shift by GMP_LIMB_BITS when norm==0.
         When norm!=0 the 32-bit 0xFFFFFFFF mask is still wide enough:
         d_limb > 0xFFFFFFFF here, so norm <= 31 and n1 >> norm_rshift has
         at most 31 significant bits.  */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      src += size;
      n1 = *--src;
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_limb (dinv, d_limb);

      for (i = size-2; i >= 0; i--)
        {
          n0 = *--src;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);

      /* Unshift the normalized remainder.  */
      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
      return r >> norm;
    }
}
189 1.1.1.2 mrg
190 1.1.1.2 mrg mp_limb_t
191 1.1.1.2 mrg mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
192 1.1.1.2 mrg {
193 1.1.1.2 mrg ASSERT (n >= 0);
194 1.1.1.2 mrg ASSERT (b != 0);
195 1.1.1.2 mrg
196 1.1.1.2 mrg /* Should this be handled at all? Rely on callers? Note un==0 is currently
197 1.1.1.2 mrg required by mpz/fdiv_r_ui.c and possibly other places. */
198 1.1.1.2 mrg if (n == 0)
199 1.1.1.2 mrg return 0;
200 1.1.1.2 mrg
201 1.1.1.2 mrg if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
202 1.1.1.2 mrg {
203 1.1.1.2 mrg if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
204 1.1.1.2 mrg {
205 1.1.1.2 mrg return mpn_mod_1_anynorm (ap, n, b);
206 1.1.1.2 mrg }
207 1.1.1.2 mrg else
208 1.1.1.2 mrg {
209 1.1.1.2 mrg mp_limb_t pre[4];
210 1.1.1.2 mrg mpn_mod_1_1p_cps (pre, b);
211 1.1.1.2 mrg return mpn_mod_1_1p (ap, n, b, pre);
212 1.1.1.2 mrg }
213 1.1.1.2 mrg }
214 1.1.1.2 mrg else
215 1.1.1.2 mrg {
216 1.1.1.2 mrg if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
217 1.1.1.2 mrg {
218 1.1.1.2 mrg return mpn_mod_1_anynorm (ap, n, b);
219 1.1.1.2 mrg }
220 1.1.1.2 mrg else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
221 1.1.1.2 mrg {
222 1.1.1.2 mrg mp_limb_t pre[4];
223 1.1.1.2 mrg mpn_mod_1_1p_cps (pre, b);
224 1.1.1.2 mrg return mpn_mod_1_1p (ap, n, b << pre[1], pre);
225 1.1.1.2 mrg }
226 1.1.1.2 mrg else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
227 1.1.1.2 mrg {
228 1.1.1.2 mrg mp_limb_t pre[5];
229 1.1.1.2 mrg mpn_mod_1s_2p_cps (pre, b);
230 1.1.1.2 mrg return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
231 1.1.1.2 mrg }
232 1.1.1.2 mrg else
233 1.1.1.2 mrg {
234 1.1.1.2 mrg mp_limb_t pre[7];
235 1.1.1.2 mrg mpn_mod_1s_4p_cps (pre, b);
236 1.1.1.2 mrg return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
237 1.1.1.2 mrg }
238 1.1.1.2 mrg }
239 1.1.1.2 mrg }
240