mod_1.c revision 1.1 1 1.1 mrg /* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.
2 1.1 mrg
3 1.1 mrg Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2003 Free Software Foundation,
4 1.1 mrg Inc.
5 1.1 mrg
6 1.1 mrg This file is part of the GNU MP Library.
7 1.1 mrg
8 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
9 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
10 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
11 1.1 mrg option) any later version.
12 1.1 mrg
13 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
14 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 1.1 mrg License for more details.
17 1.1 mrg
18 1.1 mrg You should have received a copy of the GNU Lesser General Public License
19 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
20 1.1 mrg
21 1.1 mrg #include "gmp.h"
22 1.1 mrg #include "gmp-impl.h"
23 1.1 mrg #include "longlong.h"
24 1.1 mrg
25 1.1 mrg #include "mpn/sparc64/sparc64.h"
26 1.1 mrg
27 1.1 mrg
28 1.1 mrg /* 64-bit divisor 32-bit divisor
29 1.1 mrg cycles/limb cycles/limb
30 1.1 mrg (approx) (approx)
31 1.1 mrg Ultrasparc 2i: 160 120
32 1.1 mrg */
33 1.1 mrg
34 1.1 mrg
35 1.1 mrg /* 32-bit divisors are treated in special case code. This requires 4 mulx
36 1.1 mrg per limb instead of 8 in the general case.
37 1.1 mrg
38 1.1 mrg For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
39 1.1 mrg addressing, to get the two halves of each limb read in the correct order.
40 1.1 mrg This is kept in an adj variable. Doing that measures about 6 c/l faster
41 1.1 mrg than just writing HALF_ENDIAN_ADJ(i) in the loop. The latter shouldn't
42 1.1 mrg be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
43 1.1 mrg 3.2.1 at least).
44 1.1 mrg
45 1.1 mrg A simple udivx/umulx loop for the 32-bit case was attempted for small
46 1.1 mrg sizes, but at size==2 it was only about the same speed and at size==3 was
47 1.1 mrg slower. */
48 1.1 mrg
49 1.1 mrg mp_limb_t
50 1.1 mrg mpn_mod_1 (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
51 1.1 mrg {
52 1.1 mrg int norm, norm_rshift;
53 1.1 mrg mp_limb_t src_high_limb;
54 1.1 mrg mp_size_t i;
55 1.1 mrg
56 1.1 mrg ASSERT (size_limbs >= 0);
57 1.1 mrg ASSERT (d_limb != 0);
58 1.1 mrg
59 1.1 mrg if (UNLIKELY (size_limbs == 0))
60 1.1 mrg return 0;
61 1.1 mrg
62 1.1 mrg src_high_limb = src_limbptr[size_limbs-1];
63 1.1 mrg
64 1.1 mrg /* udivx is good for size==1, and no need to bother checking limb<divisor,
65 1.1 mrg since if that's likely the caller should check */
66 1.1 mrg if (UNLIKELY (size_limbs == 1))
67 1.1 mrg return src_high_limb % d_limb;
68 1.1 mrg
69 1.1 mrg if (d_limb <= CNST_LIMB(0xFFFFFFFF))
70 1.1 mrg {
71 1.1 mrg unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask;
72 1.1 mrg mp_size_t size, adj;
73 1.1 mrg mp_limb_t dinv_limb;
74 1.1 mrg
75 1.1 mrg size = 2 * size_limbs; /* halfwords */
76 1.1 mrg src = (unsigned *) src_limbptr;
77 1.1 mrg
78 1.1 mrg /* prospective initial remainder, if < d */
79 1.1 mrg r = src_high_limb >> 32;
80 1.1 mrg
81 1.1 mrg /* If the length of the source is uniformly distributed, then there's
82 1.1 mrg a 50% chance of the high 32-bits being zero, which we can skip. */
83 1.1 mrg if (r == 0)
84 1.1 mrg {
85 1.1 mrg r = (unsigned) src_high_limb;
86 1.1 mrg size--;
87 1.1 mrg ASSERT (size > 0); /* because always even */
88 1.1 mrg }
89 1.1 mrg
90 1.1 mrg /* Skip a division if high < divisor. Having the test here before
91 1.1 mrg normalizing will still skip as often as possible. */
92 1.1 mrg if (r < d_limb)
93 1.1 mrg {
94 1.1 mrg size--;
95 1.1 mrg ASSERT (size > 0); /* because size==1 handled above */
96 1.1 mrg }
97 1.1 mrg else
98 1.1 mrg r = 0;
99 1.1 mrg
100 1.1 mrg count_leading_zeros_32 (norm, d_limb);
101 1.1 mrg norm -= 32;
102 1.1 mrg d_limb <<= norm;
103 1.1 mrg
104 1.1 mrg norm_rshift = 32 - norm;
105 1.1 mrg norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
106 1.1 mrg i = size-1;
107 1.1 mrg adj = HALF_ENDIAN_ADJ (i);
108 1.1 mrg n1 = src [i + adj];
109 1.1 mrg r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
110 1.1 mrg
111 1.1 mrg invert_half_limb (dinv_limb, d_limb);
112 1.1 mrg adj = -adj;
113 1.1 mrg
114 1.1 mrg for (i--; i >= 0; i--)
115 1.1 mrg {
116 1.1 mrg n0 = src [i + adj];
117 1.1 mrg adj = -adj;
118 1.1 mrg nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
119 1.1 mrg udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
120 1.1 mrg n1 = n0;
121 1.1 mrg }
122 1.1 mrg
123 1.1 mrg /* same as loop, but without n0 */
124 1.1 mrg nshift = n1 << norm;
125 1.1 mrg udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
126 1.1 mrg
127 1.1 mrg ASSERT ((r & ((1 << norm) - 1)) == 0);
128 1.1 mrg return r >> norm;
129 1.1 mrg }
130 1.1 mrg else
131 1.1 mrg {
132 1.1 mrg mp_srcptr src;
133 1.1 mrg mp_size_t size;
134 1.1 mrg mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask;
135 1.1 mrg
136 1.1 mrg src = src_limbptr;
137 1.1 mrg size = size_limbs;
138 1.1 mrg r = src_high_limb; /* initial remainder */
139 1.1 mrg
140 1.1 mrg /* Skip a division if high < divisor. Having the test here before
141 1.1 mrg normalizing will still skip as often as possible. */
142 1.1 mrg if (r < d_limb)
143 1.1 mrg {
144 1.1 mrg size--;
145 1.1 mrg ASSERT (size > 0); /* because size==1 handled above */
146 1.1 mrg }
147 1.1 mrg else
148 1.1 mrg r = 0;
149 1.1 mrg
150 1.1 mrg count_leading_zeros (norm, d_limb);
151 1.1 mrg d_limb <<= norm;
152 1.1 mrg
153 1.1 mrg norm_rshift = GMP_LIMB_BITS - norm;
154 1.1 mrg norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
155 1.1 mrg
156 1.1 mrg src += size;
157 1.1 mrg n1 = *--src;
158 1.1 mrg r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
159 1.1 mrg
160 1.1 mrg invert_limb (dinv, d_limb);
161 1.1 mrg
162 1.1 mrg for (i = size-2; i >= 0; i--)
163 1.1 mrg {
164 1.1 mrg n0 = *--src;
165 1.1 mrg nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
166 1.1 mrg udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
167 1.1 mrg n1 = n0;
168 1.1 mrg }
169 1.1 mrg
170 1.1 mrg /* same as loop, but without n0 */
171 1.1 mrg nshift = n1 << norm;
172 1.1 mrg udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
173 1.1 mrg
174 1.1 mrg ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
175 1.1 mrg return r >> norm;
176 1.1 mrg }
177 1.1 mrg }
178