/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).
3 1.1 mrg
4 1.1 mrg Contributed to the GNU project by Marco Bodrato.
5 1.1 mrg
6 1.1 mrg The algorithm used here was inspired by ApproximateReciprocal from "Modern
7 1.1 mrg Computer Arithmetic", by Richard P. Brent and Paul Zimmermann. Special
8 1.1 mrg thanks to Paul Zimmermann for his very valuable suggestions on all the
9 1.1 mrg theoretical aspects during the work on this code.
10 1.1 mrg
11 1.1 mrg THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
12 1.1 mrg SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
13 1.1 mrg GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
14 1.1 mrg
15 1.1.1.3 mrg Copyright (C) 2007, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
16 1.1 mrg
17 1.1 mrg This file is part of the GNU MP Library.
18 1.1 mrg
19 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
20 1.1.1.3 mrg it under the terms of either:
21 1.1.1.3 mrg
22 1.1.1.3 mrg * the GNU Lesser General Public License as published by the Free
23 1.1.1.3 mrg Software Foundation; either version 3 of the License, or (at your
24 1.1.1.3 mrg option) any later version.
25 1.1.1.3 mrg
26 1.1.1.3 mrg or
27 1.1.1.3 mrg
28 1.1.1.3 mrg * the GNU General Public License as published by the Free Software
29 1.1.1.3 mrg Foundation; either version 2 of the License, or (at your option) any
30 1.1.1.3 mrg later version.
31 1.1.1.3 mrg
32 1.1.1.3 mrg or both in parallel, as here.
33 1.1 mrg
34 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
35 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
36 1.1.1.3 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
37 1.1.1.3 mrg for more details.
38 1.1 mrg
39 1.1.1.3 mrg You should have received copies of the GNU General Public License and the
40 1.1.1.3 mrg GNU Lesser General Public License along with the GNU MP Library. If not,
41 1.1.1.3 mrg see https://www.gnu.org/licenses/. */
42 1.1 mrg
43 1.1 mrg #include "gmp.h"
44 1.1 mrg #include "gmp-impl.h"
45 1.1 mrg #include "longlong.h"
46 1.1 mrg
47 1.1.1.3 mrg /* FIXME: The iterative version splits the operand in two slightly unbalanced
   parts, the use of log_2 (or counting the bits) underestimates the maximum
49 1.1 mrg number of iterations. */
50 1.1 mrg
/* NPOWS bounds the number of recursion sizes stored in the sizes[] array of
   mpn_ni_invertappr: each Newton step roughly halves the precision, so the
   count is limited by the bit width of mp_size_t (capped at 48 for very wide
   size types).  In a normal build the iteration stops at
   INV_NEWTON_THRESHOLD, so LOG2C of that threshold is subtracted; the tune
   program must cope with any threshold, hence the unreduced bound.  */
#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
/* When tuning, the divide-and-conquer divappr path must always be
   compiled in, since the thresholds are not fixed yet.  */
#define MAYBE_dcpi1_divappr   1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
/* dcpi1_divappr is only reachable from the basecase if the basecase can be
   entered with a size at or above DC_DIVAPPR_Q_THRESHOLD.  */
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
/* If every Newton-range size is already above the mulmod_bnm1 threshold,
   force the wraparound-product branch unconditionally.  */
#undef  INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif
66 1.1 mrg
67 1.1 mrg /* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take
68 1.1 mrg the strictly normalised value {dp,n} (i.e., most significant bit must be set)
69 1.1 mrg as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}.
70 1.1 mrg
71 1.1 mrg Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
72 1.1 mrg following conditions are satisfied by the output:
73 1.1 mrg 0 <= e <= 1;
74 1.1 mrg {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
75 1.1 mrg I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
76 1.1 mrg e=1 means that the result _may_ be one less than expected.
77 1.1 mrg
78 1.1 mrg The _bc version returns e=1 most of the time.
79 1.1 mrg The _ni version should return e=0 most of the time; only about 1% of
80 1.1 mrg possible random input should give e=1.
81 1.1 mrg
82 1.1 mrg When the strict result is needed, i.e., e=0 in the relation above:
83 1.1 mrg {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
84 1.1 mrg the function mpn_invert (ip, dp, n, scratch) should be used instead. */
85 1.1 mrg
86 1.1.1.3 mrg /* Maximum scratch needed by this branch (at xp): 2*n */
/* Basecase approximate reciprocal: compute {ip,n} such that the contract
   described above holds for e = the returned value.

   ip      result area, n limbs (the reciprocal, without the implicit B^n).
   dp      strictly normalised divisor, n limbs (high bit set, ASSERTed).
   n       operand size in limbs, n > 0.
   xp      scratch; this branch writes at most 2*n limbs at xp (plus the
	   divappr functions' in-place use of that area).

   Returns e in {0,1}: 0 means the result equals mpn_invert's; 1 means it
   may be one less (this branch returns 1 whenever the divappr path is
   taken, since divappr itself is only approximate).

   Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of r limbs. */
  if (n == 1)
    /* Single limb: the exact reciprocal primitive gives e = 0. */
    invert_limb (*ip, *dp);
  else {
    mp_size_t i;

    /* n > 1 here.  Fill the low n limbs of xp with B-1 ... */
    i = n;
    do
      xp[--i] = GMP_NUMB_MAX;
    while (i);
    /* ... and the high n limbs with the one's complement of {dp,n}. */
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */

    /* The reciprocal is the (approximate) quotient of that 2n-limb value
       by {dp,n}; its high limb is the implicit 1 that is discarded. */
    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      /* Exact division for the 2-limb case; e = 0 (fall through to
	 return 0 below). */
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
	  || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
	mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
	mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
      /* divappr may return a quotient up to one too LARGE; decrement so
	 the result is never above the true value, at the cost of e = 1. */
      MPN_DECR_U(ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}
128 1.1 mrg
129 1.1 mrg /* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
130 1.1 mrg iterations (at least one).
131 1.1 mrg
132 1.1 mrg Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
133 1.1 mrg Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
134 1.1 mrg in version 0.4 of the book.
135 1.1 mrg
136 1.1 mrg Some adaptations were introduced, to allow product mod B^m-1 and return the
137 1.1 mrg value e.
138 1.1 mrg
139 1.1.1.3 mrg We introduced a correction in such a way that "the value of
140 1.1.1.3 mrg B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
141 1.1.1.3 mrg "2B^n-1").
142 1.1.1.3 mrg
143 1.1.1.3 mrg Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn
144 1.1.1.3 mrg in the scratch, i.e. 3*rn <= 2*n: we require n>4.
145 1.1 mrg
146 1.1 mrg We use a wrapped product modulo B^m-1. NOTE: is there any normalisation
147 1.1 mrg problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
148 1.1 mrg B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can
149 1.1 mrg happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h =
150 1.1 mrg B^{n+h} - A, then we get into the "negative" branch, where X_h is not
151 1.1 mrg incremented (because A < B^n).
152 1.1 mrg
153 1.1 mrg FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
154 1.1.1.3 mrg is allocated apart.
155 1.1.1.3 mrg */
156 1.1 mrg
/* Newton-iteration approximate reciprocal (contract documented above).

   ip       result area, n limbs.
   dp       strictly normalised divisor, n limbs (high bit set, ASSERTed).
   n        operand size in limbs; must be > 4 (ASSERTed).
   scratch  caller-supplied scratch (aliased as xp below); the mulmod_bnm1
	    scratch is TMP-allocated separately (see FIXME above).

   Returns e in {0,1} as described in the contract comment. */
mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  Each step maps rn -> ceil(rn/2)+ (the "+1"
     gives the slightly unbalanced split mentioned in the FIXME above). */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));

  /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n}.
     Point dp/ip one past their areas so the high rn (later n) limbs are
     addressed as dp - rn, ip - rn. */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  /* Scratch for the wrapped (mod B^mn - 1) products, sized for the
     largest step; only needed when that branch can be taken at all. */
  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
  /* Use Newton's iterations to get the desired precision.*/

  while (1) {
    n = *--sizp;		/* precision of this step; rn = previous one */
    /*
       v    n  v
      +----+--+
      ^ rn ^
     */

    /* Compute i_jd . */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
	|| ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* Plain product branch: wraparound would not save anything here. */
      /* FIXME: We do only need {xp,n+1}*/
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate the carry*/
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
    }

    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
	/* Reduce by subtracting {dp,n} once or twice (fused as a
	   sub-of-left-shift when the native primitive exists). */
	if (mpn_cmp (xp, dp - n, n) > 0) {
	  mp_limb_t chk;
	  chk = mpn_sublsh1_n (xp, xp, dp - n, n);
	  ASSERT (chk == xp[n]);
	  ++ cy;
	} else
	  ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n*/
      if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
	/* First subtraction did not borrow: subtract once more. */
	ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
	++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
	ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
	++cy;
      } else
	ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n*/
      if (mpn_cmp (xp, dp - n, n) > 0) {
	ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
	++cy;
      }
      /* Store the high rn limbs of (D - T) at xp + 2n - rn for the
	 correction product below. */
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      /* Account for the cy subtractions in the inverse itself. */
      MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
      MPN_DECR_U(xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
	/* The inverse was one too small: bump it and compensate T. */
	MPN_INCR_U(ip - rn, rn, CNST_LIMB (1));
	ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      /* High rn limbs of B^n - 1 - T, by complementing. */
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    /* The new low n - rn limbs of the inverse land at ip - n. */
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}
291 1.1 mrg
292 1.1 mrg mp_limb_t
293 1.1 mrg mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
294 1.1 mrg {
295 1.1 mrg ASSERT (n > 0);
296 1.1 mrg ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
297 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
298 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
299 1.1 mrg ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
300 1.1 mrg
301 1.1 mrg if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
302 1.1.1.3 mrg return mpn_bc_invertappr (ip, dp, n, scratch);
303 1.1 mrg else
304 1.1.1.3 mrg return mpn_ni_invertappr (ip, dp, n, scratch);
305 1.1 mrg }
306