invertappr.c revision 1.1 1 1.1 mrg /* mpn_invertappr and helper functions. Compute I such that
2 1.1 mrg floor((B^{2n}-1)/U - 1 <= I + B^n <= floor((B^{2n}-1)/U.
3 1.1 mrg
4 1.1 mrg Contributed to the GNU project by Marco Bodrato.
5 1.1 mrg
6 1.1 mrg The algorithm used here was inspired by ApproximateReciprocal from "Modern
7 1.1 mrg Computer Arithmetic", by Richard P. Brent and Paul Zimmermann. Special
8 1.1 mrg thanks to Paul Zimmermann for his very valuable suggestions on all the
9 1.1 mrg theoretical aspects during the work on this code.
10 1.1 mrg
11 1.1 mrg THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
12 1.1 mrg SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
13 1.1 mrg GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
14 1.1 mrg
15 1.1 mrg Copyright (C) 2007, 2009, 2010 Free Software Foundation, Inc.
16 1.1 mrg
17 1.1 mrg This file is part of the GNU MP Library.
18 1.1 mrg
19 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
20 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
21 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
22 1.1 mrg option) any later version.
23 1.1 mrg
24 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
25 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
27 1.1 mrg License for more details.
28 1.1 mrg
29 1.1 mrg You should have received a copy of the GNU Lesser General Public License
30 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
31 1.1 mrg
32 1.1 mrg /* FIXME: Remove NULL and TMP_*, as soon as all the callers properly
33 1.1 mrg allocate and pass the scratch to the function. */
34 1.1 mrg #include <stdlib.h> /* for NULL */
35 1.1 mrg
36 1.1 mrg #include "gmp.h"
37 1.1 mrg #include "gmp-impl.h"
38 1.1 mrg #include "longlong.h"
39 1.1 mrg
40 1.1 mrg /* FIXME: The iterative version splits the operand in two slighty unbalanced
41 1.1 mrg parts, the use of log_2 (or counting the bits) underestimate the maximum
42 1.1 mrg number of iterations. */
43 1.1 mrg
44 1.1 mrg /* This is intended for constant THRESHOLDs only, where the compiler
45 1.1 mrg can completely fold the result. */
46 1.1 mrg #define LOG2C(n) \
47 1.1 mrg (((n) >= 0x1) + ((n) >= 0x2) + ((n) >= 0x4) + ((n) >= 0x8) + \
48 1.1 mrg ((n) >= 0x10) + ((n) >= 0x20) + ((n) >= 0x40) + ((n) >= 0x80) + \
49 1.1 mrg ((n) >= 0x100) + ((n) >= 0x200) + ((n) >= 0x400) + ((n) >= 0x800) + \
50 1.1 mrg ((n) >= 0x1000) + ((n) >= 0x2000) + ((n) >= 0x4000) + ((n) >= 0x8000))
51 1.1 mrg
52 1.1 mrg #if TUNE_PROGRAM_BUILD
53 1.1 mrg #define NPOWS \
54 1.1 mrg ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
55 1.1 mrg #define MAYBE_dcpi1_divappr 1
56 1.1 mrg #else
57 1.1 mrg #define NPOWS \
58 1.1 mrg ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
59 1.1 mrg #define MAYBE_dcpi1_divappr \
60 1.1 mrg (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
61 1.1 mrg #if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
62 1.1 mrg (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
63 1.1 mrg #undef INV_MULMOD_BNM1_THRESHOLD
64 1.1 mrg #define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
65 1.1 mrg #endif
66 1.1 mrg #endif
67 1.1 mrg
68 1.1 mrg /* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take
69 1.1 mrg the strictly normalised value {dp,n} (i.e., most significant bit must be set)
70 1.1 mrg as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}.
71 1.1 mrg
72 1.1 mrg Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
73 1.1 mrg following conditions are satisfied by the output:
74 1.1 mrg 0 <= e <= 1;
75 1.1 mrg {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
76 1.1 mrg I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
77 1.1 mrg e=1 means that the result _may_ be one less than expected.
78 1.1 mrg
79 1.1 mrg The _bc version returns e=1 most of the time.
80 1.1 mrg The _ni version should return e=0 most of the time; only about 1% of
81 1.1 mrg possible random input should give e=1.
82 1.1 mrg
83 1.1 mrg When the strict result is needed, i.e., e=0 in the relation above:
84 1.1 mrg {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
85 1.1 mrg the function mpn_invert (ip, dp, n, scratch) should be used instead. */
86 1.1 mrg
87 1.1 mrg /* Maximum scratch needed by this branch (at tp): 3*n + 2 */
88 1.1 mrg static mp_limb_t
89 1.1 mrg mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr tp)
90 1.1 mrg {
91 1.1 mrg mp_ptr xp;
92 1.1 mrg
93 1.1 mrg ASSERT (n > 0);
94 1.1 mrg ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
95 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
96 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, tp, mpn_invertappr_itch(n)));
97 1.1 mrg ASSERT (! MPN_OVERLAP_P (dp, n, tp, mpn_invertappr_itch(n)));
98 1.1 mrg
99 1.1 mrg /* Compute a base value of r limbs. */
100 1.1 mrg if (n == 1)
101 1.1 mrg invert_limb (*ip, *dp);
102 1.1 mrg else {
103 1.1 mrg mp_size_t i;
104 1.1 mrg xp = tp + n + 2; /* 2 * n limbs */
105 1.1 mrg
106 1.1 mrg for (i = n - 1; i >= 0; i--)
107 1.1 mrg xp[i] = GMP_NUMB_MAX;
108 1.1 mrg mpn_com (xp + n, dp, n);
109 1.1 mrg
110 1.1 mrg /* Now xp contains B^2n - {dp,n}*B^n - 1 */
111 1.1 mrg
112 1.1 mrg /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
113 1.1 mrg if (n == 2) {
114 1.1 mrg mpn_divrem_2 (ip, 0, xp, 4, dp);
115 1.1 mrg } else {
116 1.1 mrg gmp_pi1_t inv;
117 1.1 mrg invert_pi1 (inv, dp[n-1], dp[n-2]);
118 1.1 mrg if (! MAYBE_dcpi1_divappr
119 1.1 mrg || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
120 1.1 mrg mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
121 1.1 mrg else
122 1.1 mrg mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
123 1.1 mrg MPN_DECR_U(ip, n, 1);
124 1.1 mrg return 1;
125 1.1 mrg }
126 1.1 mrg }
127 1.1 mrg return 0;
128 1.1 mrg }
129 1.1 mrg
130 1.1 mrg /* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
131 1.1 mrg iterations (at least one).
132 1.1 mrg
133 1.1 mrg Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
134 1.1 mrg Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
135 1.1 mrg in version 0.4 of the book.
136 1.1 mrg
137 1.1 mrg Some adaptations were introduced, to allow product mod B^m-1 and return the
138 1.1 mrg value e.
139 1.1 mrg
140 1.1 mrg USE_MUL_N = 1 (default) introduces a correction in such a way that "the
141 1.1 mrg value of B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
142 1.1 mrg "2B^n-1"). This correction should not require to modify the proof.
143 1.1 mrg
144 1.1 mrg We use a wrapped product modulo B^m-1. NOTE: is there any normalisation
145 1.1 mrg problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
146 1.1 mrg B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can
147 1.1 mrg happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h =
148 1.1 mrg B^{n+h} - A, then we get into the "negative" branch, where X_h is not
149 1.1 mrg incremented (because A < B^n).
150 1.1 mrg
151 1.1 mrg FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
152 1.1 mrg is allocated apart. */
153 1.1 mrg
154 1.1 mrg #define USE_MUL_N 1
155 1.1 mrg
156 1.1 mrg mp_limb_t
157 1.1 mrg mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
158 1.1 mrg {
159 1.1 mrg mp_limb_t cy;
160 1.1 mrg mp_ptr xp;
161 1.1 mrg mp_size_t rn, mn;
162 1.1 mrg mp_size_t sizes[NPOWS], *sizp;
163 1.1 mrg mp_ptr tp;
164 1.1 mrg TMP_DECL;
165 1.1 mrg #define rp scratch
166 1.1 mrg
167 1.1 mrg ASSERT (n > 2);
168 1.1 mrg ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
169 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
170 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
171 1.1 mrg ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
172 1.1 mrg
173 1.1 mrg /* Compute the computation precisions from highest to lowest, leaving the
174 1.1 mrg base case size in 'rn'. */
175 1.1 mrg sizp = sizes;
176 1.1 mrg rn = n;
177 1.1 mrg do {
178 1.1 mrg *sizp = rn;
179 1.1 mrg rn = ((rn) >> 1) + 1;
180 1.1 mrg sizp ++;
181 1.1 mrg } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
182 1.1 mrg
183 1.1 mrg /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n} */
184 1.1 mrg dp += n;
185 1.1 mrg ip += n;
186 1.1 mrg
187 1.1 mrg /* Compute a base value of rn limbs. */
188 1.1 mrg mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);
189 1.1 mrg
190 1.1 mrg TMP_MARK;
191 1.1 mrg
192 1.1 mrg if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
193 1.1 mrg {
194 1.1 mrg mn = mpn_mulmod_bnm1_next_size (n + 1);
195 1.1 mrg tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
196 1.1 mrg }
197 1.1 mrg /* Use Newton's iterations to get the desired precision.*/
198 1.1 mrg
199 1.1 mrg /* define rp scratch; 2rn + 1 limbs <= 2(n>>1 + 1) + 1 <= n + 3 limbs */
200 1.1 mrg /* Maximum scratch needed by this branch <= 3*n + 2 */
201 1.1 mrg xp = scratch + n + 3; /* n + rn limbs */
202 1.1 mrg while (1) {
203 1.1 mrg mp_limb_t method;
204 1.1 mrg
205 1.1 mrg n = *--sizp;
206 1.1 mrg /*
207 1.1 mrg v n v
208 1.1 mrg +----+--+
209 1.1 mrg ^ rn ^
210 1.1 mrg */
211 1.1 mrg
212 1.1 mrg /* Compute i_jd . */
213 1.1 mrg if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
214 1.1 mrg || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
215 1.1 mrg /* FIXME: We do only need {xp,n+1}*/
216 1.1 mrg mpn_mul (xp, dp - n, n, ip - rn, rn);
217 1.1 mrg mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
218 1.1 mrg method = 1; /* Remember we used (truncated) product */
219 1.1 mrg /* We computed cy.{xp,rn+n} <- 1.{ip,rn} * 0.{dp,n} */
220 1.1 mrg } else { /* Use B^n-1 wraparound */
221 1.1 mrg mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
222 1.1 mrg /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
223 1.1 mrg /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
224 1.1 mrg /* Add dp*B^rn mod (B^mn-1) */
225 1.1 mrg ASSERT (n >= mn - rn);
226 1.1 mrg xp[mn] = 1 + mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
227 1.1 mrg cy = mpn_add_n (xp, xp, dp - (n - (mn - rn)), n - (mn - rn));
228 1.1 mrg MPN_INCR_U (xp + n - (mn - rn), mn + 1 - n + (mn - rn), cy);
229 1.1 mrg ASSERT (n + rn >= mn);
230 1.1 mrg /* Subtract B^{rn+n} */
231 1.1 mrg MPN_DECR_U (xp + rn + n - mn, 2*mn + 1 - rn - n, 1);
232 1.1 mrg if (xp[mn])
233 1.1 mrg MPN_INCR_U (xp, mn, xp[mn] - 1);
234 1.1 mrg else
235 1.1 mrg MPN_DECR_U (xp, mn, 1);
236 1.1 mrg method = 0; /* Remember we are working Mod B^m-1 */
237 1.1 mrg }
238 1.1 mrg
239 1.1 mrg if (xp[n] < 2) { /* "positive" residue class */
240 1.1 mrg cy = 1;
241 1.1 mrg while (xp[n] || mpn_cmp (xp, dp - n, n)>0) {
242 1.1 mrg xp[n] -= mpn_sub_n (xp, xp, dp - n, n);
243 1.1 mrg cy ++;
244 1.1 mrg }
245 1.1 mrg MPN_DECR_U(ip - rn, rn, cy);
246 1.1 mrg ASSERT (cy <= 4); /* at most 3 cycles for the while above */
247 1.1 mrg ASSERT_NOCARRY (mpn_sub_n (xp, dp - n, xp, n));
248 1.1 mrg ASSERT (xp[n] == 0);
249 1.1 mrg } else { /* "negative" residue class */
250 1.1 mrg mpn_com (xp, xp, n + 1);
251 1.1 mrg MPN_INCR_U(xp, n + 1, method);
252 1.1 mrg ASSERT (xp[n] <= 1);
253 1.1 mrg #if USE_MUL_N
254 1.1 mrg if (xp[n]) {
255 1.1 mrg MPN_INCR_U(ip - rn, rn, 1);
256 1.1 mrg ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
257 1.1 mrg }
258 1.1 mrg #endif
259 1.1 mrg }
260 1.1 mrg
261 1.1 mrg /* Compute x_ju_j. FIXME:We need {rp+rn,rn}, mulhi? */
262 1.1 mrg #if USE_MUL_N
263 1.1 mrg mpn_mul_n (rp, xp + n - rn, ip - rn, rn);
264 1.1 mrg #else
265 1.1 mrg rp[2*rn] = 0;
266 1.1 mrg mpn_mul (rp, xp + n - rn, rn + xp[n], ip - rn, rn);
267 1.1 mrg #endif
268 1.1 mrg /* We need _only_ the carry from the next addition */
269 1.1 mrg /* Anyway 2rn-n <= 2... we don't need to optimise. */
270 1.1 mrg cy = mpn_add_n (rp + rn, rp + rn, xp + n - rn, 2*rn - n);
271 1.1 mrg cy = mpn_add_nc (ip - n, rp + 3*rn - n, xp + rn, n - rn, cy);
272 1.1 mrg MPN_INCR_U (ip - rn, rn, cy + (1-USE_MUL_N)*(rp[2*rn] + xp[n]));
273 1.1 mrg if (sizp == sizes) { /* Get out of the cycle */
274 1.1 mrg /* Check for possible carry propagation from below. */
275 1.1 mrg cy = rp[3*rn - n - 1] > GMP_NUMB_MAX - 7; /* Be conservative. */
276 1.1 mrg /* cy = mpn_add_1 (rp + rn, rp + rn, 2*rn - n, 4); */
277 1.1 mrg break;
278 1.1 mrg }
279 1.1 mrg rn = n;
280 1.1 mrg }
281 1.1 mrg TMP_FREE;
282 1.1 mrg
283 1.1 mrg return cy;
284 1.1 mrg #undef rp
285 1.1 mrg }
286 1.1 mrg
287 1.1 mrg mp_limb_t
288 1.1 mrg mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
289 1.1 mrg {
290 1.1 mrg mp_limb_t res;
291 1.1 mrg TMP_DECL;
292 1.1 mrg
293 1.1 mrg TMP_MARK;
294 1.1 mrg
295 1.1 mrg if (scratch == NULL)
296 1.1 mrg scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
297 1.1 mrg
298 1.1 mrg ASSERT (n > 0);
299 1.1 mrg ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
300 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
301 1.1 mrg ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
302 1.1 mrg ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
303 1.1 mrg
304 1.1 mrg if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
305 1.1 mrg res = mpn_bc_invertappr (ip, dp, n, scratch);
306 1.1 mrg else
307 1.1 mrg res = mpn_ni_invertappr (ip, dp, n, scratch);
308 1.1 mrg
309 1.1 mrg TMP_FREE;
310 1.1 mrg return res;
311 1.1 mrg }
312