/* Interpolation for the algorithm Toom-Cook 6.5-way.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
36
37
38 #include "gmp-impl.h"
39
40
41 #if HAVE_NATIVE_mpn_sublsh_n
42 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
43 #else
44 static mp_limb_t
45 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
46 {
47 #if USE_MUL_1 && 0
48 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
49 #else
50 mp_limb_t __cy;
51 __cy = mpn_lshift(ws,src,n,s);
52 return __cy + mpn_sub_n(dst,dst,ws,n);
53 #endif
54 }
55 #endif
56
57 #if HAVE_NATIVE_mpn_addlsh_n
58 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
59 #else
60 static mp_limb_t
61 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
62 {
63 #if USE_MUL_1 && 0
64 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
65 #else
66 mp_limb_t __cy;
67 __cy = mpn_lshift(ws,src,n,s);
68 return __cy + mpn_add_n(dst,dst,ws,n);
69 #endif
70 }
71 #endif
72
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* {dst,nd} -= {src,ns} >> s, with nd >= ns and 0 < s < GMP_NUMB_BITS.
   The low limb's shifted-out bits are handled by MPN_DECR_U, the rest
   via a sublsh by (GMP_NUMB_BITS - s) of the upper ns-1 limbs.
   FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif
85
86
/* The interpolation below shifts by up to 20 bits and divides exactly by
   42525; reject limb sizes too narrow for the code as written.  */
#if GMP_NUMB_BITS < 21
#error Not implemented: Both sublsh_n(,,,20) should be corrected.
#endif

#if GMP_NUMB_BITS < 16
#error Not implemented: divexact_by42525 needs splitting.
#endif

#if GMP_NUMB_BITS < 12
#error Not implemented: Hard to adapt...
#endif

/* Each flag selects a single mpn_{add,sub}mul_1 call over the equivalent
   sequence of plain add/sub and addlsh/sublsh operations used in its
   #else branch below; all default to the *mul_1 form.
   FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif
112
/* Binary (2-adic) inverses of the odd divisors used by the exact
   divisions below, as needed by mpn_pi1_bdiv_q_1.  The 9 and 255
   inverses are expressed in terms of GMP_NUMB_MAX so they adapt to any
   limb size allowed by the #error guards above.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

/* Hard-coded inverses for the two common limb widths.
   FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif
129
/* Exact-division helpers: each divides {src,size} by the fixed constant,
   writing the quotient to {dst,size}.  The best available primitive is
   picked at compile time: mpn_bdiv_dbm1 (for 255 when the limb size is a
   multiple of 8 bits), mpn_pi1_bdiv_q_1 with a precomputed inverse, or
   generic mpn_divexact_1 as a fallback.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

/* Divide by 36 = 9 * 2^2; the trailing-zero count 2 is passed to
   mpn_pi1_bdiv_q_1 as its shift argument.  */
#ifndef mpn_divexact_by9x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

/* Divide by 11340 = 2835 * 2^2.  */
#ifndef mpn_divexact_by2835x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif
166
/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0.  More precisely,
   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (rsp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4),f(-4),
     r2 = f(2),f(-2),
     r3 = f(1),f(-1),
     r4 = f(1/4),f(-1/4),
     r5 = f(1/2),f(-1/2),
     r6 = f(0).

   All couples of the form f(n),f(-n) must be already mixed with
   toom_couple_handling(f(n),...,f(-n),...)

   The result is stored in {pp, spt + 7*n (or 6*n)}.
   At entry, r6 is stored at {pp, 2n},
   r4 is stored at {pp + 3n, 3n + 1}.
   r2 is stored at {pp + 7n, 3n + 1}.
   r0 is stored at {pp +11n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored two-complemented.
   Inputs are destroyed.
*/
194
void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			    mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

  /* r4, r2, r0 live inside the product area pp; r1, r3, r5 and the
     scratch wsi are separate 3n+1-limb buffers (see the header comment
     above the function for the full layout).  */
#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  /* With half != 0 (Toom-6.5) the leading coefficient r0 is present and
     must first be eliminated from the mixed evaluations: f(1) gets r0,
     f(2)/f(4) get r0 scaled by 2^10 rsp. 2^20 (the couples were
     premultiplied by toom_couple_handling), f(1/2)/f(1/4) get r0 shifted
     right by 2 rsp. 4 bits.  */
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  /* Eliminate the constant coefficient r6 = f(0) = {pp,2n} from the
     r4/r1 pair, then form their sum and difference.  */
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  /* Sum into scratch, difference in place, then swap the pointers so r1
     names the sum as in the native branch.  */
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  /* Same for the r5/r2 pair (shift amounts 10 and 2).  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  /* Eliminate r6 from r3 as well.  */
  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

  /* r4 -= 257 * r5; 257 = 1 + 2^8, hence the add/sublsh alternative.  */
#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  /* Sign-extend the top limb of the quotient: for a negative operand the
     exact division leaves the high bits short by two; if any of the top
     three bits are set, force the top two bits on.  */
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

  /* r5 += 60 * r4; 60 = 2^6 - 2^2, hence the sublsh/addlsh alternative.  */
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  /* r2 -= 32 * r3.  */
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

  /* r1 -= 100 * r2; 100 = 2^6 + 2^5 + 2^2.  */
#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  /* r1 -= 512 * r3, then divide exactly by 42525.  */
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

  /* r2 -= 225 * r1; 225 = 1 - 2^5 + 2^8 with signs arranged so no
     intermediate goes negative.  */
#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  /* r4 = (r2 - r4) / 2; the rsh1sub primitive fuses the two steps, its
     result needs the top bit masked off.  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  /* r5 = (r5 + r1) / 2.  */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r5, r5, r1, n3p1);
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  /* Add r5 at offset n: low n limbs into pp+n, middle n limbs overwrite
     the gap at pp+2n (add_1 carries only cy in), high part added at
     pp+3n with the carry rippled onward.  */
  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  /* Add r3 at offset 5n, same pattern.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r1 at offset 9n.  Only spt (<= 2n) limbs of r0/result remain
     above pp+10n, and with half == 0 (Toom-6) r1's high part is absent,
     so the tail is trimmed accordingly.  */
  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}
371