toom3_sqr.c revision 1.1.1.1 1 1.1 mrg /* mpn_toom3_sqr -- Square {ap,an}.
2 1.1 mrg
3 1.1 mrg Contributed to the GNU project by Torbjorn Granlund.
4 1.1 mrg Additional improvements by Marco Bodrato.
5 1.1 mrg
6 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 1.1 mrg GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 1.1 mrg
10 1.1 mrg Copyright 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
11 1.1 mrg
12 1.1 mrg This file is part of the GNU MP Library.
13 1.1 mrg
14 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
15 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
16 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
17 1.1 mrg option) any later version.
18 1.1 mrg
19 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
20 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 1.1 mrg License for more details.
23 1.1 mrg
24 1.1 mrg You should have received a copy of the GNU Lesser General Public License
25 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
26 1.1 mrg
27 1.1 mrg
28 1.1 mrg #include "gmp.h"
29 1.1 mrg #include "gmp-impl.h"
30 1.1 mrg
31 1.1 mrg /* Evaluate in: -1, 0, +1, +2, +inf
32 1.1 mrg
33 1.1 mrg <-s--><--n--><--n-->
34 1.1 mrg ____ ______ ______
35 1.1 mrg |_a2_|___a1_|___a0_|
36 1.1 mrg
37 1.1 mrg v0 = a0 ^2 # A(0)^2
38 1.1 mrg v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2
39 1.1 mrg vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1
40 1.1 mrg v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6
41 1.1 mrg vinf= a2 ^2 # A(inf)^2
42 1.1 mrg */
43 1.1 mrg
44 1.1 mrg #if TUNE_PROGRAM_BUILD
45 1.1 mrg #define MAYBE_sqr_basecase 1
46 1.1 mrg #define MAYBE_sqr_toom3 1
47 1.1 mrg #else
48 1.1 mrg #define MAYBE_sqr_basecase \
49 1.1 mrg (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
50 1.1 mrg #define MAYBE_sqr_toom3 \
51 1.1 mrg (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
52 1.1 mrg #endif
53 1.1 mrg
54 1.1 mrg #define TOOM3_SQR_REC(p, a, n, ws) \
55 1.1 mrg do { \
56 1.1 mrg if (MAYBE_sqr_basecase \
57 1.1 mrg && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
58 1.1 mrg mpn_sqr_basecase (p, a, n); \
59 1.1 mrg else if (! MAYBE_sqr_toom3 \
60 1.1 mrg || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
61 1.1 mrg mpn_toom2_sqr (p, a, n, ws); \
62 1.1 mrg else \
63 1.1 mrg mpn_toom3_sqr (p, a, n, ws); \
64 1.1 mrg } while (0)
65 1.1 mrg
66 1.1 mrg void
67 1.1 mrg mpn_toom3_sqr (mp_ptr pp,
68 1.1 mrg mp_srcptr ap, mp_size_t an,
69 1.1 mrg mp_ptr scratch)
70 1.1 mrg {
71 1.1 mrg mp_size_t n, s;
72 1.1 mrg mp_limb_t cy, vinf0;
73 1.1 mrg mp_ptr gp;
74 1.1 mrg mp_ptr as1, asm1, as2;
75 1.1 mrg
76 1.1 mrg #define a0 ap
77 1.1 mrg #define a1 (ap + n)
78 1.1 mrg #define a2 (ap + 2*n)
79 1.1 mrg
80 1.1 mrg n = (an + 2) / (size_t) 3;
81 1.1 mrg
82 1.1 mrg s = an - 2 * n;
83 1.1 mrg
84 1.1 mrg ASSERT (0 < s && s <= n);
85 1.1 mrg
86 1.1 mrg as1 = scratch + 4 * n + 4;
87 1.1 mrg asm1 = scratch + 2 * n + 2;
88 1.1 mrg as2 = pp + n + 1;
89 1.1 mrg
90 1.1 mrg gp = scratch;
91 1.1 mrg
92 1.1 mrg /* Compute as1 and asm1. */
93 1.1 mrg cy = mpn_add (gp, a0, n, a2, s);
94 1.1 mrg #if HAVE_NATIVE_mpn_add_n_sub_n
95 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
96 1.1 mrg {
97 1.1 mrg cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
98 1.1 mrg as1[n] = cy >> 1;
99 1.1 mrg asm1[n] = 0;
100 1.1 mrg }
101 1.1 mrg else
102 1.1 mrg {
103 1.1 mrg mp_limb_t cy2;
104 1.1 mrg cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
105 1.1 mrg as1[n] = cy + (cy2 >> 1);
106 1.1 mrg asm1[n] = cy - (cy2 & 1);
107 1.1 mrg }
108 1.1 mrg #else
109 1.1 mrg as1[n] = cy + mpn_add_n (as1, gp, a1, n);
110 1.1 mrg if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
111 1.1 mrg {
112 1.1 mrg mpn_sub_n (asm1, a1, gp, n);
113 1.1 mrg asm1[n] = 0;
114 1.1 mrg }
115 1.1 mrg else
116 1.1 mrg {
117 1.1 mrg cy -= mpn_sub_n (asm1, gp, a1, n);
118 1.1 mrg asm1[n] = cy;
119 1.1 mrg }
120 1.1 mrg #endif
121 1.1 mrg
122 1.1 mrg /* Compute as2. */
123 1.1 mrg #if HAVE_NATIVE_mpn_rsblsh1_n
124 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
125 1.1 mrg if (s != n)
126 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
127 1.1 mrg cy += as1[n];
128 1.1 mrg cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
129 1.1 mrg #else
130 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
131 1.1 mrg cy = mpn_addlsh1_n (as2, a1, a2, s);
132 1.1 mrg if (s != n)
133 1.1 mrg cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
134 1.1 mrg cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
135 1.1 mrg #else
136 1.1 mrg cy = mpn_add_n (as2, a2, as1, s);
137 1.1 mrg if (s != n)
138 1.1 mrg cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
139 1.1 mrg cy += as1[n];
140 1.1 mrg cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
141 1.1 mrg cy -= mpn_sub_n (as2, as2, a0, n);
142 1.1 mrg #endif
143 1.1 mrg #endif
144 1.1 mrg as2[n] = cy;
145 1.1 mrg
146 1.1 mrg ASSERT (as1[n] <= 2);
147 1.1 mrg ASSERT (asm1[n] <= 1);
148 1.1 mrg
149 1.1 mrg #define v0 pp /* 2n */
150 1.1 mrg #define v1 (pp + 2 * n) /* 2n+1 */
151 1.1 mrg #define vinf (pp + 4 * n) /* s+s */
152 1.1 mrg #define vm1 scratch /* 2n+1 */
153 1.1 mrg #define v2 (scratch + 2 * n + 1) /* 2n+2 */
154 1.1 mrg #define scratch_out (scratch + 5 * n + 5)
155 1.1 mrg
156 1.1 mrg /* vm1, 2n+1 limbs */
157 1.1 mrg #ifdef SMALLER_RECURSION
158 1.1 mrg TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
159 1.1 mrg cy = 0;
160 1.1 mrg if (asm1[n] != 0)
161 1.1 mrg cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
162 1.1 mrg if (asm1[n] != 0)
163 1.1 mrg cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
164 1.1 mrg vm1[2 * n] = cy;
165 1.1 mrg #else
166 1.1 mrg TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
167 1.1 mrg #endif
168 1.1 mrg
169 1.1 mrg TOOM3_SQR_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */
170 1.1 mrg
171 1.1 mrg TOOM3_SQR_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */
172 1.1 mrg
173 1.1 mrg vinf0 = vinf[0]; /* v1 overlaps with this */
174 1.1 mrg
175 1.1 mrg #ifdef SMALLER_RECURSION
176 1.1 mrg /* v1, 2n+1 limbs */
177 1.1 mrg TOOM3_SQR_REC (v1, as1, n, scratch_out);
178 1.1 mrg if (as1[n] == 1)
179 1.1 mrg {
180 1.1 mrg cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
181 1.1 mrg }
182 1.1 mrg else if (as1[n] != 0)
183 1.1 mrg {
184 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
185 1.1 mrg cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
186 1.1 mrg #else
187 1.1 mrg cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
188 1.1 mrg #endif
189 1.1 mrg }
190 1.1 mrg else
191 1.1 mrg cy = 0;
192 1.1 mrg if (as1[n] == 1)
193 1.1 mrg {
194 1.1 mrg cy += mpn_add_n (v1 + n, v1 + n, as1, n);
195 1.1 mrg }
196 1.1 mrg else if (as1[n] != 0)
197 1.1 mrg {
198 1.1 mrg #if HAVE_NATIVE_mpn_addlsh1_n
199 1.1 mrg cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
200 1.1 mrg #else
201 1.1 mrg cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
202 1.1 mrg #endif
203 1.1 mrg }
204 1.1 mrg v1[2 * n] = cy;
205 1.1 mrg #else
206 1.1 mrg cy = vinf[1];
207 1.1 mrg TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
208 1.1 mrg vinf[1] = cy;
209 1.1 mrg #endif
210 1.1 mrg
211 1.1 mrg TOOM3_SQR_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */
212 1.1 mrg
213 1.1 mrg mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
214 1.1 mrg }
215