/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008, 2009, 2010, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2  #   A(0)^2
  v1  = (a0+ a1+ a2)^2  #   A(1)^2   ah  <= 2
  vm1 = (a0- a1+ a2)^2  #  A(-1)^2  |ah| <= 1
  v2  = (a0+2a1+4a2)^2  #   A(2)^2   ah  <= 6
  vinf=          a2 ^2  # A(inf)^2
*/
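
/* With b = 2^(GMP_NUMB_BITS * n) the input is A(b) = a0 + a1*b + a2*b^2, and
   the square B(x) = A(x)^2 has degree 4, so the five values above determine
   its coefficients; mpn_toom_interpolate_5pts recovers them and assembles
   B(b) = {pp, 2*an}.  */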

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3 1
#else
#define MAYBE_sqr_basecase \
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3 \
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif

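/* TOOM3_SQR_REC dispatches the recursive squarings: basecase below
   SQR_TOOM2_THRESHOLD, Toom-2 below SQR_TOOM3_THRESHOLD, else Toom-3.  The
   MAYBE_* flags let the compiler discard branches that the configured
   thresholds make unreachable.  */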
#define TOOM3_SQR_REC(p, a, n, ws)                                      \
  do {                                                                  \
    if (MAYBE_sqr_basecase                                              \
        && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))                    \
      mpn_sqr_basecase (p, a, n);                                       \
    else if (! MAYBE_sqr_toom3                                          \
             || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))               \
      mpn_toom2_sqr (p, a, n, ws);                                      \
    else                                                                \
      mpn_toom3_sqr (p, a, n, ws);                                      \
  } while (0)

void
mpn_toom3_sqr (mp_ptr pp,
               mp_srcptr ap, mp_size_t an,
               mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;

  ASSERT (0 < s && s <= n);

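  /* Each evaluated operand needs n+1 limbs: as1 and asm1 are built in
     scratch, while as2 is built temporarily in the product area pp, where it
     is consumed before that space is overwritten by v0 and v1.  */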
  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  gp = scratch;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif
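
  /* Here gp = a0 + a2, as1 = a0 + a1 + a2 = A(1), and asm1 = |a0 - a1 + a2| =
     |A(-1)|; the sign of A(-1) is irrelevant since only its square is
     used.  */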

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;
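
  /* All three variants compute as2 = a0 + 2*a1 + 4*a2 = A(2) in n+1 limbs.  */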

  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);

#define v0    pp                          /* 2n */
#define v1    (pp + 2 * n)                /* 2n+1 */
#define vinf  (pp + 4 * n)                /* s+s */
#define vm1   scratch                     /* 2n+1 */
#define v2    (scratch + 2 * n + 1)       /* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)
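
  /* v0, v1 and vinf are built directly in the product area pp; vm1 and v2 in
     scratch; scratch_out is the scratch space passed down to the recursive
     TOOM3_SQR_REC calls.  */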

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
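  /* Fold in the top limb t = asm1[n] (0 or 1): the full square needs
     2*t*{asm1, n} added at position n and t^2 = t at position 2n, hence the
     two conditional additions of asm1 below.  */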
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
#endif

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);  /* v2, 2n+1 limbs */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);     /* vinf, s+s limbs */

  vinf0 = vinf[0];                              /* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
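  /* Fold in the top limb h = as1[n] (at most 2): the full square needs
     2*h*{as1, n} added at position n and h^2 at position 2n; each of the two
     passes below adds h*{as1, n}.  */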
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
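  /* Squaring the n+1 limbs of as1 writes 2*n+2 limbs at v1, clobbering
     vinf[0] (already saved in vinf0) and vinf[1], which is saved and restored
     here.  */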
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM3_SQR_REC (v0, ap, n, scratch_out);       /* v0, 2n limbs */

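  /* Interpolate: combine v0, v1, vm1, v2 and vinf into the 2*an-limb square
     at pp.  The 0 argument records that vm1 is non-negative (it is a square),
     and vinf0 supplies the limb of vinf that v1 overwrote.  */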
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}