mulmid.c revision 1.1.1.2 1 1.1 mrg /* mpn_mulmid -- middle product
2 1.1 mrg
3 1.1 mrg Contributed by David Harvey.
4 1.1 mrg
5 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 1.1 mrg GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 1.1 mrg
9 1.1 mrg Copyright 2011 Free Software Foundation, Inc.
10 1.1 mrg
11 1.1 mrg This file is part of the GNU MP Library.
12 1.1 mrg
13 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
14 1.1.1.2 mrg it under the terms of either:
15 1.1.1.2 mrg
16 1.1.1.2 mrg * the GNU Lesser General Public License as published by the Free
17 1.1.1.2 mrg Software Foundation; either version 3 of the License, or (at your
18 1.1.1.2 mrg option) any later version.
19 1.1.1.2 mrg
20 1.1.1.2 mrg or
21 1.1.1.2 mrg
22 1.1.1.2 mrg * the GNU General Public License as published by the Free Software
23 1.1.1.2 mrg Foundation; either version 2 of the License, or (at your option) any
24 1.1.1.2 mrg later version.
25 1.1.1.2 mrg
26 1.1.1.2 mrg or both in parallel, as here.
27 1.1 mrg
28 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
29 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 1.1.1.2 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 1.1.1.2 mrg for more details.
32 1.1 mrg
33 1.1.1.2 mrg You should have received copies of the GNU General Public License and the
34 1.1.1.2 mrg GNU Lesser General Public License along with the GNU MP Library. If not,
35 1.1.1.2 mrg see https://www.gnu.org/licenses/. */
36 1.1 mrg
37 1.1 mrg
38 1.1 mrg #include "gmp.h"
39 1.1 mrg #include "gmp-impl.h"
40 1.1 mrg
41 1.1 mrg
42 1.1 mrg #define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
43 1.1 mrg
44 1.1 mrg
45 1.1 mrg void
46 1.1 mrg mpn_mulmid (mp_ptr rp,
47 1.1 mrg mp_srcptr ap, mp_size_t an,
48 1.1 mrg mp_srcptr bp, mp_size_t bn)
49 1.1 mrg {
50 1.1 mrg mp_size_t rn, k;
51 1.1 mrg mp_ptr scratch, temp;
52 1.1 mrg
53 1.1 mrg ASSERT (an >= bn);
54 1.1 mrg ASSERT (bn >= 1);
55 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
56 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
57 1.1 mrg
58 1.1 mrg if (bn < MULMID_TOOM42_THRESHOLD)
59 1.1 mrg {
60 1.1 mrg /* region not tall enough to make toom42 worthwhile for any portion */
61 1.1 mrg
62 1.1 mrg if (an < CHUNK)
63 1.1 mrg {
64 1.1 mrg /* region not too wide either, just call basecase directly */
65 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
66 1.1 mrg return;
67 1.1 mrg }
68 1.1 mrg
69 1.1 mrg /* Region quite wide. For better locality, use basecase on chunks:
70 1.1 mrg
71 1.1 mrg AAABBBCC..
72 1.1 mrg .AAABBBCC.
73 1.1 mrg ..AAABBBCC
74 1.1 mrg */
75 1.1 mrg
76 1.1 mrg k = CHUNK - bn + 1; /* number of diagonals per chunk */
77 1.1 mrg
78 1.1 mrg /* first chunk (marked A in the above diagram) */
79 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
80 1.1 mrg
81 1.1 mrg /* remaining chunks (B, C, etc) */
82 1.1 mrg an -= k;
83 1.1 mrg
84 1.1 mrg while (an >= CHUNK)
85 1.1 mrg {
86 1.1 mrg mp_limb_t t0, t1, cy;
87 1.1 mrg ap += k, rp += k;
88 1.1 mrg t0 = rp[0], t1 = rp[1];
89 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
90 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
91 1.1 mrg MPN_INCR_U (rp + 1, k + 1, t1 + cy);
92 1.1 mrg an -= k;
93 1.1 mrg }
94 1.1 mrg
95 1.1 mrg if (an >= bn)
96 1.1 mrg {
97 1.1 mrg /* last remaining chunk */
98 1.1 mrg mp_limb_t t0, t1, cy;
99 1.1 mrg ap += k, rp += k;
100 1.1 mrg t0 = rp[0], t1 = rp[1];
101 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
102 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0);
103 1.1 mrg MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
104 1.1 mrg }
105 1.1 mrg
106 1.1 mrg return;
107 1.1 mrg }
108 1.1 mrg
109 1.1 mrg /* region is tall enough for toom42 */
110 1.1 mrg
111 1.1 mrg rn = an - bn + 1;
112 1.1 mrg
113 1.1 mrg if (rn < MULMID_TOOM42_THRESHOLD)
114 1.1 mrg {
115 1.1 mrg /* region not wide enough to make toom42 worthwhile for any portion */
116 1.1 mrg
117 1.1 mrg TMP_DECL;
118 1.1 mrg
119 1.1 mrg if (bn < CHUNK)
120 1.1 mrg {
121 1.1 mrg /* region not too tall either, just call basecase directly */
122 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
123 1.1 mrg return;
124 1.1 mrg }
125 1.1 mrg
126 1.1 mrg /* Region quite tall. For better locality, use basecase on chunks:
127 1.1 mrg
128 1.1 mrg AAAAA....
129 1.1 mrg .AAAAA...
130 1.1 mrg ..BBBBB..
131 1.1 mrg ...BBBBB.
132 1.1 mrg ....CCCCC
133 1.1 mrg */
134 1.1 mrg
135 1.1 mrg TMP_MARK;
136 1.1 mrg
137 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2);
138 1.1 mrg
139 1.1 mrg /* first chunk (marked A in the above diagram) */
140 1.1 mrg bp += bn - CHUNK, an -= bn - CHUNK;
141 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
142 1.1 mrg
143 1.1 mrg /* remaining chunks (B, C, etc) */
144 1.1 mrg bn -= CHUNK;
145 1.1 mrg
146 1.1 mrg while (bn >= CHUNK)
147 1.1 mrg {
148 1.1 mrg ap += CHUNK, bp -= CHUNK;
149 1.1 mrg mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
150 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
151 1.1 mrg bn -= CHUNK;
152 1.1 mrg }
153 1.1 mrg
154 1.1 mrg if (bn)
155 1.1 mrg {
156 1.1 mrg /* last remaining chunk */
157 1.1 mrg ap += CHUNK, bp -= bn;
158 1.1 mrg mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
159 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
160 1.1 mrg }
161 1.1 mrg
162 1.1 mrg TMP_FREE;
163 1.1 mrg return;
164 1.1 mrg }
165 1.1 mrg
166 1.1 mrg /* we're definitely going to use toom42 somewhere */
167 1.1 mrg
168 1.1 mrg if (bn > rn)
169 1.1 mrg {
170 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly
171 1.1 mrg the last:
172 1.1 mrg
173 1.1 mrg AA....
174 1.1 mrg .AA...
175 1.1 mrg ..BB..
176 1.1 mrg ...BB.
177 1.1 mrg ....CC
178 1.1 mrg */
179 1.1 mrg
180 1.1 mrg TMP_DECL;
181 1.1 mrg TMP_MARK;
182 1.1 mrg
183 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
184 1.1 mrg scratch = temp + rn + 2;
185 1.1 mrg
186 1.1 mrg /* first chunk (marked A in the above diagram) */
187 1.1 mrg bp += bn - rn;
188 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
189 1.1 mrg
190 1.1 mrg /* remaining chunks (B, C, etc) */
191 1.1 mrg bn -= rn;
192 1.1 mrg
193 1.1 mrg while (bn >= rn)
194 1.1 mrg {
195 1.1 mrg ap += rn, bp -= rn;
196 1.1 mrg mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
197 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
198 1.1 mrg bn -= rn;
199 1.1 mrg }
200 1.1 mrg
201 1.1 mrg if (bn)
202 1.1 mrg {
203 1.1 mrg /* last remaining chunk */
204 1.1 mrg ap += rn, bp -= bn;
205 1.1 mrg mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
206 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
207 1.1 mrg }
208 1.1 mrg
209 1.1 mrg TMP_FREE;
210 1.1 mrg }
211 1.1 mrg else
212 1.1 mrg {
213 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly
214 1.1 mrg the last:
215 1.1 mrg
216 1.1 mrg AAABBBCC..
217 1.1 mrg .AAABBBCC.
218 1.1 mrg ..AAABBBCC
219 1.1 mrg */
220 1.1 mrg
221 1.1 mrg TMP_DECL;
222 1.1 mrg TMP_MARK;
223 1.1 mrg
224 1.1 mrg scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
225 1.1 mrg
226 1.1 mrg /* first chunk (marked A in the above diagram) */
227 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
228 1.1 mrg
229 1.1 mrg /* remaining chunks (B, C, etc) */
230 1.1 mrg rn -= bn;
231 1.1 mrg
232 1.1 mrg while (rn >= bn)
233 1.1 mrg {
234 1.1 mrg mp_limb_t t0, t1, cy;
235 1.1 mrg ap += bn, rp += bn;
236 1.1 mrg t0 = rp[0], t1 = rp[1];
237 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
238 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
239 1.1 mrg MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
240 1.1 mrg rn -= bn;
241 1.1 mrg }
242 1.1 mrg
243 1.1 mrg TMP_FREE;
244 1.1 mrg
245 1.1 mrg if (rn)
246 1.1 mrg {
247 1.1 mrg /* last remaining chunk */
248 1.1 mrg mp_limb_t t0, t1, cy;
249 1.1 mrg ap += bn, rp += bn;
250 1.1 mrg t0 = rp[0], t1 = rp[1];
251 1.1 mrg mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
252 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0);
253 1.1 mrg MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
254 1.1 mrg }
255 1.1 mrg }
256 1.1 mrg }
257