mulmid.c revision 1.1 1 1.1 mrg /* mpn_mulmid -- middle product
2 1.1 mrg
3 1.1 mrg Contributed by David Harvey.
4 1.1 mrg
5 1.1 mrg THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 1.1 mrg SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 1.1 mrg GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 1.1 mrg
9 1.1 mrg Copyright 2011 Free Software Foundation, Inc.
10 1.1 mrg
11 1.1 mrg This file is part of the GNU MP Library.
12 1.1 mrg
13 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify
14 1.1 mrg it under the terms of the GNU Lesser General Public License as published by
15 1.1 mrg the Free Software Foundation; either version 3 of the License, or (at your
16 1.1 mrg option) any later version.
17 1.1 mrg
18 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but
19 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 1.1 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 1.1 mrg License for more details.
22 1.1 mrg
23 1.1 mrg You should have received a copy of the GNU Lesser General Public License
24 1.1 mrg along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25 1.1 mrg
26 1.1 mrg
27 1.1 mrg #include "gmp.h"
28 1.1 mrg #include "gmp-impl.h"
29 1.1 mrg
30 1.1 mrg
31 1.1 mrg #define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
32 1.1 mrg
33 1.1 mrg
34 1.1 mrg void
35 1.1 mrg mpn_mulmid (mp_ptr rp,
36 1.1 mrg mp_srcptr ap, mp_size_t an,
37 1.1 mrg mp_srcptr bp, mp_size_t bn)
38 1.1 mrg {
39 1.1 mrg mp_size_t rn, k;
40 1.1 mrg mp_ptr scratch, temp;
41 1.1 mrg
42 1.1 mrg ASSERT (an >= bn);
43 1.1 mrg ASSERT (bn >= 1);
44 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
45 1.1 mrg ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
46 1.1 mrg
47 1.1 mrg if (bn < MULMID_TOOM42_THRESHOLD)
48 1.1 mrg {
49 1.1 mrg /* region not tall enough to make toom42 worthwhile for any portion */
50 1.1 mrg
51 1.1 mrg if (an < CHUNK)
52 1.1 mrg {
53 1.1 mrg /* region not too wide either, just call basecase directly */
54 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
55 1.1 mrg return;
56 1.1 mrg }
57 1.1 mrg
58 1.1 mrg /* Region quite wide. For better locality, use basecase on chunks:
59 1.1 mrg
60 1.1 mrg AAABBBCC..
61 1.1 mrg .AAABBBCC.
62 1.1 mrg ..AAABBBCC
63 1.1 mrg */
64 1.1 mrg
65 1.1 mrg k = CHUNK - bn + 1; /* number of diagonals per chunk */
66 1.1 mrg
67 1.1 mrg /* first chunk (marked A in the above diagram) */
68 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
69 1.1 mrg
70 1.1 mrg /* remaining chunks (B, C, etc) */
71 1.1 mrg an -= k;
72 1.1 mrg
73 1.1 mrg while (an >= CHUNK)
74 1.1 mrg {
75 1.1 mrg mp_limb_t t0, t1, cy;
76 1.1 mrg ap += k, rp += k;
77 1.1 mrg t0 = rp[0], t1 = rp[1];
78 1.1 mrg mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
79 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
80 1.1 mrg MPN_INCR_U (rp + 1, k + 1, t1 + cy);
81 1.1 mrg an -= k;
82 1.1 mrg }
83 1.1 mrg
84 1.1 mrg if (an >= bn)
85 1.1 mrg {
86 1.1 mrg /* last remaining chunk */
87 1.1 mrg mp_limb_t t0, t1, cy;
88 1.1 mrg ap += k, rp += k;
89 1.1 mrg t0 = rp[0], t1 = rp[1];
90 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
91 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0);
92 1.1 mrg MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
93 1.1 mrg }
94 1.1 mrg
95 1.1 mrg return;
96 1.1 mrg }
97 1.1 mrg
98 1.1 mrg /* region is tall enough for toom42 */
99 1.1 mrg
100 1.1 mrg rn = an - bn + 1;
101 1.1 mrg
102 1.1 mrg if (rn < MULMID_TOOM42_THRESHOLD)
103 1.1 mrg {
104 1.1 mrg /* region not wide enough to make toom42 worthwhile for any portion */
105 1.1 mrg
106 1.1 mrg TMP_DECL;
107 1.1 mrg
108 1.1 mrg if (bn < CHUNK)
109 1.1 mrg {
110 1.1 mrg /* region not too tall either, just call basecase directly */
111 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, bn);
112 1.1 mrg return;
113 1.1 mrg }
114 1.1 mrg
115 1.1 mrg /* Region quite tall. For better locality, use basecase on chunks:
116 1.1 mrg
117 1.1 mrg AAAAA....
118 1.1 mrg .AAAAA...
119 1.1 mrg ..BBBBB..
120 1.1 mrg ...BBBBB.
121 1.1 mrg ....CCCCC
122 1.1 mrg */
123 1.1 mrg
124 1.1 mrg TMP_MARK;
125 1.1 mrg
126 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2);
127 1.1 mrg
128 1.1 mrg /* first chunk (marked A in the above diagram) */
129 1.1 mrg bp += bn - CHUNK, an -= bn - CHUNK;
130 1.1 mrg mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
131 1.1 mrg
132 1.1 mrg /* remaining chunks (B, C, etc) */
133 1.1 mrg bn -= CHUNK;
134 1.1 mrg
135 1.1 mrg while (bn >= CHUNK)
136 1.1 mrg {
137 1.1 mrg ap += CHUNK, bp -= CHUNK;
138 1.1 mrg mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
139 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
140 1.1 mrg bn -= CHUNK;
141 1.1 mrg }
142 1.1 mrg
143 1.1 mrg if (bn)
144 1.1 mrg {
145 1.1 mrg /* last remaining chunk */
146 1.1 mrg ap += CHUNK, bp -= bn;
147 1.1 mrg mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
148 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
149 1.1 mrg }
150 1.1 mrg
151 1.1 mrg TMP_FREE;
152 1.1 mrg return;
153 1.1 mrg }
154 1.1 mrg
155 1.1 mrg /* we're definitely going to use toom42 somewhere */
156 1.1 mrg
157 1.1 mrg if (bn > rn)
158 1.1 mrg {
159 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly
160 1.1 mrg the last:
161 1.1 mrg
162 1.1 mrg AA....
163 1.1 mrg .AA...
164 1.1 mrg ..BB..
165 1.1 mrg ...BB.
166 1.1 mrg ....CC
167 1.1 mrg */
168 1.1 mrg
169 1.1 mrg TMP_DECL;
170 1.1 mrg TMP_MARK;
171 1.1 mrg
172 1.1 mrg temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
173 1.1 mrg scratch = temp + rn + 2;
174 1.1 mrg
175 1.1 mrg /* first chunk (marked A in the above diagram) */
176 1.1 mrg bp += bn - rn;
177 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
178 1.1 mrg
179 1.1 mrg /* remaining chunks (B, C, etc) */
180 1.1 mrg bn -= rn;
181 1.1 mrg
182 1.1 mrg while (bn >= rn)
183 1.1 mrg {
184 1.1 mrg ap += rn, bp -= rn;
185 1.1 mrg mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
186 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
187 1.1 mrg bn -= rn;
188 1.1 mrg }
189 1.1 mrg
190 1.1 mrg if (bn)
191 1.1 mrg {
192 1.1 mrg /* last remaining chunk */
193 1.1 mrg ap += rn, bp -= bn;
194 1.1 mrg mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
195 1.1 mrg mpn_add_n (rp, rp, temp, rn + 2);
196 1.1 mrg }
197 1.1 mrg
198 1.1 mrg TMP_FREE;
199 1.1 mrg }
200 1.1 mrg else
201 1.1 mrg {
202 1.1 mrg /* slice region into chunks, use toom42 on all chunks except possibly
203 1.1 mrg the last:
204 1.1 mrg
205 1.1 mrg AAABBBCC..
206 1.1 mrg .AAABBBCC.
207 1.1 mrg ..AAABBBCC
208 1.1 mrg */
209 1.1 mrg
210 1.1 mrg TMP_DECL;
211 1.1 mrg TMP_MARK;
212 1.1 mrg
213 1.1 mrg scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
214 1.1 mrg
215 1.1 mrg /* first chunk (marked A in the above diagram) */
216 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
217 1.1 mrg
218 1.1 mrg /* remaining chunks (B, C, etc) */
219 1.1 mrg rn -= bn;
220 1.1 mrg
221 1.1 mrg while (rn >= bn)
222 1.1 mrg {
223 1.1 mrg mp_limb_t t0, t1, cy;
224 1.1 mrg ap += bn, rp += bn;
225 1.1 mrg t0 = rp[0], t1 = rp[1];
226 1.1 mrg mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
227 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
228 1.1 mrg MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
229 1.1 mrg rn -= bn;
230 1.1 mrg }
231 1.1 mrg
232 1.1 mrg TMP_FREE;
233 1.1 mrg
234 1.1 mrg if (rn)
235 1.1 mrg {
236 1.1 mrg /* last remaining chunk */
237 1.1 mrg mp_limb_t t0, t1, cy;
238 1.1 mrg ap += bn, rp += bn;
239 1.1 mrg t0 = rp[0], t1 = rp[1];
240 1.1 mrg mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
241 1.1 mrg ADDC_LIMB (cy, rp[0], rp[0], t0);
242 1.1 mrg MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
243 1.1 mrg }
244 1.1 mrg }
245 1.1 mrg }
246