/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread received the absolutely last iteration.  */
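
/* A minimal sketch of the calling convention for an upward loop; this
   is illustration only, not libgomp code (in practice the
   GOMP_loop_ull_*_start/next entry points in loop_ull.c drive these
   iterators):

     gomp_ull start, end, x;
     if (gomp_iter_ull_static_next (&start, &end) == 0)
       for (x = start; x < end; x += incr)
         body (x);

   where incr and body stand in for the caller's increment and loop
   body.  */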

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
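      /* For a downward loop (ws->mode nonzero here) the same ceiling
         division runs on the negated distance and increment; e.g. with
         next_ull = 100, end_ull = 0 and incr_ull = (gomp_ull) -3,
         n = (100 - 0 + 3 - 1) / 3 = 34 iterations.  */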
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
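      /* Worked example: n = 10 iterations, nthreads = 4 gives q = 2,
         t = 2, so threads 0 and 1 (i < t) take q + 1 = 3 iterations
         each while threads 2 and 3 take 2:
           thread 0: [0, 3)    thread 1: [3, 6)
           thread 2: [6, 8)    thread 3: [8, 10)  */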

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
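      /* Worked example: n = 10, nthreads = 2, c = 2.  static_trip
         counts trips 0, 1, 2, ..., so thread 0 claims the zero-based
         chunks [0, 2), [4, 6), [8, 10) and thread 1 claims [2, 4) and
         [6, 8): the round-robin layout of schedule(static, 2).  */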

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
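  /* For dynamic schedules the work-share init (gomp_loop_ull_init in
     loop_ull.c) pre-multiplies chunk_size_ull by incr_ull, so on a
     downward loop (mode bit 2 set) both CHUNK and LEFT are negative
     offsets in unsigned arithmetic and the clamping comparison is
     inverted.  */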
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
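
/* For illustration: with an upward loop over [0, 100), incr 1 and
   chunk_size 8, successive calls hand out [0, 8), [8, 16), ..., then
   the short tail [96, 100); once next_ull reaches end_ull the function
   returns false.  */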


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

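  /* Fast path: mode bit 0 was set at work-share init when it could
     prove that bumping next_ull past end_ull cannot wrap around, so a
     plain fetch-and-add is enough and no compare-and-swap retry loop
     is needed.  */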
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

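  /* Slow path: a compare-and-swap retry loop.  Each failed CAS returns
     the value another thread just published in next_ull, which becomes
     the new candidate start.  */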
  start = ws->next_ull;
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
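
/* Worked example: with 100 iterations remaining, nthreads = 4 and
   chunk_size 5, successive calls grab q = 25, 19, 14, 11, 8, 6, 5, 5,
   ... iterations: a quarter of what remains, rounded up, decaying
   until the chunk_size floor takes over.  */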

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = ws->next_ull;
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;
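  /* END, INCR and CHUNK_SIZE stay constant for the lifetime of the
     work share, so one snapshot suffices; only next_ull is re-read
     (via the CAS result) when a race forces a retry below.  */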

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */