iter.c revision 1.10 1 1.10 mrg /* Copyright (C) 2005-2019 Free Software Foundation, Inc.
2 1.1 mrg Contributed by Richard Henderson <rth (at) redhat.com>.
3 1.1 mrg
4 1.5 mrg This file is part of the GNU Offloading and Multi Processing Library
5 1.5 mrg (libgomp).
6 1.1 mrg
7 1.1 mrg Libgomp is free software; you can redistribute it and/or modify it
8 1.1 mrg under the terms of the GNU General Public License as published by
9 1.1 mrg the Free Software Foundation; either version 3, or (at your option)
10 1.1 mrg any later version.
11 1.1 mrg
12 1.1 mrg Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 1.1 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 1.1 mrg FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 1.1 mrg more details.
16 1.1 mrg
17 1.1 mrg Under Section 7 of GPL version 3, you are granted additional
18 1.1 mrg permissions described in the GCC Runtime Library Exception, version
19 1.1 mrg 3.1, as published by the Free Software Foundation.
20 1.1 mrg
21 1.1 mrg You should have received a copy of the GNU General Public License and
22 1.1 mrg a copy of the GCC Runtime Library Exception along with this program;
23 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 1.1 mrg <http://www.gnu.org/licenses/>. */
25 1.1 mrg
26 1.1 mrg /* This file contains routines for managing work-share iteration, both
27 1.1 mrg for loops and sections. */
28 1.1 mrg
29 1.1 mrg #include "libgomp.h"
30 1.1 mrg #include <stdlib.h>
31 1.1 mrg
32 1.1 mrg
33 1.1 mrg /* This function implements the STATIC scheduling method. The caller should
34 1.1 mrg iterate *pstart <= x < *pend. Return zero if there are more iterations
35 1.1 mrg to perform; nonzero if not. Return less than 0 if this thread had
36 1.1 mrg received the absolutely last iteration. */
37 1.1 mrg
38 1.1 mrg int
39 1.1 mrg gomp_iter_static_next (long *pstart, long *pend)
40 1.1 mrg {
41 1.1 mrg struct gomp_thread *thr = gomp_thread ();
42 1.1 mrg struct gomp_team *team = thr->ts.team;
43 1.1 mrg struct gomp_work_share *ws = thr->ts.work_share;
44 1.1 mrg unsigned long nthreads = team ? team->nthreads : 1;
45 1.1 mrg
46 1.1 mrg if (thr->ts.static_trip == -1)
47 1.1 mrg return -1;
48 1.1 mrg
49 1.1 mrg /* Quick test for degenerate teams and orphaned constructs. */
50 1.1 mrg if (nthreads == 1)
51 1.1 mrg {
52 1.1 mrg *pstart = ws->next;
53 1.1 mrg *pend = ws->end;
54 1.1 mrg thr->ts.static_trip = -1;
55 1.1 mrg return ws->next == ws->end;
56 1.1 mrg }
57 1.1 mrg
58 1.1 mrg /* We interpret chunk_size zero as "unspecified", which means that we
59 1.1 mrg should break up the iterations such that each thread makes only one
60 1.1 mrg trip through the outer loop. */
61 1.1 mrg if (ws->chunk_size == 0)
62 1.1 mrg {
63 1.3 mrg unsigned long n, q, i, t;
64 1.1 mrg unsigned long s0, e0;
65 1.1 mrg long s, e;
66 1.1 mrg
67 1.1 mrg if (thr->ts.static_trip > 0)
68 1.1 mrg return 1;
69 1.1 mrg
70 1.1 mrg /* Compute the total number of iterations. */
71 1.1 mrg s = ws->incr + (ws->incr > 0 ? -1 : 1);
72 1.1 mrg n = (ws->end - ws->next + s) / ws->incr;
73 1.1 mrg i = thr->ts.team_id;
74 1.1 mrg
75 1.1 mrg /* Compute the "zero-based" start and end points. That is, as
76 1.1 mrg if the loop began at zero and incremented by one. */
77 1.1 mrg q = n / nthreads;
78 1.3 mrg t = n % nthreads;
79 1.3 mrg if (i < t)
80 1.3 mrg {
81 1.3 mrg t = 0;
82 1.3 mrg q++;
83 1.3 mrg }
84 1.3 mrg s0 = q * i + t;
85 1.1 mrg e0 = s0 + q;
86 1.1 mrg
87 1.1 mrg /* Notice when no iterations allocated for this thread. */
88 1.1 mrg if (s0 >= e0)
89 1.1 mrg {
90 1.1 mrg thr->ts.static_trip = 1;
91 1.1 mrg return 1;
92 1.1 mrg }
93 1.1 mrg
94 1.1 mrg /* Transform these to the actual start and end numbers. */
95 1.1 mrg s = (long)s0 * ws->incr + ws->next;
96 1.1 mrg e = (long)e0 * ws->incr + ws->next;
97 1.1 mrg
98 1.1 mrg *pstart = s;
99 1.1 mrg *pend = e;
100 1.1 mrg thr->ts.static_trip = (e0 == n ? -1 : 1);
101 1.1 mrg return 0;
102 1.1 mrg }
103 1.1 mrg else
104 1.1 mrg {
105 1.1 mrg unsigned long n, s0, e0, i, c;
106 1.1 mrg long s, e;
107 1.1 mrg
108 1.1 mrg /* Otherwise, each thread gets exactly chunk_size iterations
109 1.1 mrg (if available) each time through the loop. */
110 1.1 mrg
111 1.1 mrg s = ws->incr + (ws->incr > 0 ? -1 : 1);
112 1.1 mrg n = (ws->end - ws->next + s) / ws->incr;
113 1.1 mrg i = thr->ts.team_id;
114 1.1 mrg c = ws->chunk_size;
115 1.1 mrg
116 1.1 mrg /* Initial guess is a C sized chunk positioned nthreads iterations
117 1.1 mrg in, offset by our thread number. */
118 1.1 mrg s0 = (thr->ts.static_trip * nthreads + i) * c;
119 1.1 mrg e0 = s0 + c;
120 1.1 mrg
121 1.1 mrg /* Detect overflow. */
122 1.1 mrg if (s0 >= n)
123 1.1 mrg return 1;
124 1.1 mrg if (e0 > n)
125 1.1 mrg e0 = n;
126 1.1 mrg
127 1.1 mrg /* Transform these to the actual start and end numbers. */
128 1.1 mrg s = (long)s0 * ws->incr + ws->next;
129 1.1 mrg e = (long)e0 * ws->incr + ws->next;
130 1.1 mrg
131 1.1 mrg *pstart = s;
132 1.1 mrg *pend = e;
133 1.1 mrg
134 1.1 mrg if (e0 == n)
135 1.1 mrg thr->ts.static_trip = -1;
136 1.1 mrg else
137 1.1 mrg thr->ts.static_trip++;
138 1.1 mrg return 0;
139 1.1 mrg }
140 1.1 mrg }
141 1.1 mrg
142 1.1 mrg
143 1.1 mrg /* This function implements the DYNAMIC scheduling method. Arguments are
144 1.1 mrg as for gomp_iter_static_next. This function must be called with ws->lock
145 1.1 mrg held. */
146 1.1 mrg
147 1.1 mrg bool
148 1.1 mrg gomp_iter_dynamic_next_locked (long *pstart, long *pend)
149 1.1 mrg {
150 1.1 mrg struct gomp_thread *thr = gomp_thread ();
151 1.1 mrg struct gomp_work_share *ws = thr->ts.work_share;
152 1.1 mrg long start, end, chunk, left;
153 1.1 mrg
154 1.1 mrg start = ws->next;
155 1.1 mrg if (start == ws->end)
156 1.1 mrg return false;
157 1.1 mrg
158 1.1 mrg chunk = ws->chunk_size;
159 1.1 mrg left = ws->end - start;
160 1.1 mrg if (ws->incr < 0)
161 1.1 mrg {
162 1.1 mrg if (chunk < left)
163 1.1 mrg chunk = left;
164 1.1 mrg }
165 1.1 mrg else
166 1.1 mrg {
167 1.1 mrg if (chunk > left)
168 1.1 mrg chunk = left;
169 1.1 mrg }
170 1.1 mrg end = start + chunk;
171 1.1 mrg
172 1.1 mrg ws->next = end;
173 1.1 mrg *pstart = start;
174 1.1 mrg *pend = end;
175 1.1 mrg return true;
176 1.1 mrg }
177 1.1 mrg
178 1.1 mrg
#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long begin, bound, finish, step, dir;

  bound = ws->end;
  dir = ws->incr;
  step = ws->chunk_size;

  if (__builtin_expect (ws->mode, 1))
    {
      /* Fast path: the bounds were precomputed such that a plain
	 fetch-and-add cannot run past them in a harmful way.  */
      long got = __sync_fetch_and_add (&ws->next, step);
      if (dir > 0)
	{
	  if (got >= bound)
	    return false;
	  finish = got + step;
	  if (finish > bound)
	    finish = bound;
	}
      else
	{
	  if (got <= bound)
	    return false;
	  finish = got + step;
	  if (finish < bound)
	    finish = bound;
	}
      *pstart = got;
      *pend = finish;
      return true;
    }

  /* Slow path: claim a chunk via compare-and-swap, retrying with the
     observed value whenever another thread won the race.  */
  begin = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  for (;;)
    {
      long remaining, prev;

      if (begin == bound)
	return false;

      /* Clamp the chunk toward whatever is left, respecting the
	 direction in which the loop advances.  */
      remaining = bound - begin;
      if (dir < 0 ? step < remaining : step > remaining)
	step = remaining;
      finish = begin + step;

      prev = __sync_val_compare_and_swap (&ws->next, begin, finish);
      if (__builtin_expect (prev == begin, 1))
	break;

      begin = prev;
    }

  *pstart = begin;
  *pend = finish;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
254 1.1 mrg
255 1.1 mrg
256 1.1 mrg /* This function implements the GUIDED scheduling method. Arguments are
257 1.1 mrg as for gomp_iter_static_next. This function must be called with the
258 1.1 mrg work share lock held. */
259 1.1 mrg
260 1.1 mrg bool
261 1.1 mrg gomp_iter_guided_next_locked (long *pstart, long *pend)
262 1.1 mrg {
263 1.1 mrg struct gomp_thread *thr = gomp_thread ();
264 1.1 mrg struct gomp_work_share *ws = thr->ts.work_share;
265 1.1 mrg struct gomp_team *team = thr->ts.team;
266 1.1 mrg unsigned long nthreads = team ? team->nthreads : 1;
267 1.1 mrg unsigned long n, q;
268 1.1 mrg long start, end;
269 1.1 mrg
270 1.1 mrg if (ws->next == ws->end)
271 1.1 mrg return false;
272 1.1 mrg
273 1.1 mrg start = ws->next;
274 1.1 mrg n = (ws->end - start) / ws->incr;
275 1.1 mrg q = (n + nthreads - 1) / nthreads;
276 1.1 mrg
277 1.1 mrg if (q < ws->chunk_size)
278 1.1 mrg q = ws->chunk_size;
279 1.1 mrg if (q <= n)
280 1.1 mrg end = start + q * ws->incr;
281 1.1 mrg else
282 1.1 mrg end = ws->end;
283 1.1 mrg
284 1.1 mrg ws->next = end;
285 1.1 mrg *pstart = start;
286 1.1 mrg *pend = end;
287 1.1 mrg return true;
288 1.1 mrg }
289 1.1 mrg
#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthr = team ? team->nthreads : 1;
  unsigned long csz;
  long begin, bound, finish, dir;

  begin = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  bound = ws->end;
  dir = ws->incr;
  csz = ws->chunk_size;

  /* Claim a guided chunk via compare-and-swap, retrying with the
     observed value whenever another thread won the race.  */
  for (;;)
    {
      unsigned long remaining, share;
      long prev;

      if (begin == bound)
	return false;

      /* Take (roughly) an nthr-th of what remains, but never less than
	 the requested chunk size.  */
      remaining = (bound - begin) / dir;
      share = (remaining + nthr - 1) / nthr;
      if (share < csz)
	share = csz;

      /* A share larger than the remainder means we take everything left.  */
      if (__builtin_expect (share <= remaining, 1))
	finish = begin + share * dir;
      else
	finish = bound;

      prev = __sync_val_compare_and_swap (&ws->next, begin, finish);
      if (__builtin_expect (prev == begin, 1))
	break;

      begin = prev;
    }

  *pstart = begin;
  *pend = finish;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
339