/* Copyright (C) 2005-2017 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread has
   received the absolutely last iteration.  */
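/* An illustrative caller (a sketch, not the exact code the compiler
   generates for an OpenMP static loop) would, for an upward loop,
   look like:

     gomp_ull s, e;
     if (gomp_iter_ull_static_next (&s, &e) == 0)
       do
         {
           for (gomp_ull x = s; x < e; x += incr)
             body (x);
         }
       while (gomp_iter_ull_static_next (&s, &e) == 0);

   where incr and body stand in for the loop's increment and body.  */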

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

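  /* thr->ts.static_trip records this thread's progress through the static
     schedule: 0 on the first call, -1 once this thread has been handed the
     chunk containing the final iteration, and otherwise a positive trip
     count.  */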
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
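      /* For example, n = 10 and nthreads = 4 give q = 2 and t = 2:
         threads 0 and 1 take three iterations each (s0 = 0 and 3), and
         threads 2 and 3 take two each (s0 = 6 and 8), covering [0, 10).  */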

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C-sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
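      /* For example, with n = 10, c = 2 and nthreads = 3: on trip 0 the
         threads claim s0 = 0, 2 and 4; on trip 1 they try s0 = 6, 8 and
         10, and thread 2 finds s0 >= n and is done.  */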

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
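  /* For dynamic scheduling chunk_size_ull was pre-multiplied by the
     increment when the work share was initialized, so on a downward loop
     (mode bit 2 set) both chunk and left are "negative" values in
     unsigned arithmetic and the sense of the clamping test inverts.  */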
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
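
/* An illustrative caller (a sketch of the pattern used when the lock-free
   variant below is unavailable):

     gomp_ull s, e;
     bool ret;
     gomp_mutex_lock (&thr->ts.work_share->lock);
     ret = gomp_iter_ull_dynamic_next_locked (&s, &e);
     gomp_mutex_unlock (&thr->ts.work_share->lock);  */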


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
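/* Mode bit 0 is set when the work share is initialized and records that
   nthreads and the chunk size were checked to be small enough that
   ws->next_ull cannot wrap around, even if every thread overshoots the
   end by a chunk.  That makes the unconditional fetch-and-add below safe:
   a result at or past the end simply fails the bounds test.  */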

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
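  /* Fetch-and-add was not proven safe at initialization, so fall back to
     a compare-and-swap loop: clamp the chunk to the work remaining and
     retry with the value observed whenever another thread gets in first.  */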
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */
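/* Each successful call hands out q = max (ceil (remaining / nthreads),
   chunk_size) iterations, so chunks shrink geometrically toward the
   chunk_size floor as the loop drains.  As an illustration, 100 remaining
   iterations on 4 threads with chunk_size 7 yield q = 25, then 19 of the
   remaining 75, and so on.  */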

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
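/* Unlike the dynamic case there is no fetch-and-add fast path here: the
   chunk size depends on how much work remains, so every claim goes
   through the compare-and-swap loop.  */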

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */