1 1.1.1.12 mrg /* Copyright (C) 2005-2024 Free Software Foundation, Inc. 2 1.1 mrg Contributed by Richard Henderson <rth (at) redhat.com>. 3 1.1 mrg 4 1.1.1.3 mrg This file is part of the GNU Offloading and Multi Processing Library 5 1.1.1.3 mrg (libgomp). 6 1.1 mrg 7 1.1 mrg Libgomp is free software; you can redistribute it and/or modify it 8 1.1 mrg under the terms of the GNU General Public License as published by 9 1.1 mrg the Free Software Foundation; either version 3, or (at your option) 10 1.1 mrg any later version. 11 1.1 mrg 12 1.1 mrg Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 13 1.1 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 14 1.1 mrg FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 1.1 mrg more details. 16 1.1 mrg 17 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 18 1.1 mrg permissions described in the GCC Runtime Library Exception, version 19 1.1 mrg 3.1, as published by the Free Software Foundation. 20 1.1 mrg 21 1.1 mrg You should have received a copy of the GNU General Public License and 22 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 23 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24 1.1 mrg <http://www.gnu.org/licenses/>. */ 25 1.1 mrg 26 1.1 mrg /* This file contains routines for managing work-share iteration, both 27 1.1 mrg for loops and sections. */ 28 1.1 mrg 29 1.1 mrg #include "libgomp.h" 30 1.1 mrg #include <stdlib.h> 31 1.1 mrg 32 1.1 mrg 33 1.1 mrg /* This function implements the STATIC scheduling method. The caller should 34 1.1 mrg iterate *pstart <= x < *pend. Return zero if there are more iterations 35 1.1 mrg to perform; nonzero if not. Return less than 0 if this thread had 36 1.1 mrg received the absolutely last iteration. 
*/

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  /* static_trip == -1 records that this thread already received the
     final chunk of the iteration space on a previous call.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      /* Nonzero (no more work) exactly when the range is empty.  */
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      /* Each thread gets exactly one block in this mode; a second call
         means this thread's share is exhausted.  */
      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  S biases the dividend
         so the division yields the iteration count for any increment
         direction and non-exact final step.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      /* The first T threads each take Q+1 iterations; later threads
         take Q iterations offset by the T extras already handed out.  */
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      /* -1 marks the thread that received the very last iteration.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      /* Total iteration count, computed as in the unspecified-chunk
         case above.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  static_trip counts completed
         round-robin passes.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      /* This thread consumed the tail of the space; otherwise advance
         to the next round-robin pass.  */
      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.
*/ 146 1.1 mrg 147 1.1 mrg bool 148 1.1 mrg gomp_iter_dynamic_next_locked (long *pstart, long *pend) 149 1.1 mrg { 150 1.1 mrg struct gomp_thread *thr = gomp_thread (); 151 1.1 mrg struct gomp_work_share *ws = thr->ts.work_share; 152 1.1 mrg long start, end, chunk, left; 153 1.1 mrg 154 1.1 mrg start = ws->next; 155 1.1 mrg if (start == ws->end) 156 1.1 mrg return false; 157 1.1 mrg 158 1.1 mrg chunk = ws->chunk_size; 159 1.1 mrg left = ws->end - start; 160 1.1 mrg if (ws->incr < 0) 161 1.1 mrg { 162 1.1 mrg if (chunk < left) 163 1.1 mrg chunk = left; 164 1.1 mrg } 165 1.1 mrg else 166 1.1 mrg { 167 1.1 mrg if (chunk > left) 168 1.1 mrg chunk = left; 169 1.1 mrg } 170 1.1 mrg end = start + chunk; 171 1.1 mrg 172 1.1 mrg ws->next = end; 173 1.1 mrg *pstart = start; 174 1.1 mrg *pend = end; 175 1.1 mrg return true; 176 1.1 mrg } 177 1.1 mrg 178 1.1 mrg 179 1.1 mrg #ifdef HAVE_SYNC_BUILTINS 180 1.1 mrg /* Similar, but doesn't require the lock held, and uses compare-and-swap 181 1.1 mrg instead. Note that the only memory value that changes is ws->next. 
*/

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  /* end, incr and chunk_size are invariant for the work share; only
     ws->next is ever modified concurrently.  */
  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  /* Fast path: when ws->mode is nonzero, a plain fetch-and-add is used
     instead of the CAS loop below.  NOTE(review): this presumably
     requires that the iteration space was validated at loop start so
     ws->next overshooting end cannot overflow — confirm against the
     work-share initialization.  */
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          /* Another thread already consumed everything.  */
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          /* Decreasing loop: same logic with comparisons reversed.  */
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  /* Slow path: claim [start, start+chunk) via compare-and-swap,
     clamping the chunk so ws->next never moves past ws->end.  */
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      /* chunk and left share the sign of incr; limit the chunk's
         magnitude to the remaining iterations.  */
      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      /* Another thread won the race; retry from its published
         position.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED
 scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  /* Iteration space exhausted.  */
  if (ws->next == ws->end)
    return false;

  start = ws->next;
  /* Remaining iteration count; guided takes a chunk proportional to
     what is left divided among the threads.  */
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  /* Never hand out less than chunk_size iterations at a time.  */
  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    /* Fewer than Q iterations remain; take them all.  */
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  /* Only ws->next changes concurrently; snapshot the invariant
     fields once.  */
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      /* Same sizing rule as the locked variant: a thread's share of
         the remaining iterations, but at least chunk_size.  */
      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      /* Claim [start, nend); on a lost race retry from the value the
         winner published.  */
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */