dist/libgomp/iter.c

1.1.1.12  mrg /* Copyright (C) 2005-2024 Free Software Foundation, Inc.
     1.1  mrg    Contributed by Richard Henderson <rth (at) redhat.com>.
     1.1  mrg
 1.1.1.3  mrg    This file is part of the GNU Offloading and Multi Processing Library
 1.1.1.3  mrg    (libgomp).
     1.1  mrg
     1.1  mrg    Libgomp is free software; you can redistribute it and/or modify it
     1.1  mrg    under the terms of the GNU General Public License as published by
     1.1  mrg    the Free Software Foundation; either version 3, or (at your option)
     1.1  mrg    any later version.
     1.1  mrg
     1.1  mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     1.1  mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     1.1  mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     1.1  mrg    more details.
     1.1  mrg
     1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
     1.1  mrg    permissions described in the GCC Runtime Library Exception, version
     1.1  mrg    3.1, as published by the Free Software Foundation.
     1.1  mrg
     1.1  mrg    You should have received a copy of the GNU General Public License and
     1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
     1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     1.1  mrg    <http://www.gnu.org/licenses/>.  */
     1.1  mrg
     1.1  mrg /* This file contains routines for managing work-share iteration, both
     1.1  mrg    for loops and sections.  */
     1.1  mrg
     1.1  mrg #include "libgomp.h"
     1.1  mrg #include <stdlib.h>
     1.1  mrg
     1.1  mrg
     1.1  mrg /* This function implements the STATIC scheduling method.  The caller should
     1.1  mrg    iterate *pstart <= x < *pend.  Return zero if there are more iterations
     1.1  mrg    to perform; nonzero if not.  Return less than 0 if this thread had
     1.1  mrg    received the absolutely last iteration.  */
              /* Progress between calls is kept in thr->ts.static_trip:
                   -1  this thread was handed the range ending at the last iteration,
                       or the single-thread shortcut ran; further calls return -1.
                   >0  with chunk_size == 0, the thread's only trip has been decided
                       and further calls return 1; with a nonzero chunk_size, the
                       number of chunks this thread has taken so far.
                 NOTE(review): the field is presumably 0 when a work share starts; it
                 is not initialised in this function -- confirm at the call sites.  */
     1.1  mrg
     1.1  mrg int
     1.1  mrg gomp_iter_static_next (long *pstart, long *pend)
     1.1  mrg {
     1.1  mrg   struct gomp_thread *thr = gomp_thread ();
     1.1  mrg   struct gomp_team *team = thr->ts.team;
     1.1  mrg   struct gomp_work_share *ws = thr->ts.work_share;
     1.1  mrg   unsigned long nthreads = team ? team->nthreads : 1;
     1.1  mrg
                /* A static_trip of -1 marks a thread that is finished; keep
                   reporting that without touching *pstart or *pend.  */
     1.1  mrg   if (thr->ts.static_trip == -1)
     1.1  mrg     return -1;
     1.1  mrg
     1.1  mrg   /* Quick test for degenerate teams and orphaned constructs.  */
     1.1  mrg   if (nthreads == 1)
     1.1  mrg     {
                    /* The lone thread takes the whole range in a single trip.  */
     1.1  mrg       *pstart = ws->next;
     1.1  mrg       *pend = ws->end;
     1.1  mrg       thr->ts.static_trip = -1;
                    /* Nonzero ("no work") only when the range is empty.  */
     1.1  mrg       return ws->next == ws->end;
     1.1  mrg     }
     1.1  mrg
     1.1  mrg   /* We interpret chunk_size zero as "unspecified", which means that we
     1.1  mrg      should break up the iterations such that each thread makes only one
     1.1  mrg      trip through the outer loop.  */
     1.1  mrg   if (ws->chunk_size == 0)
     1.1  mrg     {
 1.1.1.2  mrg       unsigned long n, q, i, t;
     1.1  mrg       unsigned long s0, e0;
     1.1  mrg       long s, e;
     1.1  mrg
                    /* A positive static_trip means this thread's single trip was
                       already decided on an earlier call.  */
     1.1  mrg       if (thr->ts.static_trip > 0)
     1.1  mrg 	return 1;
     1.1  mrg
     1.1  mrg       /* Compute the total number of iterations.  */
                    /* S biases the numerator so that the truncating division yields
                       the iteration count rounded up, i.e. a partial final step
                       still counts as one iteration, for either sign of incr.  */
     1.1  mrg       s = ws->incr + (ws->incr > 0 ? -1 : 1);
     1.1  mrg       n = (ws->end - ws->next + s) / ws->incr;
     1.1  mrg       i = thr->ts.team_id;
     1.1  mrg
     1.1  mrg       /* Compute the "zero-based" start and end points.  That is, as
     1.1  mrg          if the loop began at zero and incremented by one.  */
                    /* Every thread gets Q = n / nthreads iterations and the first
                       T = n % nthreads threads get one more.  Those threads start
                       at (q + 1) * i; the others start at q * i + t, which skips
                       the T extra iterations given to lower-numbered threads.  */
     1.1  mrg       q = n / nthreads;
 1.1.1.2  mrg       t = n % nthreads;
 1.1.1.2  mrg       if (i < t)
 1.1.1.2  mrg 	{
 1.1.1.2  mrg 	  t = 0;
 1.1.1.2  mrg 	  q++;
 1.1.1.2  mrg 	}
 1.1.1.2  mrg       s0 = q * i + t;
     1.1  mrg       e0 = s0 + q;
     1.1  mrg
     1.1  mrg       /* Notice when no iterations allocated for this thread.  */
     1.1  mrg       if (s0 >= e0)
     1.1  mrg 	{
     1.1  mrg 	  thr->ts.static_trip = 1;
     1.1  mrg 	  return 1;
     1.1  mrg 	}
     1.1  mrg
     1.1  mrg       /* Transform these to the actual start and end numbers.  */
     1.1  mrg       s = (long)s0 * ws->incr + ws->next;
     1.1  mrg       e = (long)e0 * ws->incr + ws->next;
     1.1  mrg
     1.1  mrg       *pstart = s;
     1.1  mrg       *pend = e;
                    /* Record whether this thread's range ends at the very last
                       iteration (-1) or somewhere before it (1).  */
     1.1  mrg       thr->ts.static_trip = (e0 == n ? -1 : 1);
     1.1  mrg       return 0;
     1.1  mrg     }
     1.1  mrg   else
     1.1  mrg     {
     1.1  mrg       unsigned long n, s0, e0, i, c;
     1.1  mrg       long s, e;
     1.1  mrg
     1.1  mrg       /* Otherwise, each thread gets exactly chunk_size iterations
     1.1  mrg 	 (if available) each time through the loop.  */
     1.1  mrg
                    /* Same rounded-up iteration count as in the unchunked case.  */
     1.1  mrg       s = ws->incr + (ws->incr > 0 ? -1 : 1);
     1.1  mrg       n = (ws->end - ws->next + s) / ws->incr;
     1.1  mrg       i = thr->ts.team_id;
     1.1  mrg       c = ws->chunk_size;
     1.1  mrg
     1.1  mrg       /* Initial guess is a C sized chunk positioned nthreads iterations
     1.1  mrg 	 in, offset by our thread number.  */
                    /* static_trip is the number of chunks this thread has already
                       taken, so chunks are dealt out round-robin by team_id.  */
     1.1  mrg       s0 = (thr->ts.static_trip * nthreads + i) * c;
     1.1  mrg       e0 = s0 + c;
     1.1  mrg
     1.1  mrg       /* Detect overflow.  */
                    /* That is, a chunk starting at or beyond the iteration count N
                       means nothing is left for this thread; a chunk that merely
                       ends beyond N is trimmed.  */
     1.1  mrg       if (s0 >= n)
     1.1  mrg 	return 1;
     1.1  mrg       if (e0 > n)
     1.1  mrg 	e0 = n;
     1.1  mrg
     1.1  mrg       /* Transform these to the actual start and end numbers.  */
     1.1  mrg       s = (long)s0 * ws->incr + ws->next;
     1.1  mrg       e = (long)e0 * ws->incr + ws->next;
     1.1  mrg
     1.1  mrg       *pstart = s;
     1.1  mrg       *pend = e;
     1.1  mrg
                    /* Mark the thread finished when this chunk reaches the last
                       iteration; otherwise count the trip so the next call moves
                       on to this thread's following chunk.  */
     1.1  mrg       if (e0 == n)
     1.1  mrg 	thr->ts.static_trip = -1;
     1.1  mrg       else
     1.1  mrg 	thr->ts.static_trip++;
     1.1  mrg       return 0;
     1.1  mrg     }
     1.1  mrg }
     1.1  mrg
     1.1  mrg
     1.1  mrg /* This function implements the DYNAMIC scheduling method.  Arguments are
     1.1  mrg    as for gomp_iter_static_next.  This function must be called with ws->lock
     1.1  mrg    held.  */
              /* Returns true after storing the claimed range in *pstart / *pend and
                 advancing ws->next to its end; returns false, leaving the outputs
                 untouched, when ws->next has already reached ws->end.  */
     1.1  mrg
     1.1  mrg bool
     1.1  mrg gomp_iter_dynamic_next_locked (long *pstart, long *pend)
     1.1  mrg {
     1.1  mrg   struct gomp_thread *thr = gomp_thread ();
     1.1  mrg   struct gomp_work_share *ws = thr->ts.work_share;
     1.1  mrg   long start, end, chunk, left;
     1.1  mrg
     1.1  mrg   start = ws->next;
     1.1  mrg   if (start == ws->end)
     1.1  mrg     return false;
     1.1  mrg
                /* CHUNK is added to START below without scaling by ws->incr.
                   NOTE(review): ws->chunk_size is therefore presumably stored
                   already multiplied by incr -- confirm where the work share is
                   initialised.  */
     1.1  mrg   chunk = ws->chunk_size;
     1.1  mrg   left = ws->end - start;
                /* Clamp the chunk so the range stops at ws->end.  For a downward
                   loop the comparison is reversed, keeping whichever of CHUNK and
                   LEFT is greater (presumably both are negative there, so that is
                   the one closer to zero).  */
     1.1  mrg   if (ws->incr < 0)
     1.1  mrg     {
     1.1  mrg       if (chunk < left)
     1.1  mrg 	chunk = left;
     1.1  mrg     }
     1.1  mrg   else
     1.1  mrg     {
     1.1  mrg       if (chunk > left)
     1.1  mrg 	chunk = left;
     1.1  mrg     }
     1.1  mrg   end = start + chunk;
     1.1  mrg
                /* Publish the new position before handing the range back.  */
     1.1  mrg   ws->next = end;
     1.1  mrg   *pstart = start;
     1.1  mrg   *pend = end;
     1.1  mrg   return true;
     1.1  mrg }
     1.1  mrg
     1.1  mrg
     1.1  mrg #ifdef HAVE_SYNC_BUILTINS
     1.1  mrg /* Similar, but doesn't require the lock held, and uses compare-and-swap
     1.1  mrg    instead.  Note that the only memory value that changes is ws->next.  */
              /* Returns true with the claimed range stored in *pstart / *pend, or
                 false (outputs untouched) when no iterations are left.  Two paths
                 exist: a single fetch-and-add when ws->mode is nonzero, and a
                 compare-and-swap retry loop otherwise.  */
     1.1  mrg
     1.1  mrg bool
     1.1  mrg gomp_iter_dynamic_next (long *pstart, long *pend)
     1.1  mrg {
     1.1  mrg   struct gomp_thread *thr = gomp_thread ();
     1.1  mrg   struct gomp_work_share *ws = thr->ts.work_share;
     1.1  mrg   long start, end, nend, chunk, incr;
     1.1  mrg
     1.1  mrg   end = ws->end;
     1.1  mrg   incr = ws->incr;
     1.1  mrg   chunk = ws->chunk_size;
     1.1  mrg
                /* Fast path: claim a chunk with one unconditional atomic add.  This
                   can push ws->next beyond END, so the previous value TMP returned
                   by the add is range-checked afterwards.
                   NOTE(review): presumably ws->mode is only nonzero when repeated
                   overshoot cannot overflow a long -- confirm where it is set.  */
     1.1  mrg   if (__builtin_expect (ws->mode, 1))
     1.1  mrg     {
     1.1  mrg       long tmp = __sync_fetch_and_add (&ws->next, chunk);
     1.1  mrg       if (incr > 0)
     1.1  mrg 	{
                      /* Upward loop: TMP at or beyond END means nothing is left.  */
     1.1  mrg 	  if (tmp >= end)
     1.1  mrg 	    return false;
                      /* Trim the final chunk so it stops at END.  */
     1.1  mrg 	  nend = tmp + chunk;
     1.1  mrg 	  if (nend > end)
     1.1  mrg 	    nend = end;
     1.1  mrg 	  *pstart = tmp;
     1.1  mrg 	  *pend = nend;
     1.1  mrg 	  return true;
     1.1  mrg 	}
     1.1  mrg       else
     1.1  mrg 	{
                      /* Downward loop: the same checks with the comparisons
                         mirrored.  */
     1.1  mrg 	  if (tmp <= end)
     1.1  mrg 	    return false;
     1.1  mrg 	  nend = tmp + chunk;
     1.1  mrg 	  if (nend < end)
     1.1  mrg 	    nend = end;
     1.1  mrg 	  *pstart = tmp;
     1.1  mrg 	  *pend = nend;
     1.1  mrg 	  return true;
     1.1  mrg 	}
     1.1  mrg     }
     1.1  mrg
                /* Slow path: clamp the chunk first, then publish the new position
                   with compare-and-swap, so the value stored in ws->next is never
                   beyond END.  */
 1.1.1.3  mrg   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
     1.1  mrg   while (1)
     1.1  mrg     {
     1.1  mrg       long left = end - start;
     1.1  mrg       long tmp;
     1.1  mrg
     1.1  mrg       if (start == end)
     1.1  mrg 	return false;
     1.1  mrg
                    /* Same sign-aware clamp as in gomp_iter_dynamic_next_locked.
                       CHUNK keeps its clamped value across retries.  */
     1.1  mrg       if (incr < 0)
     1.1  mrg 	{
     1.1  mrg 	  if (chunk < left)
     1.1  mrg 	    chunk = left;
     1.1  mrg 	}
     1.1  mrg       else
     1.1  mrg 	{
     1.1  mrg 	  if (chunk > left)
     1.1  mrg 	    chunk = left;
     1.1  mrg 	}
     1.1  mrg       nend = start + chunk;
     1.1  mrg
                    /* The CAS returns the value it found in ws->next.  If that is
                       START, our update was stored and the range is ours;
                       otherwise another thread moved ws->next, so retry from the
                       value just observed.  */
     1.1  mrg       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
     1.1  mrg       if (__builtin_expect (tmp == start, 1))
     1.1  mrg 	break;
     1.1  mrg
     1.1  mrg       start = tmp;
     1.1  mrg     }
     1.1  mrg
     1.1  mrg   *pstart = start;
     1.1  mrg   *pend = nend;
     1.1  mrg   return true;
     1.1  mrg }
     1.1  mrg #endif /* HAVE_SYNC_BUILTINS */
     1.1  mrg
     1.1  mrg
     1.1  mrg /* This function implements the GUIDED scheduling method.  Arguments are
     1.1  mrg    as for gomp_iter_static_next.  This function must be called with the
     1.1  mrg    work share lock held.  */
              /* Returns true after storing the claimed range in *pstart / *pend and
                 advancing ws->next to its end; returns false, leaving the outputs
                 untouched, when ws->next has already reached ws->end.  Each call
                 hands out roughly 1/nthreads of what remains, but never fewer than
                 ws->chunk_size iterations unless less than that is left.  */
     1.1  mrg
     1.1  mrg bool
     1.1  mrg gomp_iter_guided_next_locked (long *pstart, long *pend)
     1.1  mrg {
     1.1  mrg   struct gomp_thread *thr = gomp_thread ();
     1.1  mrg   struct gomp_work_share *ws = thr->ts.work_share;
     1.1  mrg   struct gomp_team *team = thr->ts.team;
                /* Without a team the calling thread is treated as a team of one.  */
     1.1  mrg   unsigned long nthreads = team ? team->nthreads : 1;
     1.1  mrg   unsigned long n, q;
     1.1  mrg   long start, end;
     1.1  mrg
     1.1  mrg   if (ws->next == ws->end)
     1.1  mrg     return false;
     1.1  mrg
     1.1  mrg   start = ws->next;
                /* N is the number of whole incr-sized steps left (the division
                   truncates); Q is an equal per-thread share of N, rounded up.  */
     1.1  mrg   n = (ws->end - start) / ws->incr;
     1.1  mrg   q = (n + nthreads - 1) / nthreads;
     1.1  mrg
                /* Raise the share to the minimum chunk size.  If the result exceeds
                   the whole steps remaining, take everything up to ws->end, which
                   also covers a trailing partial step.  */
     1.1  mrg   if (q < ws->chunk_size)
     1.1  mrg     q = ws->chunk_size;
     1.1  mrg   if (q <= n)
     1.1  mrg     end = start + q * ws->incr;
     1.1  mrg   else
     1.1  mrg     end = ws->end;
     1.1  mrg
     1.1  mrg   ws->next = end;
     1.1  mrg   *pstart = start;
     1.1  mrg   *pend = end;
     1.1  mrg   return true;
     1.1  mrg }
     1.1  mrg
     1.1  mrg #ifdef HAVE_SYNC_BUILTINS
     1.1  mrg /* Similar, but doesn't require the lock held, and uses compare-and-swap
     1.1  mrg    instead.  Note that the only memory value that changes is ws->next.  */
              /* Returns true with the claimed range stored in *pstart / *pend, or
                 false (outputs untouched) when ws->next has reached ws->end.  The
                 chunk size is computed exactly as in gomp_iter_guided_next_locked
                 and recomputed on every retry.  */
     1.1  mrg
     1.1  mrg bool
     1.1  mrg gomp_iter_guided_next (long *pstart, long *pend)
     1.1  mrg {
     1.1  mrg   struct gomp_thread *thr = gomp_thread ();
     1.1  mrg   struct gomp_work_share *ws = thr->ts.work_share;
     1.1  mrg   struct gomp_team *team = thr->ts.team;
     1.1  mrg   unsigned long nthreads = team ? team->nthreads : 1;
     1.1  mrg   long start, end, nend, incr;
     1.1  mrg   unsigned long chunk_size;
     1.1  mrg
                /* ws->end, ws->incr and ws->chunk_size are read once here; only
                   ws->next is observed again, through the CAS, when retrying.  */
 1.1.1.3  mrg   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
     1.1  mrg   end = ws->end;
     1.1  mrg   incr = ws->incr;
     1.1  mrg   chunk_size = ws->chunk_size;
     1.1  mrg
     1.1  mrg   while (1)
     1.1  mrg     {
     1.1  mrg       unsigned long n, q;
     1.1  mrg       long tmp;
     1.1  mrg
     1.1  mrg       if (start == end)
     1.1  mrg 	return false;
     1.1  mrg
                    /* N is the number of whole incr-sized steps left from START;
                       Q is an equal per-thread share of N, rounded up.  */
     1.1  mrg       n = (end - start) / incr;
     1.1  mrg       q = (n + nthreads - 1) / nthreads;
     1.1  mrg
                    /* Raise the share to the minimum chunk size; if that exceeds
                       the whole steps remaining, take everything up to END.  */
     1.1  mrg       if (q < chunk_size)
     1.1  mrg 	q = chunk_size;
     1.1  mrg       if (__builtin_expect (q <= n, 1))
     1.1  mrg 	nend = start + q * incr;
     1.1  mrg       else
     1.1  mrg 	nend = end;
     1.1  mrg
                    /* The CAS returns the value it found in ws->next.  If that is
                       START, our update was stored and the range is ours;
                       otherwise another thread moved ws->next, so recompute the
                       share from the value just observed.  */
     1.1  mrg       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
     1.1  mrg       if (__builtin_expect (tmp == start, 1))
     1.1  mrg 	break;
     1.1  mrg
     1.1  mrg       start = tmp;
     1.1  mrg     }
     1.1  mrg
     1.1  mrg   *pstart = start;
     1.1  mrg   *pend = nend;
     1.1  mrg   return true;
     1.1  mrg }
     1.1  mrg #endif /* HAVE_SYNC_BUILTINS */