Home | History | Annotate | Line # | Download | only in libgomp
iter_ull.c revision 1.5.4.2
      1  1.5.4.2  martin /* Copyright (C) 2005-2017 Free Software Foundation, Inc.
      2      1.1     mrg    Contributed by Richard Henderson <rth (at) redhat.com>.
      3      1.1     mrg 
      4      1.5     mrg    This file is part of the GNU Offloading and Multi Processing Library
      5      1.5     mrg    (libgomp).
      6      1.1     mrg 
      7      1.1     mrg    Libgomp is free software; you can redistribute it and/or modify it
      8      1.1     mrg    under the terms of the GNU General Public License as published by
      9      1.1     mrg    the Free Software Foundation; either version 3, or (at your option)
     10      1.1     mrg    any later version.
     11      1.1     mrg 
     12      1.1     mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     13      1.1     mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     14      1.1     mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     15      1.1     mrg    more details.
     16      1.1     mrg 
     17      1.1     mrg    Under Section 7 of GPL version 3, you are granted additional
     18      1.1     mrg    permissions described in the GCC Runtime Library Exception, version
     19      1.1     mrg    3.1, as published by the Free Software Foundation.
     20      1.1     mrg 
     21      1.1     mrg    You should have received a copy of the GNU General Public License and
     22      1.1     mrg    a copy of the GCC Runtime Library Exception along with this program;
     23      1.1     mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24      1.1     mrg    <http://www.gnu.org/licenses/>.  */
     25      1.1     mrg 
     26      1.1     mrg /* This file contains routines for managing work-share iteration, both
     27      1.1     mrg    for loops and sections.  */
     28      1.1     mrg 
     29      1.1     mrg #include "libgomp.h"
     30      1.1     mrg #include <stdlib.h>
     31      1.1     mrg 
/* Shorthand for the 64-bit unsigned iteration type used by all the
   _ull work-share iterators in this file.  */
typedef unsigned long long gomp_ull;
     33      1.1     mrg 
     34      1.1     mrg /* This function implements the STATIC scheduling method.  The caller should
     35      1.1     mrg    iterate *pstart <= x < *pend.  Return zero if there are more iterations
     36      1.1     mrg    to perform; nonzero if not.  Return less than 0 if this thread had
     37      1.1     mrg    received the absolutely last iteration.  */
     38      1.1     mrg 
     39      1.1     mrg int
     40      1.1     mrg gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
     41      1.1     mrg {
     42      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
     43      1.1     mrg   struct gomp_team *team = thr->ts.team;
     44      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
     45      1.1     mrg   unsigned long nthreads = team ? team->nthreads : 1;
     46      1.1     mrg 
     47      1.1     mrg   if (thr->ts.static_trip == -1)
     48      1.1     mrg     return -1;
     49      1.1     mrg 
     50      1.1     mrg   /* Quick test for degenerate teams and orphaned constructs.  */
     51      1.1     mrg   if (nthreads == 1)
     52      1.1     mrg     {
     53      1.1     mrg       *pstart = ws->next_ull;
     54      1.1     mrg       *pend = ws->end_ull;
     55      1.1     mrg       thr->ts.static_trip = -1;
     56      1.1     mrg       return ws->next_ull == ws->end_ull;
     57      1.1     mrg     }
     58      1.1     mrg 
     59      1.1     mrg   /* We interpret chunk_size zero as "unspecified", which means that we
     60      1.1     mrg      should break up the iterations such that each thread makes only one
     61      1.1     mrg      trip through the outer loop.  */
     62      1.1     mrg   if (ws->chunk_size_ull == 0)
     63      1.1     mrg     {
     64      1.3     mrg       gomp_ull n, q, i, t, s0, e0, s, e;
     65      1.1     mrg 
     66      1.1     mrg       if (thr->ts.static_trip > 0)
     67      1.1     mrg 	return 1;
     68      1.1     mrg 
     69      1.1     mrg       /* Compute the total number of iterations.  */
     70      1.1     mrg       if (__builtin_expect (ws->mode, 0) == 0)
     71      1.1     mrg 	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
     72      1.1     mrg       else
     73      1.1     mrg 	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
     74      1.1     mrg       i = thr->ts.team_id;
     75      1.1     mrg 
     76      1.1     mrg       /* Compute the "zero-based" start and end points.  That is, as
     77      1.1     mrg 	 if the loop began at zero and incremented by one.  */
     78      1.1     mrg       q = n / nthreads;
     79      1.3     mrg       t = n % nthreads;
     80      1.3     mrg       if (i < t)
     81      1.3     mrg 	{
     82      1.3     mrg 	  t = 0;
     83      1.3     mrg 	  q++;
     84      1.3     mrg 	}
     85      1.3     mrg       s0 = q * i + t;
     86      1.1     mrg       e0 = s0 + q;
     87      1.1     mrg 
     88      1.1     mrg       /* Notice when no iterations allocated for this thread.  */
     89      1.1     mrg       if (s0 >= e0)
     90      1.1     mrg 	{
     91      1.1     mrg 	  thr->ts.static_trip = 1;
     92      1.1     mrg 	  return 1;
     93      1.1     mrg 	}
     94      1.1     mrg 
     95      1.1     mrg       /* Transform these to the actual start and end numbers.  */
     96      1.1     mrg       s = s0 * ws->incr_ull + ws->next_ull;
     97      1.1     mrg       e = e0 * ws->incr_ull + ws->next_ull;
     98      1.1     mrg 
     99      1.1     mrg       *pstart = s;
    100      1.1     mrg       *pend = e;
    101      1.1     mrg       thr->ts.static_trip = (e0 == n ? -1 : 1);
    102      1.1     mrg       return 0;
    103      1.1     mrg     }
    104      1.1     mrg   else
    105      1.1     mrg     {
    106      1.1     mrg       gomp_ull n, s0, e0, i, c, s, e;
    107      1.1     mrg 
    108      1.1     mrg       /* Otherwise, each thread gets exactly chunk_size iterations
    109      1.1     mrg 	 (if available) each time through the loop.  */
    110      1.1     mrg 
    111      1.1     mrg       if (__builtin_expect (ws->mode, 0) == 0)
    112      1.1     mrg 	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
    113      1.1     mrg       else
    114      1.1     mrg 	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
    115      1.1     mrg       i = thr->ts.team_id;
    116      1.1     mrg       c = ws->chunk_size_ull;
    117      1.1     mrg 
    118      1.1     mrg       /* Initial guess is a C sized chunk positioned nthreads iterations
    119      1.1     mrg 	 in, offset by our thread number.  */
    120      1.1     mrg       s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
    121      1.1     mrg       e0 = s0 + c;
    122      1.1     mrg 
    123      1.1     mrg       /* Detect overflow.  */
    124      1.1     mrg       if (s0 >= n)
    125      1.1     mrg 	return 1;
    126      1.1     mrg       if (e0 > n)
    127      1.1     mrg 	e0 = n;
    128      1.1     mrg 
    129      1.1     mrg       /* Transform these to the actual start and end numbers.  */
    130      1.1     mrg       s = s0 * ws->incr_ull + ws->next_ull;
    131      1.1     mrg       e = e0 * ws->incr_ull + ws->next_ull;
    132      1.1     mrg 
    133      1.1     mrg       *pstart = s;
    134      1.1     mrg       *pend = e;
    135      1.1     mrg 
    136      1.1     mrg       if (e0 == n)
    137      1.1     mrg 	thr->ts.static_trip = -1;
    138      1.1     mrg       else
    139      1.1     mrg 	thr->ts.static_trip++;
    140      1.1     mrg       return 0;
    141      1.1     mrg     }
    142      1.1     mrg }
    143      1.1     mrg 
    144      1.1     mrg 
    145      1.1     mrg /* This function implements the DYNAMIC scheduling method.  Arguments are
    146      1.1     mrg    as for gomp_iter_ull_static_next.  This function must be called with
    147      1.1     mrg    ws->lock held.  */
    148      1.1     mrg 
    149      1.1     mrg bool
    150      1.1     mrg gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
    151      1.1     mrg {
    152      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
    153      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
    154      1.1     mrg   gomp_ull start, end, chunk, left;
    155      1.1     mrg 
    156      1.1     mrg   start = ws->next_ull;
    157      1.1     mrg   if (start == ws->end_ull)
    158      1.1     mrg     return false;
    159      1.1     mrg 
    160      1.1     mrg   chunk = ws->chunk_size_ull;
    161      1.1     mrg   left = ws->end_ull - start;
    162      1.1     mrg   if (__builtin_expect (ws->mode & 2, 0))
    163      1.1     mrg     {
    164      1.1     mrg       if (chunk < left)
    165      1.1     mrg 	chunk = left;
    166      1.1     mrg     }
    167      1.1     mrg   else
    168      1.1     mrg     {
    169      1.1     mrg       if (chunk > left)
    170      1.1     mrg 	chunk = left;
    171      1.1     mrg     }
    172      1.1     mrg   end = start + chunk;
    173      1.1     mrg 
    174      1.1     mrg   ws->next_ull = end;
    175      1.1     mrg   *pstart = start;
    176      1.1     mrg   *pend = end;
    177      1.1     mrg   return true;
    178      1.1     mrg }
    179      1.1     mrg 
    180      1.1     mrg 
    181      1.1     mrg #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.
   Returns true and stores the claimed range in *pstart/*pend, or returns
   false when no iterations remain.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  /* end and chunk are never written by any thread, so plain loads
     suffice; only ws->next_ull is contended.  */
  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  /* Fast path, gated on ws->mode bit 0: claim a chunk with a single
     atomic fetch-and-add.  NOTE(review): presumably the loop setup
     guarantees next_ull cannot wrap past the representable range under
     this mode, making the unconditional add safe — confirm against the
     gomp_loop_ull init code.  */
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      /* Bit 2 clear: increasing loop; iterate start up toward end.  */
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;		/* Clamp the final, partial chunk.  */
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Bit 2 set: decreasing loop; all comparisons reversed.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;		/* Clamp the final, partial chunk.  */
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: compare-and-swap retry loop.  A relaxed load is enough
     for the initial guess; the CAS below validates it.  */
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Clamp the chunk to what remains; comparison direction depends
	 on the loop direction (mode bit 2).  */
      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Lost the race; retry from the value another thread installed.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    254      1.1     mrg #endif /* HAVE_SYNC_BUILTINS */
    255      1.1     mrg 
    256      1.1     mrg 
    257      1.1     mrg /* This function implements the GUIDED scheduling method.  Arguments are
    258      1.1     mrg    as for gomp_iter_ull_static_next.  This function must be called with the
    259      1.1     mrg    work share lock held.  */
    260      1.1     mrg 
    261      1.1     mrg bool
    262      1.1     mrg gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
    263      1.1     mrg {
    264      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
    265      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
    266      1.1     mrg   struct gomp_team *team = thr->ts.team;
    267      1.1     mrg   gomp_ull nthreads = team ? team->nthreads : 1;
    268      1.1     mrg   gomp_ull n, q;
    269      1.1     mrg   gomp_ull start, end;
    270      1.1     mrg 
    271      1.1     mrg   if (ws->next_ull == ws->end_ull)
    272      1.1     mrg     return false;
    273      1.1     mrg 
    274      1.1     mrg   start = ws->next_ull;
    275      1.1     mrg   if (__builtin_expect (ws->mode, 0) == 0)
    276      1.1     mrg     n = (ws->end_ull - start) / ws->incr_ull;
    277      1.1     mrg   else
    278      1.1     mrg     n = (start - ws->end_ull) / -ws->incr_ull;
    279      1.1     mrg   q = (n + nthreads - 1) / nthreads;
    280      1.1     mrg 
    281      1.1     mrg   if (q < ws->chunk_size_ull)
    282      1.1     mrg     q = ws->chunk_size_ull;
    283      1.1     mrg   if (q <= n)
    284      1.1     mrg     end = start + q * ws->incr_ull;
    285      1.1     mrg   else
    286      1.1     mrg     end = ws->end_ull;
    287      1.1     mrg 
    288      1.1     mrg   ws->next_ull = end;
    289      1.1     mrg   *pstart = start;
    290      1.1     mrg   *pend = end;
    291      1.1     mrg   return true;
    292      1.1     mrg }
    293      1.1     mrg 
    294      1.1     mrg #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.
   Returns true and stores the claimed range in *pstart/*pend, or returns
   false when no iterations remain.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  /* Relaxed load is enough for the initial guess; the CAS below
     validates it.  end/incr/chunk_size are never written concurrently.  */
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Iterations left, honoring the loop direction (ws->mode nonzero
	 flags the decreasing form).  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      /* Guided rule: take a 1/nthreads share of the remainder, rounded
	 up, but never less than the minimum chunk size.  */
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;	/* Clamped share exceeds remainder: take it all.  */

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Lost the race; retry from the value another thread installed.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    345      1.1     mrg #endif /* HAVE_SYNC_BUILTINS */
    346