Home | History | Annotate | Line # | Download | only in libgomp
iter_ull.c revision 1.5.4.2
      1  1.5.4.2  martin /* Copyright (C) 2005-2017 Free Software Foundation, Inc.
      2      1.1     mrg    Contributed by Richard Henderson <rth (at) redhat.com>.
      3      1.1     mrg 
      4      1.5     mrg    This file is part of the GNU Offloading and Multi Processing Library
      5      1.5     mrg    (libgomp).
      6      1.1     mrg 
      7      1.1     mrg    Libgomp is free software; you can redistribute it and/or modify it
      8      1.1     mrg    under the terms of the GNU General Public License as published by
      9      1.1     mrg    the Free Software Foundation; either version 3, or (at your option)
     10      1.1     mrg    any later version.
     11      1.1     mrg 
     12      1.1     mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     13      1.1     mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     14      1.1     mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     15      1.1     mrg    more details.
     16      1.1     mrg 
     17      1.1     mrg    Under Section 7 of GPL version 3, you are granted additional
     18      1.1     mrg    permissions described in the GCC Runtime Library Exception, version
     19      1.1     mrg    3.1, as published by the Free Software Foundation.
     20      1.1     mrg 
     21      1.1     mrg    You should have received a copy of the GNU General Public License and
     22      1.1     mrg    a copy of the GCC Runtime Library Exception along with this program;
     23      1.1     mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24      1.1     mrg    <http://www.gnu.org/licenses/>.  */
     25      1.1     mrg 
     26      1.1     mrg /* This file contains routines for managing work-share iteration, both
     27      1.1     mrg    for loops and sections.  */
     28      1.1     mrg 
     29      1.1     mrg #include "libgomp.h"
     30      1.1     mrg #include <stdlib.h>
     31      1.1     mrg 
/* Shorthand for the 64-bit unsigned iteration type used by all the
   _ull work-share iterators in this file.  */
typedef unsigned long long gomp_ull;
     33      1.1     mrg 
     34      1.1     mrg /* This function implements the STATIC scheduling method.  The caller should
     35      1.1     mrg    iterate *pstart <= x < *pend.  Return zero if there are more iterations
     36      1.1     mrg    to perform; nonzero if not.  Return less than 0 if this thread had
     37      1.1     mrg    received the absolutely last iteration.  */
     38      1.1     mrg 
     39      1.1     mrg int
     40      1.1     mrg gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
     41      1.1     mrg {
     42      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
     43      1.1     mrg   struct gomp_team *team = thr->ts.team;
     44      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
     45      1.1     mrg   unsigned long nthreads = team ? team->nthreads : 1;
     46      1.1     mrg 
     47      1.1     mrg   if (thr->ts.static_trip == -1)
     48      1.1     mrg     return -1;
     49      1.1     mrg 
     50      1.1     mrg   /* Quick test for degenerate teams and orphaned constructs.  */
     51      1.1     mrg   if (nthreads == 1)
     52      1.1     mrg     {
     53      1.1     mrg       *pstart = ws->next_ull;
     54      1.1     mrg       *pend = ws->end_ull;
     55      1.1     mrg       thr->ts.static_trip = -1;
     56      1.1     mrg       return ws->next_ull == ws->end_ull;
     57      1.1     mrg     }
     58      1.1     mrg 
     59      1.1     mrg   /* We interpret chunk_size zero as "unspecified", which means that we
     60      1.1     mrg      should break up the iterations such that each thread makes only one
     61      1.1     mrg      trip through the outer loop.  */
     62      1.1     mrg   if (ws->chunk_size_ull == 0)
     63      1.1     mrg     {
     64      1.3     mrg       gomp_ull n, q, i, t, s0, e0, s, e;
     65      1.1     mrg 
     66      1.1     mrg       if (thr->ts.static_trip > 0)
     67      1.1     mrg 	return 1;
     68      1.1     mrg 
     69      1.1     mrg       /* Compute the total number of iterations.  */
     70      1.1     mrg       if (__builtin_expect (ws->mode, 0) == 0)
     71      1.1     mrg 	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
     72      1.1     mrg       else
     73      1.1     mrg 	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
     74      1.1     mrg       i = thr->ts.team_id;
     75      1.1     mrg 
     76      1.1     mrg       /* Compute the "zero-based" start and end points.  That is, as
     77      1.1     mrg 	 if the loop began at zero and incremented by one.  */
     78      1.1     mrg       q = n / nthreads;
     79      1.3     mrg       t = n % nthreads;
     80      1.3     mrg       if (i < t)
     81      1.3     mrg 	{
     82      1.3     mrg 	  t = 0;
     83      1.3     mrg 	  q++;
     84      1.3     mrg 	}
     85      1.3     mrg       s0 = q * i + t;
     86      1.1     mrg       e0 = s0 + q;
     87      1.1     mrg 
     88      1.1     mrg       /* Notice when no iterations allocated for this thread.  */
     89      1.1     mrg       if (s0 >= e0)
     90      1.1     mrg 	{
     91      1.1     mrg 	  thr->ts.static_trip = 1;
     92      1.1     mrg 	  return 1;
     93      1.1     mrg 	}
     94      1.1     mrg 
     95      1.1     mrg       /* Transform these to the actual start and end numbers.  */
     96      1.1     mrg       s = s0 * ws->incr_ull + ws->next_ull;
     97      1.1     mrg       e = e0 * ws->incr_ull + ws->next_ull;
     98      1.1     mrg 
     99      1.1     mrg       *pstart = s;
    100      1.1     mrg       *pend = e;
    101      1.1     mrg       thr->ts.static_trip = (e0 == n ? -1 : 1);
    102      1.1     mrg       return 0;
    103      1.1     mrg     }
    104      1.1     mrg   else
    105      1.1     mrg     {
    106      1.1     mrg       gomp_ull n, s0, e0, i, c, s, e;
    107      1.1     mrg 
    108      1.1     mrg       /* Otherwise, each thread gets exactly chunk_size iterations
    109      1.1     mrg 	 (if available) each time through the loop.  */
    110      1.1     mrg 
    111      1.1     mrg       if (__builtin_expect (ws->mode, 0) == 0)
    112      1.1     mrg 	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
    113      1.1     mrg       else
    114      1.1     mrg 	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
    115      1.1     mrg       i = thr->ts.team_id;
    116      1.1     mrg       c = ws->chunk_size_ull;
    117      1.1     mrg 
    118      1.1     mrg       /* Initial guess is a C sized chunk positioned nthreads iterations
    119      1.1     mrg 	 in, offset by our thread number.  */
    120      1.1     mrg       s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
    121      1.1     mrg       e0 = s0 + c;
    122      1.1     mrg 
    123      1.1     mrg       /* Detect overflow.  */
    124      1.1     mrg       if (s0 >= n)
    125      1.1     mrg 	return 1;
    126      1.1     mrg       if (e0 > n)
    127      1.1     mrg 	e0 = n;
    128      1.1     mrg 
    129      1.1     mrg       /* Transform these to the actual start and end numbers.  */
    130      1.1     mrg       s = s0 * ws->incr_ull + ws->next_ull;
    131      1.1     mrg       e = e0 * ws->incr_ull + ws->next_ull;
    132      1.1     mrg 
    133      1.1     mrg       *pstart = s;
    134      1.1     mrg       *pend = e;
    135      1.1     mrg 
    136      1.1     mrg       if (e0 == n)
    137      1.1     mrg 	thr->ts.static_trip = -1;
    138      1.1     mrg       else
    139      1.1     mrg 	thr->ts.static_trip++;
    140      1.1     mrg       return 0;
    141      1.1     mrg     }
    142      1.1     mrg }
    143      1.1     mrg 
    144      1.1     mrg 
    145      1.1     mrg /* This function implements the DYNAMIC scheduling method.  Arguments are
    146      1.1     mrg    as for gomp_iter_ull_static_next.  This function must be called with
    147      1.1     mrg    ws->lock held.  */
    148      1.1     mrg 
    149      1.1     mrg bool
    150      1.1     mrg gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
    151      1.1     mrg {
    152      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
    153      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
    154      1.1     mrg   gomp_ull start, end, chunk, left;
    155      1.1     mrg 
    156      1.1     mrg   start = ws->next_ull;
    157      1.1     mrg   if (start == ws->end_ull)
    158      1.1     mrg     return false;
    159      1.1     mrg 
    160      1.1     mrg   chunk = ws->chunk_size_ull;
    161      1.1     mrg   left = ws->end_ull - start;
    162      1.1     mrg   if (__builtin_expect (ws->mode & 2, 0))
    163      1.1     mrg     {
    164      1.1     mrg       if (chunk < left)
    165      1.1     mrg 	chunk = left;
    166      1.1     mrg     }
    167      1.1     mrg   else
    168      1.1     mrg     {
    169      1.1     mrg       if (chunk > left)
    170      1.1     mrg 	chunk = left;
    171      1.1     mrg     }
    172      1.1     mrg   end = start + chunk;
    173      1.1     mrg 
    174      1.1     mrg   ws->next_ull = end;
    175      1.1     mrg   *pstart = start;
    176      1.1     mrg   *pend = end;
    177      1.1     mrg   return true;
    178      1.1     mrg }
    179      1.1     mrg 
    180      1.1     mrg 
    181      1.1     mrg #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.
   Returns true and stores the claimed range in *pstart/*pend, or returns
   false when no iterations remain.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  /* end and chunk are never written by any thread, so plain loads
     suffice; only ws->next_ull is contended.  */
  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  /* Fast path, gated on ws->mode bit 0: claim a chunk with a single
     atomic fetch-and-add.  NOTE(review): presumably the loop setup
     guarantees next_ull cannot wrap past the representable range under
     this mode, making the unconditional add safe — confirm against the
     gomp_loop_ull init code.  */
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      /* Bit 2 clear: increasing loop; iterate start up toward end.  */
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;		/* Clamp the final, partial chunk.  */
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Bit 2 set: decreasing loop; all comparisons reversed.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;		/* Clamp the final, partial chunk.  */
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: compare-and-swap retry loop.  A relaxed load is enough
     for the initial guess; the CAS below validates it.  */
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Clamp the chunk to what remains; comparison direction depends
	 on the loop direction (mode bit 2).  */
      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Lost the race; retry from the value another thread installed.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    254      1.1     mrg #endif /* HAVE_SYNC_BUILTINS */
    255      1.1     mrg 
    256      1.1     mrg 
    257      1.1     mrg /* This function implements the GUIDED scheduling method.  Arguments are
    258      1.1     mrg    as for gomp_iter_ull_static_next.  This function must be called with the
    259      1.1     mrg    work share lock held.  */
    260      1.1     mrg 
    261      1.1     mrg bool
    262      1.1     mrg gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
    263      1.1     mrg {
    264      1.1     mrg   struct gomp_thread *thr = gomp_thread ();
    265      1.1     mrg   struct gomp_work_share *ws = thr->ts.work_share;
    266      1.1     mrg   struct gomp_team *team = thr->ts.team;
    267      1.1     mrg   gomp_ull nthreads = team ? team->nthreads : 1;
    268      1.1     mrg   gomp_ull n, q;
    269      1.1     mrg   gomp_ull start, end;
    270      1.1     mrg 
    271      1.1     mrg   if (ws->next_ull == ws->end_ull)
    272      1.1     mrg     return false;
    273      1.1     mrg 
    274      1.1     mrg   start = ws->next_ull;
    275      1.1     mrg   if (__builtin_expect (ws->mode, 0) == 0)
    276      1.1     mrg     n = (ws->end_ull - start) / ws->incr_ull;
    277      1.1     mrg   else
    278      1.1     mrg     n = (start - ws->end_ull) / -ws->incr_ull;
    279      1.1     mrg   q = (n + nthreads - 1) / nthreads;
    280      1.1     mrg 
    281      1.1     mrg   if (q < ws->chunk_size_ull)
    282      1.1     mrg     q = ws->chunk_size_ull;
    283      1.1     mrg   if (q <= n)
    284      1.1     mrg     end = start + q * ws->incr_ull;
    285      1.1     mrg   else
    286      1.1     mrg     end = ws->end_ull;
    287      1.1     mrg 
    288      1.1     mrg   ws->next_ull = end;
    289      1.1     mrg   *pstart = start;
    290      1.1     mrg   *pend = end;
    291      1.1     mrg   return true;
    292      1.1     mrg }
    293      1.1     mrg 
    294      1.1     mrg #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.
   Returns true and stores the claimed range in *pstart/*pend, or returns
   false when no iterations remain.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  /* Relaxed load is enough for the initial guess; the CAS below
     validates it.  end/incr/chunk_size are never written concurrently.  */
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Iterations left, honoring the loop direction (ws->mode nonzero
	 flags the decreasing form).  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      /* Guided rule: take a 1/nthreads share of the remainder, rounded
	 up, but never less than the minimum chunk size.  */
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;	/* Clamped share exceeds remainder: take it all.  */

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      /* Lost the race; retry from the value another thread installed.  */
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    345      1.1     mrg #endif /* HAVE_SYNC_BUILTINS */
    346