/* iter_ull.c, revision 1.5.4.2 */
/* Copyright (C) 2005-2017 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth (at) redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this thread
   received the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
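      /* Worked example (illustrative, not part of the original source):
         with n = 10 iterations and nthreads = 4, q = 2 and t = 2, so
         threads 0 and 1 take 3 iterations each ([0,3) and [3,6)) while
         threads 2 and 3 take 2 each ([6,8) and [8,10)).  */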

      /* Notice when no iterations were allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C-sized chunk positioned static_trip * nthreads
         chunks in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
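      /* Worked example (illustrative, not part of the original source):
         with chunk size c = 4 and nthreads = 3, thread 1 claims zero-based
         iterations [4,8) on trip 0 and [16,20) on trip 1, since
         s0 = (static_trip * 3 + 1) * 4.  */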

      /* Detect when we have run off the end of the iteration space, and
         clamp a final partial chunk.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
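
/* Illustrative caller sketch (not from the original source; body and incr
   are placeholders for the user's loop).  Per the contract above, a thread
   drains its static chunks roughly like so for an upward loop:

       gomp_ull start, end, x;
       while (gomp_iter_ull_static_next (&start, &end) == 0)
         for (x = start; x < end; x += incr)
           body (x);

   A negative return means the thread just received the very last chunk.  */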


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
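  /* For a downward loop the increment is negative: ws->mode & 2 is set
     and, in unsigned arithmetic, chunk and left are two's-complement
     images of negative distances, so the sense of the comparisons below
     is inverted.  */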
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
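
/* Illustrative use (a sketch, not from the original source): callers that
   lack the lock-free path below wrap this in the work-share lock, roughly:

       gomp_mutex_lock (&ws->lock);
       ret = gomp_iter_ull_dynamic_next_locked (&start, &end);
       gomp_mutex_unlock (&ws->lock);

   so the read-modify-write of ws->next_ull stays atomic.  */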


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

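  /* Bit 0 of ws->mode is set at work-share setup when next_ull + chunk
     cannot wrap around, so a chunk can be claimed with a single
     fetch-and-add instead of the compare-and-swap loop further below.  */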
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
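
/* Worked example (illustrative, not part of the original source): with 100
   iterations left, 4 threads, chunk_size 7 and increment 1, the first grab
   takes q = ceil(100/4) = 25 iterations; a later grab with 75 left takes
   ceil(75/4) = 19, and so on, the grabs shrinking until the ceiling falls
   below the chunk size, after which every grab takes 7.  */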

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;
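
  /* end, incr and chunk_size are loop-invariant and, as noted above,
     ws->next_ull is the only value other threads change, so everything
     else is read once before entering the compare-and-swap retry loop.  */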

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */