/* iter.c — libgomp work-share iteration (revision 1.6).  */
      1 /* Copyright (C) 2005-2016 Free Software Foundation, Inc.
      2    Contributed by Richard Henderson <rth (at) redhat.com>.
      3 
      4    This file is part of the GNU Offloading and Multi Processing Library
      5    (libgomp).
      6 
      7    Libgomp is free software; you can redistribute it and/or modify it
      8    under the terms of the GNU General Public License as published by
      9    the Free Software Foundation; either version 3, or (at your option)
     10    any later version.
     11 
     12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     15    more details.
     16 
     17    Under Section 7 of GPL version 3, you are granted additional
     18    permissions described in the GCC Runtime Library Exception, version
     19    3.1, as published by the Free Software Foundation.
     20 
     21    You should have received a copy of the GNU General Public License and
     22    a copy of the GCC Runtime Library Exception along with this program;
     23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24    <http://www.gnu.org/licenses/>.  */
     25 
     26 /* This file contains routines for managing work-share iteration, both
     27    for loops and sections.  */
     28 
     29 #include "libgomp.h"
     30 #include <stdlib.h>
     31 
     32 
     33 /* This function implements the STATIC scheduling method.  The caller should
     34    iterate *pstart <= x < *pend.  Return zero if there are more iterations
     35    to perform; nonzero if not.  Return less than 0 if this thread had
     36    received the absolutely last iteration.  */
     37 
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  /* Orphaned constructs have no team; treat them as a team of one.  */
  unsigned long nthreads = team ? team->nthreads : 1;

  /* static_trip == -1 records that a previous call already handed this
     thread the absolutely last iteration; nothing more to give out.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      /* Nonzero (no work) exactly when the range is empty.  */
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      /* Each thread gets a single span, so a second trip means done.  */
      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  The bias S makes the
	 truncating division round up in magnitude, i.e. for an upward
	 loop n = ceil((end - next) / incr), and symmetrically for a
	 downward loop.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  Iterations
	 are spread as evenly as possible: the first n % nthreads
	 threads each receive one extra iteration.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      /* Remember whether this span contained the very last iteration.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      /* Total iteration count, rounded up as in the branch above.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  static_trip counts this
	 thread's completed trips, giving round-robin chunk assignment.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      /* If this chunk reaches the end, signal the final trip.  */
      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
    141 
    142 
    143 /* This function implements the DYNAMIC scheduling method.  Arguments are
    144    as for gomp_iter_static_next.  This function must be called with ws->lock
    145    held.  */
    146 
    147 bool
    148 gomp_iter_dynamic_next_locked (long *pstart, long *pend)
    149 {
    150   struct gomp_thread *thr = gomp_thread ();
    151   struct gomp_work_share *ws = thr->ts.work_share;
    152   long start, end, chunk, left;
    153 
    154   start = ws->next;
    155   if (start == ws->end)
    156     return false;
    157 
    158   chunk = ws->chunk_size;
    159   left = ws->end - start;
    160   if (ws->incr < 0)
    161     {
    162       if (chunk < left)
    163 	chunk = left;
    164     }
    165   else
    166     {
    167       if (chunk > left)
    168 	chunk = left;
    169     }
    170   end = start + chunk;
    171 
    172   ws->next = end;
    173   *pstart = start;
    174   *pend = end;
    175   return true;
    176 }
    177 
    178 
    179 #ifdef HAVE_SYNC_BUILTINS
    180 /* Similar, but doesn't require the lock held, and uses compare-and-swap
    181    instead.  Note that the only memory value that changes is ws->next.  */
    182 
    183 bool
    184 gomp_iter_dynamic_next (long *pstart, long *pend)
    185 {
    186   struct gomp_thread *thr = gomp_thread ();
    187   struct gomp_work_share *ws = thr->ts.work_share;
    188   long start, end, nend, chunk, incr;
    189 
    190   end = ws->end;
    191   incr = ws->incr;
    192   chunk = ws->chunk_size;
    193 
    194   if (__builtin_expect (ws->mode, 1))
    195     {
    196       long tmp = __sync_fetch_and_add (&ws->next, chunk);
    197       if (incr > 0)
    198 	{
    199 	  if (tmp >= end)
    200 	    return false;
    201 	  nend = tmp + chunk;
    202 	  if (nend > end)
    203 	    nend = end;
    204 	  *pstart = tmp;
    205 	  *pend = nend;
    206 	  return true;
    207 	}
    208       else
    209 	{
    210 	  if (tmp <= end)
    211 	    return false;
    212 	  nend = tmp + chunk;
    213 	  if (nend < end)
    214 	    nend = end;
    215 	  *pstart = tmp;
    216 	  *pend = nend;
    217 	  return true;
    218 	}
    219     }
    220 
    221   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
    222   while (1)
    223     {
    224       long left = end - start;
    225       long tmp;
    226 
    227       if (start == end)
    228 	return false;
    229 
    230       if (incr < 0)
    231 	{
    232 	  if (chunk < left)
    233 	    chunk = left;
    234 	}
    235       else
    236 	{
    237 	  if (chunk > left)
    238 	    chunk = left;
    239 	}
    240       nend = start + chunk;
    241 
    242       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
    243       if (__builtin_expect (tmp == start, 1))
    244 	break;
    245 
    246       start = tmp;
    247     }
    248 
    249   *pstart = start;
    250   *pend = nend;
    251   return true;
    252 }
    253 #endif /* HAVE_SYNC_BUILTINS */
    254 
    255 
    256 /* This function implements the GUIDED scheduling method.  Arguments are
    257    as for gomp_iter_static_next.  This function must be called with the
    258    work share lock held.  */
    259 
    260 bool
    261 gomp_iter_guided_next_locked (long *pstart, long *pend)
    262 {
    263   struct gomp_thread *thr = gomp_thread ();
    264   struct gomp_work_share *ws = thr->ts.work_share;
    265   struct gomp_team *team = thr->ts.team;
    266   unsigned long nthreads = team ? team->nthreads : 1;
    267   unsigned long n, q;
    268   long start, end;
    269 
    270   if (ws->next == ws->end)
    271     return false;
    272 
    273   start = ws->next;
    274   n = (ws->end - start) / ws->incr;
    275   q = (n + nthreads - 1) / nthreads;
    276 
    277   if (q < ws->chunk_size)
    278     q = ws->chunk_size;
    279   if (q <= n)
    280     end = start + q * ws->incr;
    281   else
    282     end = ws->end;
    283 
    284   ws->next = end;
    285   *pstart = start;
    286   *pend = end;
    287   return true;
    288 }
    289 
    290 #ifdef HAVE_SYNC_BUILTINS
    291 /* Similar, but doesn't require the lock held, and uses compare-and-swap
    292    instead.  Note that the only memory value that changes is ws->next.  */
    293 
    294 bool
    295 gomp_iter_guided_next (long *pstart, long *pend)
    296 {
    297   struct gomp_thread *thr = gomp_thread ();
    298   struct gomp_work_share *ws = thr->ts.work_share;
    299   struct gomp_team *team = thr->ts.team;
    300   unsigned long nthreads = team ? team->nthreads : 1;
    301   long start, end, nend, incr;
    302   unsigned long chunk_size;
    303 
    304   start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
    305   end = ws->end;
    306   incr = ws->incr;
    307   chunk_size = ws->chunk_size;
    308 
    309   while (1)
    310     {
    311       unsigned long n, q;
    312       long tmp;
    313 
    314       if (start == end)
    315 	return false;
    316 
    317       n = (end - start) / incr;
    318       q = (n + nthreads - 1) / nthreads;
    319 
    320       if (q < chunk_size)
    321 	q = chunk_size;
    322       if (__builtin_expect (q <= n, 1))
    323 	nend = start + q * incr;
    324       else
    325 	nend = end;
    326 
    327       tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
    328       if (__builtin_expect (tmp == start, 1))
    329 	break;
    330 
    331       start = tmp;
    332     }
    333 
    334   *pstart = start;
    335   *pend = nend;
    336   return true;
    337 }
    338 #endif /* HAVE_SYNC_BUILTINS */
    339