/* iter.c, revision 1.3 — libgomp work-share iteration routines.  */
      1 /* Copyright (C) 2005-2013 Free Software Foundation, Inc.
      2    Contributed by Richard Henderson <rth (at) redhat.com>.
      3 
      4    This file is part of the GNU OpenMP Library (libgomp).
      5 
      6    Libgomp is free software; you can redistribute it and/or modify it
      7    under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 3, or (at your option)
      9    any later version.
     10 
     11    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     12    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     13    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     14    more details.
     15 
     16    Under Section 7 of GPL version 3, you are granted additional
     17    permissions described in the GCC Runtime Library Exception, version
     18    3.1, as published by the Free Software Foundation.
     19 
     20    You should have received a copy of the GNU General Public License and
     21    a copy of the GCC Runtime Library Exception along with this program;
     22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23    <http://www.gnu.org/licenses/>.  */
     24 
     25 /* This file contains routines for managing work-share iteration, both
     26    for loops and sections.  */
     27 
     28 #include "libgomp.h"
     29 #include <stdlib.h>
     30 
     31 
     32 /* This function implements the STATIC scheduling method.  The caller should
     33    iterate *pstart <= x < *pend.  Return zero if there are more iterations
     34    to perform; nonzero if not.  Return less than 0 if this thread had
     35    received the absolutely last iteration.  */
     36 
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  /* static_trip == -1 records that this thread has already been handed
     the chunk containing the final iteration; nothing more to give out.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      /* Nonzero ("no more work") exactly when the range is empty.  */
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      /* One trip only: a previous call already returned this thread's
	 single block, so there is nothing left.  */
      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  Adding incr-1 (or
	 incr+1 for a negative step) before dividing makes the division
	 round the trip count up.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      /* The first T threads each take Q+1 iterations; the rest take Q.
	 For i < t, t is zeroed so s0 = (q+1)*i; for i >= t the extra
	 iterations of the first t threads show up as the "+ t" offset.  */
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      /* If our block ends at the last iteration, mark this thread as
	 having received the absolute end of the loop.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      /* Same round-up trip count computation as the unspecified case.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  static_trip counts how many
	 rounds of nthreads*c iterations have already been dealt.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      /* Either this chunk contained the final iteration (-1), or we
	 advance to the next round-robin trip.  */
      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
    140 
    141 
    142 /* This function implements the DYNAMIC scheduling method.  Arguments are
    143    as for gomp_iter_static_next.  This function must be called with ws->lock
    144    held.  */
    145 
    146 bool
    147 gomp_iter_dynamic_next_locked (long *pstart, long *pend)
    148 {
    149   struct gomp_thread *thr = gomp_thread ();
    150   struct gomp_work_share *ws = thr->ts.work_share;
    151   long start, end, chunk, left;
    152 
    153   start = ws->next;
    154   if (start == ws->end)
    155     return false;
    156 
    157   chunk = ws->chunk_size;
    158   left = ws->end - start;
    159   if (ws->incr < 0)
    160     {
    161       if (chunk < left)
    162 	chunk = left;
    163     }
    164   else
    165     {
    166       if (chunk > left)
    167 	chunk = left;
    168     }
    169   end = start + chunk;
    170 
    171   ws->next = end;
    172   *pstart = start;
    173   *pend = end;
    174   return true;
    175 }
    176 
    177 
    178 #ifdef HAVE_SYNC_BUILTINS
    179 /* Similar, but doesn't require the lock held, and uses compare-and-swap
    180    instead.  Note that the only memory value that changes is ws->next.  */
    181 
bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  /* end, incr and chunk_size are invariant for the lifetime of the
     work share, so they may be read without synchronization.  */
  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  /* Fast path: when ws->mode is set, claim a chunk with a single
     atomic fetch-and-add instead of a CAS retry loop.  NOTE(review):
     ws->next keeps advancing past end here, so presumably mode is only
     set when that cannot overflow — confirm against the loop setup.  */
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  /* Our claimed start is already past the end: exhausted.  */
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  /* Clip the final, partial chunk to the end of the range.  */
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Negative increment: the range counts downward, so all the
	     comparisons are mirrored.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: compare-and-swap loop.  Re-derive the chunk from the
     current ws->next each attempt so we never advance past end.  */
  start = ws->next;
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      /* Clamp the chunk to the remaining iterations; the comparison
	 direction flips for a negative increment (values negative).  */
      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      /* Success only if no other thread moved ws->next under us;
	 otherwise retry from the value the winner installed.  */
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    252 #endif /* HAVE_SYNC_BUILTINS */
    253 
    254 
    255 /* This function implements the GUIDED scheduling method.  Arguments are
    256    as for gomp_iter_static_next.  This function must be called with the
    257    work share lock held.  */
    258 
    259 bool
    260 gomp_iter_guided_next_locked (long *pstart, long *pend)
    261 {
    262   struct gomp_thread *thr = gomp_thread ();
    263   struct gomp_work_share *ws = thr->ts.work_share;
    264   struct gomp_team *team = thr->ts.team;
    265   unsigned long nthreads = team ? team->nthreads : 1;
    266   unsigned long n, q;
    267   long start, end;
    268 
    269   if (ws->next == ws->end)
    270     return false;
    271 
    272   start = ws->next;
    273   n = (ws->end - start) / ws->incr;
    274   q = (n + nthreads - 1) / nthreads;
    275 
    276   if (q < ws->chunk_size)
    277     q = ws->chunk_size;
    278   if (q <= n)
    279     end = start + q * ws->incr;
    280   else
    281     end = ws->end;
    282 
    283   ws->next = end;
    284   *pstart = start;
    285   *pend = end;
    286   return true;
    287 }
    288 
    289 #ifdef HAVE_SYNC_BUILTINS
    290 /* Similar, but doesn't require the lock held, and uses compare-and-swap
    291    instead.  Note that the only memory value that changes is ws->next.  */
    292 
bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  /* ws->next is the only field other threads mutate; end, incr and
     chunk_size stay fixed for the lifetime of the work share.  */
  start = ws->next;
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      /* Guided schedule: take about remaining/nthreads iterations,
	 rounded up, but never less than the requested chunk size.  */
      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      /* The minimum chunk may exceed what is left; clip to the end.  */
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      /* Claim [start, nend) only if ws->next is still the value we
	 based the computation on; otherwise recompute from the value
	 the winning thread installed and retry.  */
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
    337 #endif /* HAVE_SYNC_BUILTINS */
    338