Home | History | Annotate | Line # | Download | only in libgomp
task.c revision 1.5
      1 /* Copyright (C) 2007-2015 Free Software Foundation, Inc.
      2    Contributed by Richard Henderson <rth (at) redhat.com>.
      3 
      4    This file is part of the GNU Offloading and Multi Processing Library
      5    (libgomp).
      6 
      7    Libgomp is free software; you can redistribute it and/or modify it
      8    under the terms of the GNU General Public License as published by
      9    the Free Software Foundation; either version 3, or (at your option)
     10    any later version.
     11 
     12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     15    more details.
     16 
     17    Under Section 7 of GPL version 3, you are granted additional
     18    permissions described in the GCC Runtime Library Exception, version
     19    3.1, as published by the Free Software Foundation.
     20 
     21    You should have received a copy of the GNU General Public License and
     22    a copy of the GCC Runtime Library Exception along with this program;
     23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24    <http://www.gnu.org/licenses/>.  */
     25 
     26 /* This file handles the maintainence of tasks in response to task
     27    creation and termination.  */
     28 
     29 #include "libgomp.h"
     30 #include <stdlib.h>
     31 #include <string.h>
     32 
     33 typedef struct gomp_task_depend_entry *hash_entry_type;
     34 
     35 static inline void *
     36 htab_alloc (size_t size)
     37 {
     38   return gomp_malloc (size);
     39 }
     40 
     41 static inline void
     42 htab_free (void *ptr)
     43 {
     44   free (ptr);
     45 }
     46 
     47 #include "hashtab.h"
     48 
     49 static inline hashval_t
     50 htab_hash (hash_entry_type element)
     51 {
     52   return hash_pointer (element->addr);
     53 }
     54 
     55 static inline bool
     56 htab_eq (hash_entry_type x, hash_entry_type y)
     57 {
     58   return x->addr == y->addr;
     59 }
     60 
     61 /* Create a new task data structure.  */
     62 
     63 void
     64 gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
     65 		struct gomp_task_icv *prev_icv)
     66 {
     67   task->parent = parent_task;
     68   task->icv = *prev_icv;
     69   task->kind = GOMP_TASK_IMPLICIT;
     70   task->taskwait = NULL;
     71   task->in_tied_task = false;
     72   task->final_task = false;
     73   task->copy_ctors_done = false;
     74   task->parent_depends_on = false;
     75   task->children = NULL;
     76   task->taskgroup = NULL;
     77   task->dependers = NULL;
     78   task->depend_hash = NULL;
     79   task->depend_count = 0;
     80 }
     81 
     82 /* Clean up a task, after completing it.  */
     83 
     84 void
     85 gomp_end_task (void)
     86 {
     87   struct gomp_thread *thr = gomp_thread ();
     88   struct gomp_task *task = thr->task;
     89 
     90   gomp_finish_task (task);
     91   thr->task = task->parent;
     92 }
     93 
     94 static inline void
     95 gomp_clear_parent (struct gomp_task *children)
     96 {
     97   struct gomp_task *task = children;
     98 
     99   if (task)
    100     do
    101       {
    102 	task->parent = NULL;
    103 	task = task->next_child;
    104       }
    105     while (task != children);
    106 }
    107 
    108 static void gomp_task_maybe_wait_for_dependencies (void **depend);
    109 
    110 /* Called when encountering an explicit task directive.  If IF_CLAUSE is
    111    false, then we must not delay in executing the task.  If UNTIED is true,
    112    then the task may be executed by any member of the team.  */
    113 
    114 void
    115 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
    116 	   long arg_size, long arg_align, bool if_clause, unsigned flags,
    117 	   void **depend)
    118 {
    119   struct gomp_thread *thr = gomp_thread ();
    120   struct gomp_team *team = thr->ts.team;
    121 
    122 #ifdef HAVE_BROKEN_POSIX_SEMAPHORES
    123   /* If pthread_mutex_* is used for omp_*lock*, then each task must be
    124      tied to one thread all the time.  This means UNTIED tasks must be
    125      tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN
    126      might be running on different thread than FN.  */
    127   if (cpyfn)
    128     if_clause = false;
    129   if (flags & 1)
    130     flags &= ~1;
    131 #endif
    132 
    133   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
    134   if (team
    135       && (gomp_team_barrier_cancelled (&team->barrier)
    136 	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
    137     return;
    138 
    139   if (!if_clause || team == NULL
    140       || (thr->task && thr->task->final_task)
    141       || team->task_count > 64 * team->nthreads)
    142     {
    143       struct gomp_task task;
    144 
    145       /* If there are depend clauses and earlier deferred sibling tasks
    146 	 with depend clauses, check if there isn't a dependency.  If there
    147 	 is, we need to wait for them.  There is no need to handle
    148 	 depend clauses for non-deferred tasks other than this, because
    149 	 the parent task is suspended until the child task finishes and thus
    150 	 it can't start further child tasks.  */
    151       if ((flags & 8) && thr->task && thr->task->depend_hash)
    152 	gomp_task_maybe_wait_for_dependencies (depend);
    153 
    154       gomp_init_task (&task, thr->task, gomp_icv (false));
    155       task.kind = GOMP_TASK_IFFALSE;
    156       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
    157       if (thr->task)
    158 	{
    159 	  task.in_tied_task = thr->task->in_tied_task;
    160 	  task.taskgroup = thr->task->taskgroup;
    161 	}
    162       thr->task = &task;
    163       if (__builtin_expect (cpyfn != NULL, 0))
    164 	{
    165 	  char buf[arg_size + arg_align - 1];
    166 	  char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
    167 				& ~(uintptr_t) (arg_align - 1));
    168 	  cpyfn (arg, data);
    169 	  fn (arg);
    170 	}
    171       else
    172 	fn (data);
    173       /* Access to "children" is normally done inside a task_lock
    174 	 mutex region, but the only way this particular task.children
    175 	 can be set is if this thread's task work function (fn)
    176 	 creates children.  So since the setter is *this* thread, we
    177 	 need no barriers here when testing for non-NULL.  We can have
    178 	 task.children set by the current thread then changed by a
    179 	 child thread, but seeing a stale non-NULL value is not a
    180 	 problem.  Once past the task_lock acquisition, this thread
    181 	 will see the real value of task.children.  */
    182       if (task.children != NULL)
    183 	{
    184 	  gomp_mutex_lock (&team->task_lock);
    185 	  gomp_clear_parent (task.children);
    186 	  gomp_mutex_unlock (&team->task_lock);
    187 	}
    188       gomp_end_task ();
    189     }
    190   else
    191     {
    192       struct gomp_task *task;
    193       struct gomp_task *parent = thr->task;
    194       struct gomp_taskgroup *taskgroup = parent->taskgroup;
    195       char *arg;
    196       bool do_wake;
    197       size_t depend_size = 0;
    198 
    199       if (flags & 8)
    200 	depend_size = ((uintptr_t) depend[0]
    201 		       * sizeof (struct gomp_task_depend_entry));
    202       task = gomp_malloc (sizeof (*task) + depend_size
    203 			  + arg_size + arg_align - 1);
    204       arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
    205 		      & ~(uintptr_t) (arg_align - 1));
    206       gomp_init_task (task, parent, gomp_icv (false));
    207       task->kind = GOMP_TASK_IFFALSE;
    208       task->in_tied_task = parent->in_tied_task;
    209       task->taskgroup = taskgroup;
    210       thr->task = task;
    211       if (cpyfn)
    212 	{
    213 	  cpyfn (arg, data);
    214 	  task->copy_ctors_done = true;
    215 	}
    216       else
    217 	memcpy (arg, data, arg_size);
    218       thr->task = parent;
    219       task->kind = GOMP_TASK_WAITING;
    220       task->fn = fn;
    221       task->fn_data = arg;
    222       task->final_task = (flags & 2) >> 1;
    223       gomp_mutex_lock (&team->task_lock);
    224       /* If parallel or taskgroup has been cancelled, don't start new
    225 	 tasks.  */
    226       if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
    227 			     || (taskgroup && taskgroup->cancelled))
    228 			    && !task->copy_ctors_done, 0))
    229 	{
    230 	  gomp_mutex_unlock (&team->task_lock);
    231 	  gomp_finish_task (task);
    232 	  free (task);
    233 	  return;
    234 	}
    235       if (taskgroup)
    236 	taskgroup->num_children++;
    237       if (depend_size)
    238 	{
    239 	  size_t ndepend = (uintptr_t) depend[0];
    240 	  size_t nout = (uintptr_t) depend[1];
    241 	  size_t i;
    242 	  hash_entry_type ent;
    243 
    244 	  task->depend_count = ndepend;
    245 	  task->num_dependees = 0;
    246 	  if (parent->depend_hash == NULL)
    247 	    parent->depend_hash
    248 	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
    249 	  for (i = 0; i < ndepend; i++)
    250 	    {
    251 	      task->depend[i].addr = depend[2 + i];
    252 	      task->depend[i].next = NULL;
    253 	      task->depend[i].prev = NULL;
    254 	      task->depend[i].task = task;
    255 	      task->depend[i].is_in = i >= nout;
    256 	      task->depend[i].redundant = false;
    257 	      task->depend[i].redundant_out = false;
    258 
    259 	      hash_entry_type *slot
    260 		= htab_find_slot (&parent->depend_hash, &task->depend[i],
    261 				  INSERT);
    262 	      hash_entry_type out = NULL, last = NULL;
    263 	      if (*slot)
    264 		{
    265 		  /* If multiple depends on the same task are the
    266 		     same, all but the first one are redundant.
    267 		     As inout/out come first, if any of them is
    268 		     inout/out, it will win, which is the right
    269 		     semantics.  */
    270 		  if ((*slot)->task == task)
    271 		    {
    272 		      task->depend[i].redundant = true;
    273 		      continue;
    274 		    }
    275 		  for (ent = *slot; ent; ent = ent->next)
    276 		    {
    277 		      if (ent->redundant_out)
    278 			break;
    279 
    280 		      last = ent;
    281 
    282 		      /* depend(in:...) doesn't depend on earlier
    283 			 depend(in:...).  */
    284 		      if (i >= nout && ent->is_in)
    285 			continue;
    286 
    287 		      if (!ent->is_in)
    288 			out = ent;
    289 
    290 		      struct gomp_task *tsk = ent->task;
    291 		      if (tsk->dependers == NULL)
    292 			{
    293 			  tsk->dependers
    294 			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
    295 					   + 6 * sizeof (struct gomp_task *));
    296 			  tsk->dependers->n_elem = 1;
    297 			  tsk->dependers->allocated = 6;
    298 			  tsk->dependers->elem[0] = task;
    299 			  task->num_dependees++;
    300 			  continue;
    301 			}
    302 		      /* We already have some other dependency on tsk
    303 			 from earlier depend clause.  */
    304 		      else if (tsk->dependers->n_elem
    305 			       && (tsk->dependers->elem[tsk->dependers->n_elem
    306 							- 1]
    307 				   == task))
    308 			continue;
    309 		      else if (tsk->dependers->n_elem
    310 			       == tsk->dependers->allocated)
    311 			{
    312 			  tsk->dependers->allocated
    313 			    = tsk->dependers->allocated * 2 + 2;
    314 			  tsk->dependers
    315 			    = gomp_realloc (tsk->dependers,
    316 					    sizeof (struct gomp_dependers_vec)
    317 					    + (tsk->dependers->allocated
    318 					       * sizeof (struct gomp_task *)));
    319 			}
    320 		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
    321 		      task->num_dependees++;
    322 		    }
    323 		  task->depend[i].next = *slot;
    324 		  (*slot)->prev = &task->depend[i];
    325 		}
    326 	      *slot = &task->depend[i];
    327 
    328 	      /* There is no need to store more than one depend({,in}out:)
    329 		 task per address in the hash table chain for the purpose
    330 		 of creation of deferred tasks, because each out
    331 		 depends on all earlier outs, thus it is enough to record
    332 		 just the last depend({,in}out:).  For depend(in:), we need
    333 		 to keep all of the previous ones not terminated yet, because
    334 		 a later depend({,in}out:) might need to depend on all of
    335 		 them.  So, if the new task's clause is depend({,in}out:),
    336 		 we know there is at most one other depend({,in}out:) clause
    337 		 in the list (out).  For non-deferred tasks we want to see
    338 		 all outs, so they are moved to the end of the chain,
    339 		 after first redundant_out entry all following entries
    340 		 should be redundant_out.  */
    341 	      if (!task->depend[i].is_in && out)
    342 		{
    343 		  if (out != last)
    344 		    {
    345 		      out->next->prev = out->prev;
    346 		      out->prev->next = out->next;
    347 		      out->next = last->next;
    348 		      out->prev = last;
    349 		      last->next = out;
    350 		      if (out->next)
    351 			out->next->prev = out;
    352 		    }
    353 		  out->redundant_out = true;
    354 		}
    355 	    }
    356 	  if (task->num_dependees)
    357 	    {
    358 	      gomp_mutex_unlock (&team->task_lock);
    359 	      return;
    360 	    }
    361 	}
    362       if (parent->children)
    363 	{
    364 	  task->next_child = parent->children;
    365 	  task->prev_child = parent->children->prev_child;
    366 	  task->next_child->prev_child = task;
    367 	  task->prev_child->next_child = task;
    368 	}
    369       else
    370 	{
    371 	  task->next_child = task;
    372 	  task->prev_child = task;
    373 	}
    374       parent->children = task;
    375       if (taskgroup)
    376 	{
    377 	  if (taskgroup->children)
    378 	    {
    379 	      task->next_taskgroup = taskgroup->children;
    380 	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
    381 	      task->next_taskgroup->prev_taskgroup = task;
    382 	      task->prev_taskgroup->next_taskgroup = task;
    383 	    }
    384 	  else
    385 	    {
    386 	      task->next_taskgroup = task;
    387 	      task->prev_taskgroup = task;
    388 	    }
    389 	  taskgroup->children = task;
    390 	}
    391       if (team->task_queue)
    392 	{
    393 	  task->next_queue = team->task_queue;
    394 	  task->prev_queue = team->task_queue->prev_queue;
    395 	  task->next_queue->prev_queue = task;
    396 	  task->prev_queue->next_queue = task;
    397 	}
    398       else
    399 	{
    400 	  task->next_queue = task;
    401 	  task->prev_queue = task;
    402 	  team->task_queue = task;
    403 	}
    404       ++team->task_count;
    405       ++team->task_queued_count;
    406       gomp_team_barrier_set_task_pending (&team->barrier);
    407       do_wake = team->task_running_count + !parent->in_tied_task
    408 		< team->nthreads;
    409       gomp_mutex_unlock (&team->task_lock);
    410       if (do_wake)
    411 	gomp_team_barrier_wake (&team->barrier, 1);
    412     }
    413 }
    414 
    415 static inline bool
    416 gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
    417 		   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
    418 {
    419   if (parent)
    420     {
    421       if (parent->children == child_task)
    422 	parent->children = child_task->next_child;
    423       if (__builtin_expect (child_task->parent_depends_on, 0)
    424 	  && parent->taskwait->last_parent_depends_on == child_task)
    425 	{
    426 	  if (child_task->prev_child->kind == GOMP_TASK_WAITING
    427 	      && child_task->prev_child->parent_depends_on)
    428 	    parent->taskwait->last_parent_depends_on = child_task->prev_child;
    429 	  else
    430 	    parent->taskwait->last_parent_depends_on = NULL;
    431 	}
    432     }
    433   if (taskgroup && taskgroup->children == child_task)
    434     taskgroup->children = child_task->next_taskgroup;
    435   child_task->prev_queue->next_queue = child_task->next_queue;
    436   child_task->next_queue->prev_queue = child_task->prev_queue;
    437   if (team->task_queue == child_task)
    438     {
    439       if (child_task->next_queue != child_task)
    440 	team->task_queue = child_task->next_queue;
    441       else
    442 	team->task_queue = NULL;
    443     }
    444   child_task->kind = GOMP_TASK_TIED;
    445   if (--team->task_queued_count == 0)
    446     gomp_team_barrier_clear_task_pending (&team->barrier);
    447   if ((gomp_team_barrier_cancelled (&team->barrier)
    448        || (taskgroup && taskgroup->cancelled))
    449       && !child_task->copy_ctors_done)
    450     return true;
    451   return false;
    452 }
    453 
    454 static void
    455 gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
    456 {
    457   struct gomp_task *parent = child_task->parent;
    458   size_t i;
    459 
    460   for (i = 0; i < child_task->depend_count; i++)
    461     if (!child_task->depend[i].redundant)
    462       {
    463 	if (child_task->depend[i].next)
    464 	  child_task->depend[i].next->prev = child_task->depend[i].prev;
    465 	if (child_task->depend[i].prev)
    466 	  child_task->depend[i].prev->next = child_task->depend[i].next;
    467 	else
    468 	  {
    469 	    hash_entry_type *slot
    470 	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
    471 				NO_INSERT);
    472 	    if (*slot != &child_task->depend[i])
    473 	      abort ();
    474 	    if (child_task->depend[i].next)
    475 	      *slot = child_task->depend[i].next;
    476 	    else
    477 	      htab_clear_slot (parent->depend_hash, slot);
    478 	  }
    479       }
    480 }
    481 
    482 static size_t
    483 gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
    484 				     struct gomp_team *team)
    485 {
    486   struct gomp_task *parent = child_task->parent;
    487   size_t i, count = child_task->dependers->n_elem, ret = 0;
    488   for (i = 0; i < count; i++)
    489     {
    490       struct gomp_task *task = child_task->dependers->elem[i];
    491       if (--task->num_dependees != 0)
    492 	continue;
    493 
    494       struct gomp_taskgroup *taskgroup = task->taskgroup;
    495       if (parent)
    496 	{
    497 	  if (parent->children)
    498 	    {
    499 	      /* If parent is in gomp_task_maybe_wait_for_dependencies
    500 		 and it doesn't need to wait for this task, put it after
    501 		 all ready to run tasks it needs to wait for.  */
    502 	      if (parent->taskwait && parent->taskwait->last_parent_depends_on
    503 		  && !task->parent_depends_on)
    504 		{
    505 		  struct gomp_task *last_parent_depends_on
    506 		    = parent->taskwait->last_parent_depends_on;
    507 		  task->next_child = last_parent_depends_on->next_child;
    508 		  task->prev_child = last_parent_depends_on;
    509 		}
    510 	      else
    511 		{
    512 		  task->next_child = parent->children;
    513 		  task->prev_child = parent->children->prev_child;
    514 		  parent->children = task;
    515 		}
    516 	      task->next_child->prev_child = task;
    517 	      task->prev_child->next_child = task;
    518 	    }
    519 	  else
    520 	    {
    521 	      task->next_child = task;
    522 	      task->prev_child = task;
    523 	      parent->children = task;
    524 	    }
    525 	  if (parent->taskwait)
    526 	    {
    527 	      if (parent->taskwait->in_taskwait)
    528 		{
    529 		  parent->taskwait->in_taskwait = false;
    530 		  gomp_sem_post (&parent->taskwait->taskwait_sem);
    531 		}
    532 	      else if (parent->taskwait->in_depend_wait)
    533 		{
    534 		  parent->taskwait->in_depend_wait = false;
    535 		  gomp_sem_post (&parent->taskwait->taskwait_sem);
    536 		}
    537 	      if (parent->taskwait->last_parent_depends_on == NULL
    538 		  && task->parent_depends_on)
    539 		parent->taskwait->last_parent_depends_on = task;
    540 	    }
    541 	}
    542       if (taskgroup)
    543 	{
    544 	  if (taskgroup->children)
    545 	    {
    546 	      task->next_taskgroup = taskgroup->children;
    547 	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
    548 	      task->next_taskgroup->prev_taskgroup = task;
    549 	      task->prev_taskgroup->next_taskgroup = task;
    550 	    }
    551 	  else
    552 	    {
    553 	      task->next_taskgroup = task;
    554 	      task->prev_taskgroup = task;
    555 	    }
    556 	  taskgroup->children = task;
    557 	  if (taskgroup->in_taskgroup_wait)
    558 	    {
    559 	      taskgroup->in_taskgroup_wait = false;
    560 	      gomp_sem_post (&taskgroup->taskgroup_sem);
    561 	    }
    562 	}
    563       if (team->task_queue)
    564 	{
    565 	  task->next_queue = team->task_queue;
    566 	  task->prev_queue = team->task_queue->prev_queue;
    567 	  task->next_queue->prev_queue = task;
    568 	  task->prev_queue->next_queue = task;
    569 	}
    570       else
    571 	{
    572 	  task->next_queue = task;
    573 	  task->prev_queue = task;
    574 	  team->task_queue = task;
    575 	}
    576       ++team->task_count;
    577       ++team->task_queued_count;
    578       ++ret;
    579     }
    580   free (child_task->dependers);
    581   child_task->dependers = NULL;
    582   if (ret > 1)
    583     gomp_team_barrier_set_task_pending (&team->barrier);
    584   return ret;
    585 }
    586 
    587 static inline size_t
    588 gomp_task_run_post_handle_depend (struct gomp_task *child_task,
    589 				  struct gomp_team *team)
    590 {
    591   if (child_task->depend_count == 0)
    592     return 0;
    593 
    594   /* If parent is gone already, the hash table is freed and nothing
    595      will use the hash table anymore, no need to remove anything from it.  */
    596   if (child_task->parent != NULL)
    597     gomp_task_run_post_handle_depend_hash (child_task);
    598 
    599   if (child_task->dependers == NULL)
    600     return 0;
    601 
    602   return gomp_task_run_post_handle_dependers (child_task, team);
    603 }
    604 
    605 static inline void
    606 gomp_task_run_post_remove_parent (struct gomp_task *child_task)
    607 {
    608   struct gomp_task *parent = child_task->parent;
    609   if (parent == NULL)
    610     return;
    611   if (__builtin_expect (child_task->parent_depends_on, 0)
    612       && --parent->taskwait->n_depend == 0
    613       && parent->taskwait->in_depend_wait)
    614     {
    615       parent->taskwait->in_depend_wait = false;
    616       gomp_sem_post (&parent->taskwait->taskwait_sem);
    617     }
    618   child_task->prev_child->next_child = child_task->next_child;
    619   child_task->next_child->prev_child = child_task->prev_child;
    620   if (parent->children != child_task)
    621     return;
    622   if (child_task->next_child != child_task)
    623     parent->children = child_task->next_child;
    624   else
    625     {
    626       /* We access task->children in GOMP_taskwait
    627 	 outside of the task lock mutex region, so
    628 	 need a release barrier here to ensure memory
    629 	 written by child_task->fn above is flushed
    630 	 before the NULL is written.  */
    631       __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
    632       if (parent->taskwait && parent->taskwait->in_taskwait)
    633 	{
    634 	  parent->taskwait->in_taskwait = false;
    635 	  gomp_sem_post (&parent->taskwait->taskwait_sem);
    636 	}
    637     }
    638 }
    639 
    640 static inline void
    641 gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
    642 {
    643   struct gomp_taskgroup *taskgroup = child_task->taskgroup;
    644   if (taskgroup == NULL)
    645     return;
    646   child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
    647   child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
    648   if (taskgroup->num_children > 1)
    649     --taskgroup->num_children;
    650   else
    651     {
    652       /* We access taskgroup->num_children in GOMP_taskgroup_end
    653 	 outside of the task lock mutex region, so
    654 	 need a release barrier here to ensure memory
    655 	 written by child_task->fn above is flushed
    656 	 before the NULL is written.  */
    657       __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
    658     }
    659   if (taskgroup->children != child_task)
    660     return;
    661   if (child_task->next_taskgroup != child_task)
    662     taskgroup->children = child_task->next_taskgroup;
    663   else
    664     {
    665       taskgroup->children = NULL;
    666       if (taskgroup->in_taskgroup_wait)
    667 	{
    668 	  taskgroup->in_taskgroup_wait = false;
    669 	  gomp_sem_post (&taskgroup->taskgroup_sem);
    670 	}
    671     }
    672 }
    673 
    674 void
    675 gomp_barrier_handle_tasks (gomp_barrier_state_t state)
    676 {
    677   struct gomp_thread *thr = gomp_thread ();
    678   struct gomp_team *team = thr->ts.team;
    679   struct gomp_task *task = thr->task;
    680   struct gomp_task *child_task = NULL;
    681   struct gomp_task *to_free = NULL;
    682   int do_wake = 0;
    683 
    684   gomp_mutex_lock (&team->task_lock);
    685   if (gomp_barrier_last_thread (state))
    686     {
    687       if (team->task_count == 0)
    688 	{
    689 	  gomp_team_barrier_done (&team->barrier, state);
    690 	  gomp_mutex_unlock (&team->task_lock);
    691 	  gomp_team_barrier_wake (&team->barrier, 0);
    692 	  return;
    693 	}
    694       gomp_team_barrier_set_waiting_for_tasks (&team->barrier);
    695     }
    696 
    697   while (1)
    698     {
    699       bool cancelled = false;
    700       if (team->task_queue != NULL)
    701 	{
    702 	  child_task = team->task_queue;
    703 	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
    704 					 child_task->taskgroup, team);
    705 	  if (__builtin_expect (cancelled, 0))
    706 	    {
    707 	      if (to_free)
    708 		{
    709 		  gomp_finish_task (to_free);
    710 		  free (to_free);
    711 		  to_free = NULL;
    712 		}
    713 	      goto finish_cancelled;
    714 	    }
    715 	  team->task_running_count++;
    716 	  child_task->in_tied_task = true;
    717 	}
    718       gomp_mutex_unlock (&team->task_lock);
    719       if (do_wake)
    720 	{
    721 	  gomp_team_barrier_wake (&team->barrier, do_wake);
    722 	  do_wake = 0;
    723 	}
    724       if (to_free)
    725 	{
    726 	  gomp_finish_task (to_free);
    727 	  free (to_free);
    728 	  to_free = NULL;
    729 	}
    730       if (child_task)
    731 	{
    732 	  thr->task = child_task;
    733 	  child_task->fn (child_task->fn_data);
    734 	  thr->task = task;
    735 	}
    736       else
    737 	return;
    738       gomp_mutex_lock (&team->task_lock);
    739       if (child_task)
    740 	{
    741 	 finish_cancelled:;
    742 	  size_t new_tasks
    743 	    = gomp_task_run_post_handle_depend (child_task, team);
    744 	  gomp_task_run_post_remove_parent (child_task);
    745 	  gomp_clear_parent (child_task->children);
    746 	  gomp_task_run_post_remove_taskgroup (child_task);
    747 	  to_free = child_task;
    748 	  child_task = NULL;
    749 	  if (!cancelled)
    750 	    team->task_running_count--;
    751 	  if (new_tasks > 1)
    752 	    {
    753 	      do_wake = team->nthreads - team->task_running_count;
    754 	      if (do_wake > new_tasks)
    755 		do_wake = new_tasks;
    756 	    }
    757 	  if (--team->task_count == 0
    758 	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
    759 	    {
    760 	      gomp_team_barrier_done (&team->barrier, state);
    761 	      gomp_mutex_unlock (&team->task_lock);
    762 	      gomp_team_barrier_wake (&team->barrier, 0);
    763 	      gomp_mutex_lock (&team->task_lock);
    764 	    }
    765 	}
    766     }
    767 }
    768 
    769 /* Called when encountering a taskwait directive.  */
    770 
    771 void
    772 GOMP_taskwait (void)
    773 {
    774   struct gomp_thread *thr = gomp_thread ();
    775   struct gomp_team *team = thr->ts.team;
    776   struct gomp_task *task = thr->task;
    777   struct gomp_task *child_task = NULL;
    778   struct gomp_task *to_free = NULL;
    779   struct gomp_taskwait taskwait;
    780   int do_wake = 0;
    781 
    782   /* The acquire barrier on load of task->children here synchronizes
    783      with the write of a NULL in gomp_task_run_post_remove_parent.  It is
    784      not necessary that we synchronize with other non-NULL writes at
    785      this point, but we must ensure that all writes to memory by a
    786      child thread task work function are seen before we exit from
    787      GOMP_taskwait.  */
    788   if (task == NULL
    789       || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL)
    790     return;
    791 
    792   memset (&taskwait, 0, sizeof (taskwait));
    793   gomp_mutex_lock (&team->task_lock);
    794   while (1)
    795     {
    796       bool cancelled = false;
    797       if (task->children == NULL)
    798 	{
    799 	  bool destroy_taskwait = task->taskwait != NULL;
    800 	  task->taskwait = NULL;
    801 	  gomp_mutex_unlock (&team->task_lock);
    802 	  if (to_free)
    803 	    {
    804 	      gomp_finish_task (to_free);
    805 	      free (to_free);
    806 	    }
    807 	  if (destroy_taskwait)
    808 	    gomp_sem_destroy (&taskwait.taskwait_sem);
    809 	  return;
    810 	}
    811       if (task->children->kind == GOMP_TASK_WAITING)
    812 	{
    813 	  child_task = task->children;
    814 	  cancelled
    815 	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
    816 				 team);
    817 	  if (__builtin_expect (cancelled, 0))
    818 	    {
    819 	      if (to_free)
    820 		{
    821 		  gomp_finish_task (to_free);
    822 		  free (to_free);
    823 		  to_free = NULL;
    824 		}
    825 	      goto finish_cancelled;
    826 	    }
    827 	}
    828       else
    829 	{
    830 	  /* All tasks we are waiting for are already running
    831 	     in other threads.  Wait for them.  */
    832 	  if (task->taskwait == NULL)
    833 	    {
    834 	      taskwait.in_depend_wait = false;
    835 	      gomp_sem_init (&taskwait.taskwait_sem, 0);
    836 	      task->taskwait = &taskwait;
    837 	    }
    838 	  taskwait.in_taskwait = true;
    839 	}
    840       gomp_mutex_unlock (&team->task_lock);
    841       if (do_wake)
    842 	{
    843 	  gomp_team_barrier_wake (&team->barrier, do_wake);
    844 	  do_wake = 0;
    845 	}
    846       if (to_free)
    847 	{
    848 	  gomp_finish_task (to_free);
    849 	  free (to_free);
    850 	  to_free = NULL;
    851 	}
    852       if (child_task)
    853 	{
    854 	  thr->task = child_task;
    855 	  child_task->fn (child_task->fn_data);
    856 	  thr->task = task;
    857 	}
    858       else
    859 	gomp_sem_wait (&taskwait.taskwait_sem);
    860       gomp_mutex_lock (&team->task_lock);
    861       if (child_task)
    862 	{
    863 	 finish_cancelled:;
    864 	  size_t new_tasks
    865 	    = gomp_task_run_post_handle_depend (child_task, team);
    866 	  child_task->prev_child->next_child = child_task->next_child;
    867 	  child_task->next_child->prev_child = child_task->prev_child;
    868 	  if (task->children == child_task)
    869 	    {
    870 	      if (child_task->next_child != child_task)
    871 		task->children = child_task->next_child;
    872 	      else
    873 		task->children = NULL;
    874 	    }
    875 	  gomp_clear_parent (child_task->children);
    876 	  gomp_task_run_post_remove_taskgroup (child_task);
    877 	  to_free = child_task;
    878 	  child_task = NULL;
    879 	  team->task_count--;
    880 	  if (new_tasks > 1)
    881 	    {
    882 	      do_wake = team->nthreads - team->task_running_count
    883 			- !task->in_tied_task;
    884 	      if (do_wake > new_tasks)
    885 		do_wake = new_tasks;
    886 	    }
    887 	}
    888     }
    889 }
    890 
    891 /* This is like GOMP_taskwait, but we only wait for tasks that the
    892    upcoming task depends on.  */
    893 
    894 static void
    895 gomp_task_maybe_wait_for_dependencies (void **depend)
    896 {
    897   struct gomp_thread *thr = gomp_thread ();
    898   struct gomp_task *task = thr->task;
    899   struct gomp_team *team = thr->ts.team;
    900   struct gomp_task_depend_entry elem, *ent = NULL;
    901   struct gomp_taskwait taskwait;
    902   struct gomp_task *last_parent_depends_on = NULL;
    903   size_t ndepend = (uintptr_t) depend[0];
    904   size_t nout = (uintptr_t) depend[1];
    905   size_t i;
    906   size_t num_awaited = 0;
    907   struct gomp_task *child_task = NULL;
    908   struct gomp_task *to_free = NULL;
    909   int do_wake = 0;
    910 
    911   gomp_mutex_lock (&team->task_lock);
    912   for (i = 0; i < ndepend; i++)
    913     {
    914       elem.addr = depend[i + 2];
    915       ent = htab_find (task->depend_hash, &elem);
    916       for (; ent; ent = ent->next)
    917 	if (i >= nout && ent->is_in)
    918 	  continue;
    919 	else
    920 	  {
    921 	    struct gomp_task *tsk = ent->task;
    922 	    if (!tsk->parent_depends_on)
    923 	      {
    924 		tsk->parent_depends_on = true;
    925 		++num_awaited;
    926 		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
    927 		  {
    928 		    /* If a task we need to wait for is not already
    929 		       running and is ready to be scheduled, move it
    930 		       to front, so that we run it as soon as possible.  */
    931 		    if (last_parent_depends_on)
    932 		      {
    933 			tsk->prev_child->next_child = tsk->next_child;
    934 			tsk->next_child->prev_child = tsk->prev_child;
    935 			tsk->prev_child = last_parent_depends_on;
    936 			tsk->next_child = last_parent_depends_on->next_child;
    937 			tsk->prev_child->next_child = tsk;
    938 			tsk->next_child->prev_child = tsk;
    939 		      }
    940 		    else if (tsk != task->children)
    941 		      {
    942 			tsk->prev_child->next_child = tsk->next_child;
    943 			tsk->next_child->prev_child = tsk->prev_child;
    944 			tsk->prev_child = task->children;
    945 			tsk->next_child = task->children->next_child;
    946 			task->children = tsk;
    947 			tsk->prev_child->next_child = tsk;
    948 			tsk->next_child->prev_child = tsk;
    949 		      }
    950 		    last_parent_depends_on = tsk;
    951 		  }
    952 	      }
    953 	  }
    954     }
    955   if (num_awaited == 0)
    956     {
    957       gomp_mutex_unlock (&team->task_lock);
    958       return;
    959     }
    960 
    961   memset (&taskwait, 0, sizeof (taskwait));
    962   taskwait.n_depend = num_awaited;
    963   taskwait.last_parent_depends_on = last_parent_depends_on;
    964   gomp_sem_init (&taskwait.taskwait_sem, 0);
    965   task->taskwait = &taskwait;
    966 
    967   while (1)
    968     {
    969       bool cancelled = false;
    970       if (taskwait.n_depend == 0)
    971 	{
    972 	  task->taskwait = NULL;
    973 	  gomp_mutex_unlock (&team->task_lock);
    974 	  if (to_free)
    975 	    {
    976 	      gomp_finish_task (to_free);
    977 	      free (to_free);
    978 	    }
    979 	  gomp_sem_destroy (&taskwait.taskwait_sem);
    980 	  return;
    981 	}
    982       if (task->children->kind == GOMP_TASK_WAITING)
    983 	{
    984 	  child_task = task->children;
    985 	  cancelled
    986 	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
    987 				 team);
    988 	  if (__builtin_expect (cancelled, 0))
    989 	    {
    990 	      if (to_free)
    991 		{
    992 		  gomp_finish_task (to_free);
    993 		  free (to_free);
    994 		  to_free = NULL;
    995 		}
    996 	      goto finish_cancelled;
    997 	    }
    998 	}
    999       else
   1000 	/* All tasks we are waiting for are already running
   1001 	   in other threads.  Wait for them.  */
   1002 	taskwait.in_depend_wait = true;
   1003       gomp_mutex_unlock (&team->task_lock);
   1004       if (do_wake)
   1005 	{
   1006 	  gomp_team_barrier_wake (&team->barrier, do_wake);
   1007 	  do_wake = 0;
   1008 	}
   1009       if (to_free)
   1010 	{
   1011 	  gomp_finish_task (to_free);
   1012 	  free (to_free);
   1013 	  to_free = NULL;
   1014 	}
   1015       if (child_task)
   1016 	{
   1017 	  thr->task = child_task;
   1018 	  child_task->fn (child_task->fn_data);
   1019 	  thr->task = task;
   1020 	}
   1021       else
   1022 	gomp_sem_wait (&taskwait.taskwait_sem);
   1023       gomp_mutex_lock (&team->task_lock);
   1024       if (child_task)
   1025 	{
   1026 	 finish_cancelled:;
   1027 	  size_t new_tasks
   1028 	    = gomp_task_run_post_handle_depend (child_task, team);
   1029 	  if (child_task->parent_depends_on)
   1030 	    --taskwait.n_depend;
   1031 	  child_task->prev_child->next_child = child_task->next_child;
   1032 	  child_task->next_child->prev_child = child_task->prev_child;
   1033 	  if (task->children == child_task)
   1034 	    {
   1035 	      if (child_task->next_child != child_task)
   1036 		task->children = child_task->next_child;
   1037 	      else
   1038 		task->children = NULL;
   1039 	    }
   1040 	  gomp_clear_parent (child_task->children);
   1041 	  gomp_task_run_post_remove_taskgroup (child_task);
   1042 	  to_free = child_task;
   1043 	  child_task = NULL;
   1044 	  team->task_count--;
   1045 	  if (new_tasks > 1)
   1046 	    {
   1047 	      do_wake = team->nthreads - team->task_running_count
   1048 			- !task->in_tied_task;
   1049 	      if (do_wake > new_tasks)
   1050 		do_wake = new_tasks;
   1051 	    }
   1052 	}
   1053     }
   1054 }
   1055 
   1056 /* Called when encountering a taskyield directive.  */
   1057 
   1058 void
   1059 GOMP_taskyield (void)
   1060 {
   1061   /* Nothing at the moment.  */
   1062 }
   1063 
   1064 void
   1065 GOMP_taskgroup_start (void)
   1066 {
   1067   struct gomp_thread *thr = gomp_thread ();
   1068   struct gomp_team *team = thr->ts.team;
   1069   struct gomp_task *task = thr->task;
   1070   struct gomp_taskgroup *taskgroup;
   1071 
   1072   /* If team is NULL, all tasks are executed as
   1073      GOMP_TASK_IFFALSE tasks and thus all children tasks of
   1074      taskgroup and their descendant tasks will be finished
   1075      by the time GOMP_taskgroup_end is called.  */
   1076   if (team == NULL)
   1077     return;
   1078   taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
   1079   taskgroup->prev = task->taskgroup;
   1080   taskgroup->children = NULL;
   1081   taskgroup->in_taskgroup_wait = false;
   1082   taskgroup->cancelled = false;
   1083   taskgroup->num_children = 0;
   1084   gomp_sem_init (&taskgroup->taskgroup_sem, 0);
   1085   task->taskgroup = taskgroup;
   1086 }
   1087 
   1088 void
   1089 GOMP_taskgroup_end (void)
   1090 {
   1091   struct gomp_thread *thr = gomp_thread ();
   1092   struct gomp_team *team = thr->ts.team;
   1093   struct gomp_task *task = thr->task;
   1094   struct gomp_taskgroup *taskgroup;
   1095   struct gomp_task *child_task = NULL;
   1096   struct gomp_task *to_free = NULL;
   1097   int do_wake = 0;
   1098 
   1099   if (team == NULL)
   1100     return;
   1101   taskgroup = task->taskgroup;
   1102 
   1103   /* The acquire barrier on load of taskgroup->num_children here
   1104      synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
   1105      It is not necessary that we synchronize with other non-0 writes at
   1106      this point, but we must ensure that all writes to memory by a
   1107      child thread task work function are seen before we exit from
   1108      GOMP_taskgroup_end.  */
   1109   if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
   1110     goto finish;
   1111 
   1112   gomp_mutex_lock (&team->task_lock);
   1113   while (1)
   1114     {
   1115       bool cancelled = false;
   1116       if (taskgroup->children == NULL)
   1117 	{
   1118 	  if (taskgroup->num_children)
   1119 	    {
   1120 	      if (task->children == NULL)
   1121 		goto do_wait;
   1122 	      child_task = task->children;
   1123             }
   1124           else
   1125 	    {
   1126 	      gomp_mutex_unlock (&team->task_lock);
   1127 	      if (to_free)
   1128 		{
   1129 		  gomp_finish_task (to_free);
   1130 		  free (to_free);
   1131 		}
   1132 	      goto finish;
   1133 	    }
   1134 	}
   1135       else
   1136 	child_task = taskgroup->children;
   1137       if (child_task->kind == GOMP_TASK_WAITING)
   1138 	{
   1139 	  cancelled
   1140 	    = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
   1141 				 team);
   1142 	  if (__builtin_expect (cancelled, 0))
   1143 	    {
   1144 	      if (to_free)
   1145 		{
   1146 		  gomp_finish_task (to_free);
   1147 		  free (to_free);
   1148 		  to_free = NULL;
   1149 		}
   1150 	      goto finish_cancelled;
   1151 	    }
   1152 	}
   1153       else
   1154 	{
   1155 	  child_task = NULL;
   1156 	 do_wait:
   1157 	  /* All tasks we are waiting for are already running
   1158 	     in other threads.  Wait for them.  */
   1159 	  taskgroup->in_taskgroup_wait = true;
   1160 	}
   1161       gomp_mutex_unlock (&team->task_lock);
   1162       if (do_wake)
   1163 	{
   1164 	  gomp_team_barrier_wake (&team->barrier, do_wake);
   1165 	  do_wake = 0;
   1166 	}
   1167       if (to_free)
   1168 	{
   1169 	  gomp_finish_task (to_free);
   1170 	  free (to_free);
   1171 	  to_free = NULL;
   1172 	}
   1173       if (child_task)
   1174 	{
   1175 	  thr->task = child_task;
   1176 	  child_task->fn (child_task->fn_data);
   1177 	  thr->task = task;
   1178 	}
   1179       else
   1180 	gomp_sem_wait (&taskgroup->taskgroup_sem);
   1181       gomp_mutex_lock (&team->task_lock);
   1182       if (child_task)
   1183 	{
   1184 	 finish_cancelled:;
   1185 	  size_t new_tasks
   1186 	    = gomp_task_run_post_handle_depend (child_task, team);
   1187 	  gomp_task_run_post_remove_parent (child_task);
   1188 	  gomp_clear_parent (child_task->children);
   1189 	  gomp_task_run_post_remove_taskgroup (child_task);
   1190 	  to_free = child_task;
   1191 	  child_task = NULL;
   1192 	  team->task_count--;
   1193 	  if (new_tasks > 1)
   1194 	    {
   1195 	      do_wake = team->nthreads - team->task_running_count
   1196 			- !task->in_tied_task;
   1197 	      if (do_wake > new_tasks)
   1198 		do_wake = new_tasks;
   1199 	    }
   1200 	}
   1201     }
   1202 
   1203  finish:
   1204   task->taskgroup = taskgroup->prev;
   1205   gomp_sem_destroy (&taskgroup->taskgroup_sem);
   1206   free (taskgroup);
   1207 }
   1208 
   1209 int
   1210 omp_in_final (void)
   1211 {
   1212   struct gomp_thread *thr = gomp_thread ();
   1213   return thr->task && thr->task->final_task;
   1214 }
   1215 
   1216 ialias (omp_in_final)
   1217