Home | History | Annotate | Line # | Download | only in nvptx
team.c revision 1.1.1.9
      1  1.1.1.9  mrg /* Copyright (C) 2015-2024 Free Software Foundation, Inc.
      2  1.1.1.2  mrg    Contributed by Alexander Monakov <amonakov (at) ispras.ru>
      3  1.1.1.2  mrg 
      4  1.1.1.2  mrg    This file is part of the GNU Offloading and Multi Processing Library
      5  1.1.1.2  mrg    (libgomp).
      6  1.1.1.2  mrg 
      7  1.1.1.2  mrg    Libgomp is free software; you can redistribute it and/or modify it
      8  1.1.1.2  mrg    under the terms of the GNU General Public License as published by
      9  1.1.1.2  mrg    the Free Software Foundation; either version 3, or (at your option)
     10  1.1.1.2  mrg    any later version.
     11  1.1.1.2  mrg 
     12  1.1.1.2  mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
     13  1.1.1.2  mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
     14  1.1.1.2  mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
     15  1.1.1.2  mrg    more details.
     16  1.1.1.2  mrg 
     17  1.1.1.2  mrg    Under Section 7 of GPL version 3, you are granted additional
     18  1.1.1.2  mrg    permissions described in the GCC Runtime Library Exception, version
     19  1.1.1.2  mrg    3.1, as published by the Free Software Foundation.
     20  1.1.1.2  mrg 
     21  1.1.1.2  mrg    You should have received a copy of the GNU General Public License and
     22  1.1.1.2  mrg    a copy of the GCC Runtime Library Exception along with this program;
     23  1.1.1.2  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24  1.1.1.2  mrg    <http://www.gnu.org/licenses/>.  */
     25  1.1.1.2  mrg 
     26  1.1.1.7  mrg /* This file handles maintenance of threads on NVPTX.  */
     27  1.1.1.2  mrg 
     28  1.1.1.2  mrg #if defined __nvptx_softstack__ && defined __nvptx_unisimt__
     29  1.1.1.2  mrg 
     30  1.1.1.2  mrg #include "libgomp.h"
     31  1.1.1.2  mrg #include <stdlib.h>
     32  1.1.1.2  mrg #include <string.h>
     33  1.1.1.2  mrg 
                      /* Pointer to the array of thread descriptors used by this file.  Thread 0
                         of gomp_nvptx_main points it at an alloca'd, zero-filled array with one
                         entry per thread (%ntid.y entries).  The nvptx "shared" attribute asks
                         for placement in the .shared memory space.  */
     34  1.1.1.2  mrg struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
                      /* Reset to 0 by thread 0 on entry to gomp_nvptx_main.  Presumably the
                         current team number; its consumers are not shown here.  */
     35  1.1.1.8  mrg int __gomp_team_num __attribute__((shared,nocommon));
     36  1.1.1.2  mrg 
                      /* Defined further down; declared early because gomp_nvptx_main calls it.  */
     37  1.1.1.2  mrg static void gomp_thread_start (struct gomp_thread_pool *);
                      /* Defined outside this file; gomp_nvptx_main calls it from thread 0 of
                         the block whose %ctaid.x is 0.  */
     38  1.1.1.9  mrg extern void build_indirect_map (void);
     39  1.1.1.2  mrg 
     40  1.1.1.9  mrg /* There should be some .shared space reserved for us.  There's no way to
     41  1.1.1.9  mrg    express this magic extern sizeless array in C so use asm.  */
     42  1.1.1.9  mrg asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
     43  1.1.1.9  mrg 
     44  1.1.1.9  mrg /* Defined in basic-allocator.c via config/nvptx/allocator.c.
                         gomp_nvptx_main passes the generic address of __nvptx_lowlat_pool as
                         HEAP and the value read from %dynamic_smem_size (or 0) as SIZE.  */
     45  1.1.1.9  mrg void __nvptx_lowlat_init (void *heap, size_t size);
     46  1.1.1.2  mrg 
     47  1.1.1.2  mrg /* This externally visible function handles target region entry.  It
     48  1.1.1.2  mrg    sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
     49  1.1.1.2  mrg    in the master thread or gomp_thread_start in other threads.
     50  1.1.1.2  mrg 
     51  1.1.1.2  mrg    The name of this function is part of the interface with the compiler: for
     52  1.1.1.2  mrg    each target region, GCC emits a PTX .kernel function that sets up soft-stack
     53  1.1.1.2  mrg    and uniform-simt state and calls this function, passing in FN the original
     54  1.1.1.2  mrg    function outlined for the target region.

                         FN is the outlined target-region function and FN_DATA its sole
                         argument.  The thread whose %tid.y is 0 initializes the global ICVs,
                         the thread descriptor array, the low-latency heap and the thread
                         pool, runs FN (FN_DATA), and finally calls gomp_free_thread.  Every
                         other thread enters gomp_thread_start once it has passed
                         "bar.sync 0".  */
     55  1.1.1.2  mrg 
     56  1.1.1.2  mrg void
     57  1.1.1.2  mrg gomp_nvptx_main (void (*fn) (void *), void *fn_data)
     58  1.1.1.2  mrg {
     59  1.1.1.2  mrg   int tid, ntids;
                        /* Thread index and thread count along the y dimension of the block.  */
     60  1.1.1.2  mrg   asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
     61  1.1.1.2  mrg   asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));
     62  1.1.1.9  mrg 
     63  1.1.1.2  mrg   if (tid == 0)
     64  1.1.1.2  mrg     {
                            /* Both the default team size and the thread limit are the number
                               of threads this block was launched with.  */
     65  1.1.1.2  mrg       gomp_global_icv.nthreads_var = ntids;
     66  1.1.1.8  mrg       gomp_global_icv.thread_limit_var = ntids;
     67  1.1.1.2  mrg       /* Starting additional threads is not supported.  */
     68  1.1.1.2  mrg       gomp_global_icv.dyn_var = true;
     69  1.1.1.2  mrg 
     70  1.1.1.8  mrg       __gomp_team_num = 0;
                            /* The descriptor array is alloca'd, so it is released when this
                               function returns.  */
     71  1.1.1.2  mrg       nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
     72  1.1.1.2  mrg       memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
     73  1.1.1.2  mrg 
     74  1.1.1.9  mrg       /* Initialize indirect function support.  */
     75  1.1.1.9  mrg       unsigned int block_id;
     76  1.1.1.9  mrg       asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
                            /* Only the thread block with %ctaid.x == 0 builds the map.  */
     77  1.1.1.9  mrg       if (block_id == 0)
     78  1.1.1.9  mrg 	build_indirect_map ();
     79  1.1.1.9  mrg 
     80  1.1.1.9  mrg       /* Find the low-latency heap details ....  */
     81  1.1.1.9  mrg       uint32_t *shared_pool;
                            /* Stays 0 when the PTX ISA version is below 4.1, because
                               %dynamic_smem_size is only read for 4.1 and later.  */
     82  1.1.1.9  mrg       uint32_t shared_pool_size = 0;
                            /* Convert the .shared address of the pool to a generic address.  */
     83  1.1.1.9  mrg       asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
     84  1.1.1.9  mrg #if __PTX_ISA_VERSION_MAJOR__ > 4 \
     85  1.1.1.9  mrg     || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
     86  1.1.1.9  mrg       asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
     87  1.1.1.9  mrg 	   : "=r"(shared_pool_size));
     88  1.1.1.9  mrg #endif
     89  1.1.1.9  mrg       __nvptx_lowlat_init (shared_pool, shared_pool_size);
     90  1.1.1.9  mrg 
     91  1.1.1.9  mrg       /* Initialize the thread pool.  */
     92  1.1.1.2  mrg       struct gomp_thread_pool *pool = alloca (sizeof (*pool));
     93  1.1.1.2  mrg       pool->threads = alloca (ntids * sizeof (*pool->threads));
                            /* TID is reused as the loop counter; it is not read again
                               afterwards.  */
     94  1.1.1.2  mrg       for (tid = 0; tid < ntids; tid++)
     95  1.1.1.2  mrg 	pool->threads[tid] = nvptx_thrs + tid;
     96  1.1.1.2  mrg       pool->threads_size = ntids;
     97  1.1.1.2  mrg       pool->threads_used = ntids;
     98  1.1.1.2  mrg       pool->threads_busy = 1;
     99  1.1.1.2  mrg       pool->last_team = NULL;
                            /* The dock barrier is sized for all NTIDS threads, this one
                               included.  */
    100  1.1.1.2  mrg       gomp_simple_barrier_init (&pool->threads_dock, ntids);
    101  1.1.1.2  mrg 
                            /* Publish the pool through descriptor 0 before the barrier; the
                               other threads read nvptx_thrs[0].thread_pool right after their
                               own "bar.sync 0" in the else branch.  */
    102  1.1.1.2  mrg       nvptx_thrs[0].thread_pool = pool;
    103  1.1.1.2  mrg       asm ("bar.sync 0;");
    104  1.1.1.2  mrg       fn (fn_data);
    105  1.1.1.2  mrg 
                            /* NVPTX_THRS is the address of descriptor 0.  */
    106  1.1.1.2  mrg       gomp_free_thread (nvptx_thrs);
    107  1.1.1.2  mrg     }
    108  1.1.1.2  mrg   else
    109  1.1.1.2  mrg     {
                            /* Wait at the same barrier as thread 0 before reading the pool
                               pointer it stored.  */
    110  1.1.1.2  mrg       asm ("bar.sync 0;");
    111  1.1.1.2  mrg       gomp_thread_start (nvptx_thrs[0].thread_pool);
    112  1.1.1.2  mrg     }
    113  1.1.1.2  mrg }
    114  1.1.1.2  mrg 
    115  1.1.1.2  mrg /* This function contains the idle loop in which a thread waits
    116  1.1.1.2  mrg    to be called up to become part of a team.

                         POOL is the thread pool created by gomp_nvptx_main.  Each iteration
                         waits on POOL->threads_dock; if a function has been assigned to this
                         thread it is run, after which the thread waits at its team's final
                         barrier and finishes its task.  The loop only ends if nvptx_thrs
                         reads as null.  */
    117  1.1.1.2  mrg 
    118  1.1.1.2  mrg static void
    119  1.1.1.2  mrg gomp_thread_start (struct gomp_thread_pool *pool)
    120  1.1.1.2  mrg {
                        /* Descriptor of the calling thread (gomp_thread is defined
                           elsewhere).  */
    121  1.1.1.2  mrg   struct gomp_thread *thr = gomp_thread ();
    122  1.1.1.2  mrg 
    123  1.1.1.2  mrg   gomp_sem_init (&thr->release, 0);
    124  1.1.1.2  mrg   thr->thread_pool = pool;
    125  1.1.1.2  mrg 
    126  1.1.1.2  mrg   do
    127  1.1.1.2  mrg     {
                            /* Pairs with the dock barrier wait at the end of
                               gomp_team_start.  */
    128  1.1.1.2  mrg       gomp_simple_barrier_wait (&pool->threads_dock);
                            /* Nothing was assigned to this thread for this round: 'continue'
                               jumps to the loop condition and from there back to the dock.  */
    129  1.1.1.2  mrg       if (!thr->fn)
    130  1.1.1.2  mrg 	continue;
    131  1.1.1.2  mrg       thr->fn (thr->data);
                            /* Clear FN so the next round is skipped unless a new function is
                               assigned.  */
    132  1.1.1.2  mrg       thr->fn = NULL;
    133  1.1.1.2  mrg 
                            /* The task pointer is read before waiting at the final barrier.  */
    134  1.1.1.2  mrg       struct gomp_task *task = thr->task;
    135  1.1.1.2  mrg       gomp_team_barrier_wait_final (&thr->ts.team->barrier);
    136  1.1.1.2  mrg       gomp_finish_task (task);
    137  1.1.1.2  mrg     }
    138  1.1.1.2  mrg   /* Work around an NVIDIA driver bug: when generating sm_50 machine code,
    139  1.1.1.2  mrg      it can trash stack pointer R1 in loops lacking exit edges.  Add a cheap
    140  1.1.1.2  mrg      artificial exit that the driver would not be able to optimize out.  */
    141  1.1.1.2  mrg   while (nvptx_thrs);
    142  1.1.1.2  mrg }
    143  1.1.1.2  mrg 
    144  1.1.1.2  mrg /* Launch a team.

                         FN and DATA are stored into the descriptor of every team member
                         other than the caller; this function does not call FN itself.
                         NTHREADS is the team size.  FLAGS is not used by this
                         implementation.  TEAM is the team structure to populate, and
                         TASKGROUP is stored into every implicit task of the team.  The
                         calling thread becomes team member 0.  */
    145  1.1.1.2  mrg 
    146  1.1.1.2  mrg void
    147  1.1.1.2  mrg gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
    148  1.1.1.6  mrg 		 unsigned flags, struct gomp_team *team,
    149  1.1.1.6  mrg 		 struct gomp_taskgroup *taskgroup)
    150  1.1.1.2  mrg {
    151  1.1.1.2  mrg   struct gomp_thread *thr, *nthr;
    152  1.1.1.2  mrg   struct gomp_task *task;
    153  1.1.1.2  mrg   struct gomp_task_icv *icv;
    154  1.1.1.2  mrg   struct gomp_thread_pool *pool;
    155  1.1.1.2  mrg   unsigned long nthreads_var;
    156  1.1.1.2  mrg 
    157  1.1.1.2  mrg   thr = gomp_thread ();
    158  1.1.1.2  mrg   pool = thr->thread_pool;
    159  1.1.1.2  mrg   task = thr->task;
                        /* Take the ICVs from the current task if there is one, otherwise from
                           the global ICVs.  */
    160  1.1.1.2  mrg   icv = task ? &task->icv : &gomp_global_icv;
    161  1.1.1.2  mrg 
    162  1.1.1.2  mrg   /* Always save the previous state, even if this isn't a nested team.
    163  1.1.1.2  mrg      In particular, we should save any work share state from an outer
    164  1.1.1.2  mrg      orphaned work share construct.  */
    165  1.1.1.2  mrg   team->prev_ts = thr->ts;
    166  1.1.1.2  mrg 
    167  1.1.1.2  mrg   thr->ts.team = team;
    168  1.1.1.2  mrg   thr->ts.team_id = 0;
    169  1.1.1.2  mrg   ++thr->ts.level;
                        /* Only a team of more than one thread raises the active level.  */
    170  1.1.1.2  mrg   if (nthreads > 1)
    171  1.1.1.2  mrg     ++thr->ts.active_level;
    172  1.1.1.2  mrg   thr->ts.work_share = &team->work_shares[0];
    173  1.1.1.2  mrg   thr->ts.last_work_share = NULL;
    174  1.1.1.2  mrg   thr->ts.single_count = 0;
    175  1.1.1.2  mrg   thr->ts.static_trip = 0;
    176  1.1.1.2  mrg   thr->task = &team->implicit_task[0];
                        /* Read nthreads_var before gomp_init_task runs, then store that value
                           into the new implicit task's ICVs.  */
    177  1.1.1.2  mrg   nthreads_var = icv->nthreads_var;
    178  1.1.1.2  mrg   gomp_init_task (thr->task, task, icv);
    179  1.1.1.2  mrg   team->implicit_task[0].icv.nthreads_var = nthreads_var;
    180  1.1.1.6  mrg   team->implicit_task[0].taskgroup = taskgroup;
    181  1.1.1.2  mrg 
                        /* A one-thread team involves no other thread, so return without
                           touching the dock barrier.  */
    182  1.1.1.2  mrg   if (nthreads == 1)
    183  1.1.1.2  mrg     return;
    184  1.1.1.2  mrg 
    185  1.1.1.2  mrg   /* Release existing idle threads.  */
                        /* Start at 1: member 0 is the calling thread, set up above.  */
    186  1.1.1.2  mrg   for (unsigned i = 1; i < nthreads; ++i)
    187  1.1.1.2  mrg     {
    188  1.1.1.2  mrg       nthr = pool->threads[i];
    189  1.1.1.2  mrg       nthr->ts.team = team;
    190  1.1.1.2  mrg       nthr->ts.work_share = &team->work_shares[0];
    191  1.1.1.2  mrg       nthr->ts.last_work_share = NULL;
    192  1.1.1.2  mrg       nthr->ts.team_id = i;
                            /* Same value as the caller's level after its increment above.  */
    193  1.1.1.2  mrg       nthr->ts.level = team->prev_ts.level + 1;
    194  1.1.1.2  mrg       nthr->ts.active_level = thr->ts.active_level;
    195  1.1.1.2  mrg       nthr->ts.single_count = 0;
    196  1.1.1.2  mrg       nthr->ts.static_trip = 0;
    197  1.1.1.2  mrg       nthr->task = &team->implicit_task[i];
    198  1.1.1.2  mrg       gomp_init_task (nthr->task, task, icv);
    199  1.1.1.2  mrg       team->implicit_task[i].icv.nthreads_var = nthreads_var;
    200  1.1.1.6  mrg       team->implicit_task[i].taskgroup = taskgroup;
                            /* A non-null FN is what gomp_thread_start checks after the dock
                               barrier to decide whether this thread has work.  */
    201  1.1.1.2  mrg       nthr->fn = fn;
    202  1.1.1.2  mrg       nthr->data = data;
    203  1.1.1.2  mrg       team->ordered_release[i] = &nthr->release;
    204  1.1.1.2  mrg     }
    205  1.1.1.2  mrg 
                        /* Meet the idle threads at the dock barrier they wait on in
                           gomp_thread_start; those whose FN was set above go on to run it.  */
    206  1.1.1.2  mrg   gomp_simple_barrier_wait (&pool->threads_dock);
    207  1.1.1.2  mrg }
    208  1.1.1.2  mrg 
                      /* Takes no arguments and always returns -1.
                         NOTE(review): presumably -1 tells the caller that pausing is not
                         supported on this target; confirm against the callers, which are
                         not shown here.  */
    209  1.1.1.6  mrg int
    210  1.1.1.6  mrg gomp_pause_host (void)
    211  1.1.1.6  mrg {
    212  1.1.1.6  mrg   return -1;
    213  1.1.1.6  mrg }
    214  1.1.1.6  mrg 
                      /* Textually include ../../team.c after the definitions above.
                         NOTE(review): presumably this is the target-independent libgomp team
                         code; confirm which of its definitions the functions above stand in
                         for.  */
    215  1.1.1.2  mrg #include "../../team.c"
    216  1.1.1.2  mrg #endif /* __nvptx_softstack__ && __nvptx_unisimt__ */
    217