team.c revision 1.1.1.9 1 1.1.1.9 mrg /* Copyright (C) 2015-2024 Free Software Foundation, Inc.
2 1.1.1.2 mrg Contributed by Alexander Monakov <amonakov (at) ispras.ru>
3 1.1.1.2 mrg
4 1.1.1.2 mrg This file is part of the GNU Offloading and Multi Processing Library
5 1.1.1.2 mrg (libgomp).
6 1.1.1.2 mrg
7 1.1.1.2 mrg Libgomp is free software; you can redistribute it and/or modify it
8 1.1.1.2 mrg under the terms of the GNU General Public License as published by
9 1.1.1.2 mrg the Free Software Foundation; either version 3, or (at your option)
10 1.1.1.2 mrg any later version.
11 1.1.1.2 mrg
12 1.1.1.2 mrg Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 1.1.1.2 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 1.1.1.2 mrg FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 1.1.1.2 mrg more details.
16 1.1.1.2 mrg
17 1.1.1.2 mrg Under Section 7 of GPL version 3, you are granted additional
18 1.1.1.2 mrg permissions described in the GCC Runtime Library Exception, version
19 1.1.1.2 mrg 3.1, as published by the Free Software Foundation.
20 1.1.1.2 mrg
21 1.1.1.2 mrg You should have received a copy of the GNU General Public License and
22 1.1.1.2 mrg a copy of the GCC Runtime Library Exception along with this program;
23 1.1.1.2 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 1.1.1.2 mrg <http://www.gnu.org/licenses/>. */
25 1.1.1.2 mrg
26 1.1.1.7 mrg /* This file handles maintenance of threads on NVPTX. */
27 1.1.1.2 mrg
28 1.1.1.2 mrg #if defined __nvptx_softstack__ && defined __nvptx_unisimt__
29 1.1.1.2 mrg
30 1.1.1.2 mrg #include "libgomp.h"
31 1.1.1.2 mrg #include <stdlib.h>
32 1.1.1.2 mrg #include <string.h>
33 1.1.1.2 mrg
34 1.1.1.2 mrg struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
35 1.1.1.8 mrg int __gomp_team_num __attribute__((shared,nocommon));
36 1.1.1.2 mrg
37 1.1.1.2 mrg static void gomp_thread_start (struct gomp_thread_pool *);
38 1.1.1.9 mrg extern void build_indirect_map (void);
39 1.1.1.2 mrg
40 1.1.1.9 mrg /* There should be some .shared space reserved for us. There's no way to
41 1.1.1.9 mrg express this magic extern sizeless array in C so use asm. */
42 1.1.1.9 mrg asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
43 1.1.1.9 mrg
44 1.1.1.9 mrg /* Defined in basic-allocator.c via config/nvptx/allocator.c. */
45 1.1.1.9 mrg void __nvptx_lowlat_init (void *heap, size_t size);
46 1.1.1.2 mrg
47 1.1.1.2 mrg /* This externally visible function handles target region entry. It
48 1.1.1.2 mrg sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
49 1.1.1.2 mrg in the master thread or gomp_thread_start in other threads.
50 1.1.1.2 mrg
51 1.1.1.2 mrg The name of this function is part of the interface with the compiler: for
52 1.1.1.2 mrg each target region, GCC emits a PTX .kernel function that sets up soft-stack
53 1.1.1.2 mrg and uniform-simt state and calls this function, passing in FN the original
54 1.1.1.2 mrg function outlined for the target region. */
55 1.1.1.2 mrg
56 1.1.1.2 mrg void
57 1.1.1.2 mrg gomp_nvptx_main (void (*fn) (void *), void *fn_data)
58 1.1.1.2 mrg {
59 1.1.1.2 mrg int tid, ntids;
60 1.1.1.2 mrg asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
61 1.1.1.2 mrg asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));
62 1.1.1.9 mrg
63 1.1.1.2 mrg if (tid == 0)
64 1.1.1.2 mrg {
65 1.1.1.2 mrg gomp_global_icv.nthreads_var = ntids;
66 1.1.1.8 mrg gomp_global_icv.thread_limit_var = ntids;
67 1.1.1.2 mrg /* Starting additional threads is not supported. */
68 1.1.1.2 mrg gomp_global_icv.dyn_var = true;
69 1.1.1.2 mrg
70 1.1.1.8 mrg __gomp_team_num = 0;
71 1.1.1.2 mrg nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
72 1.1.1.2 mrg memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
73 1.1.1.2 mrg
74 1.1.1.9 mrg /* Initialize indirect function support. */
75 1.1.1.9 mrg unsigned int block_id;
76 1.1.1.9 mrg asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
77 1.1.1.9 mrg if (block_id == 0)
78 1.1.1.9 mrg build_indirect_map ();
79 1.1.1.9 mrg
80 1.1.1.9 mrg /* Find the low-latency heap details .... */
81 1.1.1.9 mrg uint32_t *shared_pool;
82 1.1.1.9 mrg uint32_t shared_pool_size = 0;
83 1.1.1.9 mrg asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
84 1.1.1.9 mrg #if __PTX_ISA_VERSION_MAJOR__ > 4 \
85 1.1.1.9 mrg || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
86 1.1.1.9 mrg asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
87 1.1.1.9 mrg : "=r"(shared_pool_size));
88 1.1.1.9 mrg #endif
89 1.1.1.9 mrg __nvptx_lowlat_init (shared_pool, shared_pool_size);
90 1.1.1.9 mrg
91 1.1.1.9 mrg /* Initialize the thread pool. */
92 1.1.1.2 mrg struct gomp_thread_pool *pool = alloca (sizeof (*pool));
93 1.1.1.2 mrg pool->threads = alloca (ntids * sizeof (*pool->threads));
94 1.1.1.2 mrg for (tid = 0; tid < ntids; tid++)
95 1.1.1.2 mrg pool->threads[tid] = nvptx_thrs + tid;
96 1.1.1.2 mrg pool->threads_size = ntids;
97 1.1.1.2 mrg pool->threads_used = ntids;
98 1.1.1.2 mrg pool->threads_busy = 1;
99 1.1.1.2 mrg pool->last_team = NULL;
100 1.1.1.2 mrg gomp_simple_barrier_init (&pool->threads_dock, ntids);
101 1.1.1.2 mrg
102 1.1.1.2 mrg nvptx_thrs[0].thread_pool = pool;
103 1.1.1.2 mrg asm ("bar.sync 0;");
104 1.1.1.2 mrg fn (fn_data);
105 1.1.1.2 mrg
106 1.1.1.2 mrg gomp_free_thread (nvptx_thrs);
107 1.1.1.2 mrg }
108 1.1.1.2 mrg else
109 1.1.1.2 mrg {
110 1.1.1.2 mrg asm ("bar.sync 0;");
111 1.1.1.2 mrg gomp_thread_start (nvptx_thrs[0].thread_pool);
112 1.1.1.2 mrg }
113 1.1.1.2 mrg }
114 1.1.1.2 mrg
115 1.1.1.2 mrg /* This function contains the idle loop in which a thread waits
116 1.1.1.2 mrg to be called up to become part of a team. */
117 1.1.1.2 mrg
118 1.1.1.2 mrg static void
119 1.1.1.2 mrg gomp_thread_start (struct gomp_thread_pool *pool)
120 1.1.1.2 mrg {
121 1.1.1.2 mrg struct gomp_thread *thr = gomp_thread ();
122 1.1.1.2 mrg
123 1.1.1.2 mrg gomp_sem_init (&thr->release, 0);
124 1.1.1.2 mrg thr->thread_pool = pool;
125 1.1.1.2 mrg
126 1.1.1.2 mrg do
127 1.1.1.2 mrg {
128 1.1.1.2 mrg gomp_simple_barrier_wait (&pool->threads_dock);
129 1.1.1.2 mrg if (!thr->fn)
130 1.1.1.2 mrg continue;
131 1.1.1.2 mrg thr->fn (thr->data);
132 1.1.1.2 mrg thr->fn = NULL;
133 1.1.1.2 mrg
134 1.1.1.2 mrg struct gomp_task *task = thr->task;
135 1.1.1.2 mrg gomp_team_barrier_wait_final (&thr->ts.team->barrier);
136 1.1.1.2 mrg gomp_finish_task (task);
137 1.1.1.2 mrg }
138 1.1.1.2 mrg /* Work around an NVIDIA driver bug: when generating sm_50 machine code,
139 1.1.1.2 mrg it can trash stack pointer R1 in loops lacking exit edges. Add a cheap
140 1.1.1.2 mrg artificial exit that the driver would not be able to optimize out. */
141 1.1.1.2 mrg while (nvptx_thrs);
142 1.1.1.2 mrg }
143 1.1.1.2 mrg
144 1.1.1.2 mrg /* Launch a team. */
145 1.1.1.2 mrg
146 1.1.1.2 mrg void
147 1.1.1.2 mrg gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
148 1.1.1.6 mrg unsigned flags, struct gomp_team *team,
149 1.1.1.6 mrg struct gomp_taskgroup *taskgroup)
150 1.1.1.2 mrg {
151 1.1.1.2 mrg struct gomp_thread *thr, *nthr;
152 1.1.1.2 mrg struct gomp_task *task;
153 1.1.1.2 mrg struct gomp_task_icv *icv;
154 1.1.1.2 mrg struct gomp_thread_pool *pool;
155 1.1.1.2 mrg unsigned long nthreads_var;
156 1.1.1.2 mrg
157 1.1.1.2 mrg thr = gomp_thread ();
158 1.1.1.2 mrg pool = thr->thread_pool;
159 1.1.1.2 mrg task = thr->task;
160 1.1.1.2 mrg icv = task ? &task->icv : &gomp_global_icv;
161 1.1.1.2 mrg
162 1.1.1.2 mrg /* Always save the previous state, even if this isn't a nested team.
163 1.1.1.2 mrg In particular, we should save any work share state from an outer
164 1.1.1.2 mrg orphaned work share construct. */
165 1.1.1.2 mrg team->prev_ts = thr->ts;
166 1.1.1.2 mrg
167 1.1.1.2 mrg thr->ts.team = team;
168 1.1.1.2 mrg thr->ts.team_id = 0;
169 1.1.1.2 mrg ++thr->ts.level;
170 1.1.1.2 mrg if (nthreads > 1)
171 1.1.1.2 mrg ++thr->ts.active_level;
172 1.1.1.2 mrg thr->ts.work_share = &team->work_shares[0];
173 1.1.1.2 mrg thr->ts.last_work_share = NULL;
174 1.1.1.2 mrg thr->ts.single_count = 0;
175 1.1.1.2 mrg thr->ts.static_trip = 0;
176 1.1.1.2 mrg thr->task = &team->implicit_task[0];
177 1.1.1.2 mrg nthreads_var = icv->nthreads_var;
178 1.1.1.2 mrg gomp_init_task (thr->task, task, icv);
179 1.1.1.2 mrg team->implicit_task[0].icv.nthreads_var = nthreads_var;
180 1.1.1.6 mrg team->implicit_task[0].taskgroup = taskgroup;
181 1.1.1.2 mrg
182 1.1.1.2 mrg if (nthreads == 1)
183 1.1.1.2 mrg return;
184 1.1.1.2 mrg
185 1.1.1.2 mrg /* Release existing idle threads. */
186 1.1.1.2 mrg for (unsigned i = 1; i < nthreads; ++i)
187 1.1.1.2 mrg {
188 1.1.1.2 mrg nthr = pool->threads[i];
189 1.1.1.2 mrg nthr->ts.team = team;
190 1.1.1.2 mrg nthr->ts.work_share = &team->work_shares[0];
191 1.1.1.2 mrg nthr->ts.last_work_share = NULL;
192 1.1.1.2 mrg nthr->ts.team_id = i;
193 1.1.1.2 mrg nthr->ts.level = team->prev_ts.level + 1;
194 1.1.1.2 mrg nthr->ts.active_level = thr->ts.active_level;
195 1.1.1.2 mrg nthr->ts.single_count = 0;
196 1.1.1.2 mrg nthr->ts.static_trip = 0;
197 1.1.1.2 mrg nthr->task = &team->implicit_task[i];
198 1.1.1.2 mrg gomp_init_task (nthr->task, task, icv);
199 1.1.1.2 mrg team->implicit_task[i].icv.nthreads_var = nthreads_var;
200 1.1.1.6 mrg team->implicit_task[i].taskgroup = taskgroup;
201 1.1.1.2 mrg nthr->fn = fn;
202 1.1.1.2 mrg nthr->data = data;
203 1.1.1.2 mrg team->ordered_release[i] = &nthr->release;
204 1.1.1.2 mrg }
205 1.1.1.2 mrg
206 1.1.1.2 mrg gomp_simple_barrier_wait (&pool->threads_dock);
207 1.1.1.2 mrg }
208 1.1.1.2 mrg
/* Always returns -1 and does nothing else.
   NOTE(review): presumably -1 tells the caller that pausing is not
   available on this target -- confirm against the generic callers.  */

int
gomp_pause_host (void)
{
  const int status = -1;

  return status;
}
214 1.1.1.6 mrg
215 1.1.1.2 mrg #include "../../team.c"
216 1.1.1.2 mrg #endif
217