1 /* Copyright (C) 2015-2024 Free Software Foundation, Inc.
2 Contributed by Alexander Monakov <amonakov (at) ispras.ru>
3
4 This file is part of the GNU Offloading and Multi Processing Library
5 (libgomp).
6
7 Libgomp is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 /* This file handles maintenance of threads on NVPTX. */
27
28 #if defined __nvptx_softstack__ && defined __nvptx_unisimt__
29
30 #include "libgomp.h"
31 #include <stdlib.h>
32 #include <string.h>
33
/* Pointer to the per-team array of thread descriptors.  Placed in .shared
   memory (one copy per CTA) so every thread of the team can reach it; the
   array itself is carved out with alloca in gomp_nvptx_main.  */
struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));

/* Team number of this CTA, in .shared memory; reset to 0 on target region
   entry in gomp_nvptx_main.  */
int __gomp_team_num __attribute__((shared,nocommon));

static void gomp_thread_start (struct gomp_thread_pool *);

/* Builds the lookup table used for indirect calls to target-region
   functions; defined elsewhere in libgomp and invoked once (by the first
   CTA only) from gomp_nvptx_main.  */
extern void build_indirect_map (void);

/* There should be some .shared space reserved for us.  There's no way to
   express this magic extern sizeless array in C so use asm.  */
asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");

/* Defined in basic-allocator.c via config/nvptx/allocator.c.  */
void __nvptx_lowlat_init (void *heap, size_t size);
46
47 /* This externally visible function handles target region entry. It
48 sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
49 in the master thread or gomp_thread_start in other threads.
50
51 The name of this function is part of the interface with the compiler: for
52 each target region, GCC emits a PTX .kernel function that sets up soft-stack
53 and uniform-simt state and calls this function, passing in FN the original
54 function outlined for the target region. */
55
void
gomp_nvptx_main (void (*fn) (void *), void *fn_data)
{
  int tid, ntids;
  /* %tid.y is this thread's index along the CTA's y dimension; %ntid.y is
     the y extent of the CTA, i.e. the number of threads in the team.  */
  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
  asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));

  if (tid == 0)
    {
      /* Master thread: publish ICVs and build all shared state before the
	 bar.sync below releases the sibling threads.  */
      gomp_global_icv.nthreads_var = ntids;
      gomp_global_icv.thread_limit_var = ntids;
      /* Starting additional threads is not supported.  */
      gomp_global_icv.dyn_var = true;

      __gomp_team_num = 0;
      /* The descriptor array lives on the master's soft stack for the
	 duration of this function; siblings reach it via the .shared
	 nvptx_thrs pointer.  */
      nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
      memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));

      /* Initialize indirect function support; only the first CTA on the
	 device needs to build the map.  */
      unsigned int block_id;
      asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
      if (block_id == 0)
	build_indirect_map ();

      /* Find the low-latency heap details: convert the .shared address of
	 the reserved pool to a generic address, and query the dynamic shared
	 memory size where the PTX ISA is recent enough to expose
	 %dynamic_smem_size (otherwise the pool size stays 0).  */
      uint32_t *shared_pool;
      uint32_t shared_pool_size = 0;
      asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
#if __PTX_ISA_VERSION_MAJOR__ > 4 \
  || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
      asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
	   : "=r"(shared_pool_size));
#endif
      __nvptx_lowlat_init (shared_pool, shared_pool_size);

      /* Initialize the thread pool.  */
      struct gomp_thread_pool *pool = alloca (sizeof (*pool));
      pool->threads = alloca (ntids * sizeof (*pool->threads));
      for (tid = 0; tid < ntids; tid++)
	pool->threads[tid] = nvptx_thrs + tid;
      pool->threads_size = ntids;
      pool->threads_used = ntids;
      pool->threads_busy = 1;
      pool->last_team = NULL;
      gomp_simple_barrier_init (&pool->threads_dock, ntids);

      nvptx_thrs[0].thread_pool = pool;
      /* Pool setup is complete: release the siblings blocked at the
	 matching bar.sync in the else branch, then run the region.  */
      asm ("bar.sync 0;");
      fn (fn_data);

      gomp_free_thread (nvptx_thrs);
    }
  else
    {
      /* Wait until the master has finished setting up nvptx_thrs and the
	 pool before entering the idle loop.  */
      asm ("bar.sync 0;");
      gomp_thread_start (nvptx_thrs[0].thread_pool);
    }
}
114
115 /* This function contains the idle loop in which a thread waits
116 to be called up to become part of a team. */
117
static void
gomp_thread_start (struct gomp_thread_pool *pool)
{
  struct gomp_thread *thr = gomp_thread ();

  gomp_sem_init (&thr->release, 0);
  thr->thread_pool = pool;

  do
    {
      /* Block at the pool dock until gomp_team_start releases us.  A
	 wake-up that assigned us no work (thr->fn still NULL) just loops
	 back to wait again.  */
      gomp_simple_barrier_wait (&pool->threads_dock);
      if (!thr->fn)
	continue;
      thr->fn (thr->data);
      thr->fn = NULL;

      /* Read the task pointer before the team barrier, then finish the
	 task with that cached value.  */
      struct gomp_task *task = thr->task;
      gomp_team_barrier_wait_final (&thr->ts.team->barrier);
      gomp_finish_task (task);
    }
  /* Work around an NVIDIA driver bug: when generating sm_50 machine code,
     it can trash stack pointer R1 in loops lacking exit edges.  Add a cheap
     artificial exit that the driver would not be able to optimize out.  */
  while (nvptx_thrs);
}
143
144 /* Launch a team. */
145
/* FLAGS is accepted for interface compatibility and is not read in this
   body.  */

void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
		 unsigned flags, struct gomp_team *team,
		 struct gomp_taskgroup *taskgroup)
{
  struct gomp_thread *thr, *nthr;
  struct gomp_task *task;
  struct gomp_task_icv *icv;
  struct gomp_thread_pool *pool;
  unsigned long nthreads_var;

  thr = gomp_thread ();
  pool = thr->thread_pool;
  task = thr->task;
  /* Inherit ICVs from the current task if there is one, otherwise fall
     back to the global ICVs.  */
  icv = task ? &task->icv : &gomp_global_icv;

  /* Always save the previous state, even if this isn't a nested team.
     In particular, we should save any work share state from an outer
     orphaned work share construct.  */
  team->prev_ts = thr->ts;

  /* Set up the calling thread as team master (team_id 0).  */
  thr->ts.team = team;
  thr->ts.team_id = 0;
  ++thr->ts.level;
  if (nthreads > 1)
    ++thr->ts.active_level;
  thr->ts.work_share = &team->work_shares[0];
  thr->ts.last_work_share = NULL;
  thr->ts.single_count = 0;
  thr->ts.static_trip = 0;
  thr->task = &team->implicit_task[0];
  /* Snapshot nthreads_var before gomp_init_task copies ICVs into the
     implicit task, then restore it below.  */
  nthreads_var = icv->nthreads_var;
  gomp_init_task (thr->task, task, icv);
  team->implicit_task[0].icv.nthreads_var = nthreads_var;
  team->implicit_task[0].taskgroup = taskgroup;

  if (nthreads == 1)
    return;

  /* Release existing idle threads.  */
  for (unsigned i = 1; i < nthreads; ++i)
    {
      nthr = pool->threads[i];
      nthr->ts.team = team;
      nthr->ts.work_share = &team->work_shares[0];
      nthr->ts.last_work_share = NULL;
      nthr->ts.team_id = i;
      nthr->ts.level = team->prev_ts.level + 1;
      nthr->ts.active_level = thr->ts.active_level;
      nthr->ts.single_count = 0;
      nthr->ts.static_trip = 0;
      nthr->task = &team->implicit_task[i];
      gomp_init_task (nthr->task, task, icv);
      team->implicit_task[i].icv.nthreads_var = nthreads_var;
      team->implicit_task[i].taskgroup = taskgroup;
      /* A non-NULL fn marks the thread as having work; gomp_thread_start
	 skips wake-ups where fn is NULL.  */
      nthr->fn = fn;
      nthr->data = data;
      team->ordered_release[i] = &nthr->release;
    }

  /* Wake the pool threads waiting at the dock in gomp_thread_start.  */
  gomp_simple_barrier_wait (&pool->threads_dock);
}
208
/* omp_pause_resource support: pausing host resources is not implemented
   on this target, so always report failure.  */

int
gomp_pause_host (void)
{
  const int not_supported = -1;
  return not_supported;
}
214
215 #include "../../team.c"
216 #endif
217