si_compute.c revision 848b8605
/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "util/u_memory.h"

#include "../radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "radeon_llvm_util.h"

#define MAX_GLOBAL_BUFFERS 20
#if HAVE_LLVM < 0x0305
#define NUM_USER_SGPRS 2
#else
#define NUM_USER_SGPRS 4
#endif

struct si_pipe_compute {
	struct si_context *ctx;

	unsigned local_size;
	unsigned private_size;
	unsigned input_size;
	unsigned num_kernels;
	struct si_pipe_shader *kernels;
	unsigned num_user_sgprs;

	struct r600_resource *input_buffer;
	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];

	LLVMContextRef llvm_ctx;
};

static void *si_create_compute_state(
	struct pipe_context *ctx,
	const struct pipe_compute_state *cso)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_pipe_compute *program =
					CALLOC_STRUCT(si_pipe_compute);
	const struct pipe_llvm_program_header *header;
	const unsigned char *code;
	unsigned i;

	program->llvm_ctx = LLVMContextCreate();

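	/* cso->prog starts with a pipe_llvm_program_header, followed by
	 * header->num_bytes of LLVM bitcode holding all of the kernels. */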
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);

	program->ctx = sctx;
	program->local_size = cso->req_local_mem;
	program->private_size = cso->req_private_mem;
	program->input_size = cso->req_input_mem;

	program->num_kernels = radeon_llvm_get_num_kernels(program->llvm_ctx, code,
							header->num_bytes);
	program->kernels = CALLOC(program->num_kernels,
							sizeof(struct si_pipe_shader));
	for (i = 0; i < program->num_kernels; i++) {
		LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
							code, header->num_bytes);
		si_compile_llvm(sctx, &program->kernels[i], mod);
		LLVMDisposeModule(mod);
	}

	program->input_buffer = si_resource_create_custom(sctx->b.b.screen,
		PIPE_USAGE_IMMUTABLE, program->input_size);

	return program;
}

static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct si_context *sctx = (struct si_context*)ctx;
	sctx->cs_shader_state.program = (struct si_pipe_compute*)state;
}

static void si_set_global_binding(
	struct pipe_context *ctx, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	unsigned i;
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_pipe_compute *program = sctx->cs_shader_state.program;

	if (!resources) {
		for (i = first; i < first + n; i++) {
			pipe_resource_reference(&program->global_buffers[i], NULL);
		}
		return;
	}

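	/* Each handle holds a little-endian 32-bit offset into its buffer;
	 * replace it in place with the absolute 64-bit GPU virtual address
	 * of that location, also stored little-endian. */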
	for (i = first; i < first + n; i++) {
		uint64_t va;
		uint32_t offset;
		pipe_resource_reference(&program->global_buffers[i], resources[i]);
		va = r600_resource(resources[i])->gpu_address;
		offset = util_le32_to_cpu(*handles[i]);
		va += offset;
		va = util_cpu_to_le64(va);
		memcpy(handles[i], &va, sizeof(va));
	}
}

/**
 * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES.
 * \p block_layout is the number of threads in each work group.
 * \p grid_layout is the number of work groups.
 */
static unsigned compute_num_waves_for_scratch(
		const struct radeon_info *info,
		const uint *block_layout,
		const uint *grid_layout)
{
	unsigned num_sh = MAX2(info->max_sh_per_se, 1);
	unsigned num_se = MAX2(info->max_se, 1);
	unsigned num_blocks = 1;
	unsigned threads_per_block = 1;
	unsigned waves_per_block;
	unsigned waves_per_sh;
	unsigned waves;
	unsigned scratch_waves;
	unsigned i;

	for (i = 0; i < 3; i++) {
		threads_per_block *= block_layout[i];
		num_blocks *= grid_layout[i];
	}

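	/* A wavefront executes 64 threads on SI, so round the block size up
	 * to whole waves, then round the total wave count up so it divides
	 * evenly among all shader engines (SE) and shader arrays (SH). */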
	waves_per_block = align(threads_per_block, 64) / 64;
	waves = waves_per_block * num_blocks;
	waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
	scratch_waves = waves_per_sh * num_sh * num_se;

	if (waves_per_block > waves_per_sh) {
		scratch_waves = waves_per_block * num_sh * num_se;
	}

	return scratch_waves;
}

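/*
 * Build and emit a PM4 command stream that uploads the kernel arguments,
 * programs the compute registers and dispatches the grid, with cache
 * invalidations before the dispatch and a CS_PARTIAL_FLUSH after it.
 */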
static void si_launch_grid(
		struct pipe_context *ctx,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct si_pipe_compute *program = sctx->cs_shader_state.program;
	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
	struct r600_resource *input_buffer = program->input_buffer;
	unsigned kernel_args_size;
	unsigned num_work_size_bytes = 36;
	uint32_t kernel_args_offset = 0;
	uint32_t *kernel_args;
	uint64_t kernel_args_va;
	uint64_t scratch_buffer_va = 0;
	uint64_t shader_va;
	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
	unsigned i;
	struct si_pipe_shader *shader = &program->kernels[pc];
	unsigned lds_blocks;
	unsigned num_waves_for_scratch;

	pm4->compute_pkt = true;
	si_cmd_context_control(pm4);

	si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE);
	si_pm4_cmd_add(pm4, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) |
			    EVENT_INDEX(0x7) |
			    EVENT_WRITE_INV_L2);
	si_pm4_cmd_end(pm4, false);

	si_pm4_inval_texture_cache(pm4);
	si_pm4_inval_shader_cache(pm4);
	si_cmd_surface_sync(pm4, pm4->cp_coher_cntl);

	/* Upload the kernel arguments */

	/* The extra num_work_size_bytes are for work group / work item size information */
	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;

	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
			sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
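	/* The 36-byte work size header is 9 dwords: the grid size in work
	 * groups, the global size in work items (grid * block) and the
	 * block size, 3 dwords each.  The user's arguments follow it. */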
	for (i = 0; i < 3; i++) {
		kernel_args[i] = grid_layout[i];
		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
		kernel_args[i + 6] = block_layout[i];
	}

	num_waves_for_scratch = compute_num_waves_for_scratch(
		&sctx->screen->b.info, block_layout, grid_layout);

	memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size);

	if (shader->scratch_bytes_per_wave > 0) {
		unsigned scratch_bytes = shader->scratch_bytes_per_wave *
						num_waves_for_scratch;

		COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
		            "Total Scratch: %u bytes\n", num_waves_for_scratch,
		            shader->scratch_bytes_per_wave, scratch_bytes);
		if (!shader->scratch_bo) {
			shader->scratch_bo = (struct r600_resource*)
				si_resource_create_custom(sctx->b.b.screen,
				PIPE_USAGE_DEFAULT, scratch_bytes);
		}
		scratch_buffer_va = shader->scratch_bo->gpu_address;
		si_pm4_add_bo(pm4, shader->scratch_bo,
				RADEON_USAGE_READWRITE,
				RADEON_PRIO_SHADER_RESOURCE_RW);
	}

	for (i = 0; i < (kernel_args_size / 4); i++) {
		COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
			kernel_args[i]);
	}

	sctx->b.ws->buffer_unmap(input_buffer->cs_buf);

	kernel_args_va = input_buffer->gpu_address;
	kernel_args_va += kernel_args_offset;

	si_pm4_add_bo(pm4, input_buffer, RADEON_USAGE_READ,
		RADEON_PRIO_SHADER_DATA);

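	/* USER_DATA_0/1 receive the kernel argument address and USER_DATA_2/3
	 * the scratch buffer address and per-wave stride, each pair laid out
	 * like the first two dwords of a buffer resource descriptor. */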
	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 4,
		S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va);
	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12,
		S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32)
		| S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64));

	si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0);
	si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0);
	si_pm4_set_reg(pm4, R_00B818_COMPUTE_START_Z, 0);

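	/* Only NUM_THREAD_FULL is programmed, so every thread group in the
	 * grid is dispatched with the full block size; the PARTIAL fields
	 * are left at zero. */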
	si_pm4_set_reg(pm4, R_00B81C_COMPUTE_NUM_THREAD_X,
				S_00B81C_NUM_THREAD_FULL(block_layout[0]));
	si_pm4_set_reg(pm4, R_00B820_COMPUTE_NUM_THREAD_Y,
				S_00B820_NUM_THREAD_FULL(block_layout[1]));
	si_pm4_set_reg(pm4, R_00B824_COMPUTE_NUM_THREAD_Z,
				S_00B824_NUM_THREAD_FULL(block_layout[2]));

	/* Global buffers */
	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
		struct r600_resource *buffer =
				(struct r600_resource*)program->global_buffers[i];
		if (!buffer) {
			continue;
		}
		si_pm4_add_bo(pm4, buffer, RADEON_USAGE_READWRITE,
			RADEON_PRIO_SHADER_RESOURCE_RW);
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (sctx->b.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */
		si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID,
						0x190 /* Default value */);
	}

	shader_va = shader->bo->gpu_address;
	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
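	/* COMPUTE_PGM_LO/HI take the shader address in 256-byte units, hence
	 * the shift by 8 before splitting it into low and high halves. */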
	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);

	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
		/* We always use at least 3 VGPRS, these come from
		 * TIDIG_COMP_CNT.
		 * XXX: The compiler should account for this.
		 */
		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN.
		 * XXX: The compiler should account for this.
		 */
		| S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
					shader->num_sgprs)) - 1) / 8));

	lds_blocks = shader->lds_size;
	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
	 * blocks of 256 bytes, so if there are 4 bytes of LDS allocated in
	 * the shader and 4 bytes allocated by the state tracker, then
	 * we will set LDS_SIZE to 512 bytes rather than 256.
	 */
	if (sctx->b.chip_class <= SI) {
		lds_blocks += align(program->local_size, 256) >> 8;
	} else {
		lds_blocks += align(program->local_size, 512) >> 9;
	}

	assert(lds_blocks <= 0xFF);

	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
		| S_00B84C_TGID_X_EN(1)
		| S_00B84C_TGID_Y_EN(1)
		| S_00B84C_TGID_Z_EN(1)
		| S_00B84C_TG_SIZE_EN(1)
		| S_00B84C_TIDIG_COMP_CNT(2)
		| S_00B84C_LDS_SIZE(lds_blocks)
		| S_00B84C_EXCP_EN(0));

	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);

	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
		S_00B858_SH0_CU_EN(0xffff /* Default value */)
		| S_00B858_SH1_CU_EN(0xffff /* Default value */));

	si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1,
		S_00B85C_SH0_CU_EN(0xffff /* Default value */)
		| S_00B85C_SH1_CU_EN(0xffff /* Default value */));

	si_pm4_set_reg(pm4, R_00B860_COMPUTE_TMPRING_SIZE,
		/* The maximum value for WAVES is 32 * num CU.
		 * If you program this value incorrectly, the GPU will hang if
		 * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled.
		 */
		S_00B860_WAVES(num_waves_for_scratch)
		| S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10));

	si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT);
	si_pm4_cmd_add(pm4, grid_layout[0]); /* Thread groups DIM_X */
	si_pm4_cmd_add(pm4, grid_layout[1]); /* Thread groups DIM_Y */
	si_pm4_cmd_add(pm4, grid_layout[2]); /* Thread groups DIM_Z */
	si_pm4_cmd_add(pm4, 1); /* DISPATCH_INITIATOR */
	si_pm4_cmd_end(pm4, false);

	si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE);
	si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(0x4));
	si_pm4_cmd_end(pm4, false);

	si_pm4_inval_texture_cache(pm4);
	si_pm4_inval_shader_cache(pm4);
	si_cmd_surface_sync(pm4, pm4->cp_coher_cntl);

	si_pm4_emit(sctx, pm4);

#if 0
	fprintf(stderr, "cdw: %i\n", sctx->cs->cdw);
	for (i = 0; i < sctx->cs->cdw; i++) {
		fprintf(stderr, "%4i : 0x%08X\n", i, sctx->cs->buf[i]);
	}
#endif

	si_pm4_free_state(sctx, pm4, ~0);
}

static void si_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct si_pipe_compute *program = (struct si_pipe_compute *)state;

	if (!state) {
		return;
	}

	if (program->kernels) {
		for (int i = 0; i < program->num_kernels; i++) {
			if (program->kernels[i].bo) {
				si_pipe_shader_destroy(ctx, &program->kernels[i]);
			}
		}
		FREE(program->kernels);
	}

	if (program->llvm_ctx) {
		LLVMContextDispose(program->llvm_ctx);
	}
	pipe_resource_reference(
		(struct pipe_resource **)&program->input_buffer, NULL);

	/* And then free the program itself. */
	FREE(program);
}

static void si_set_compute_resources(struct pipe_context *ctx_,
		unsigned start, unsigned count,
		struct pipe_surface **surfaces) { }

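/*
 * For reference, a state tracker drives the hooks installed below roughly
 * as in this minimal sketch (the cso contents and the block/grid/input
 * values are illustrative placeholders, not taken from real clover code):
 *
 *	void *cs = pipe->create_compute_state(pipe, &cso);
 *	pipe->bind_compute_state(pipe, cs);
 *	pipe->set_global_binding(pipe, 0, 1, &resource, &handle);
 *	pipe->launch_grid(pipe, block_layout, grid_layout, 0, input);
 *	pipe->bind_compute_state(pipe, NULL);
 *	pipe->delete_compute_state(pipe, cs);
 */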
void si_init_compute_functions(struct si_context *sctx)
{
	sctx->b.b.create_compute_state = si_create_compute_state;
	sctx->b.b.delete_compute_state = si_delete_compute_state;
	sctx->b.b.bind_compute_state = si_bind_compute_state;
/*	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */
	sctx->b.b.set_compute_resources = si_set_compute_resources;
	sctx->b.b.set_global_binding = si_set_global_binding;
	sctx->b.b.launch_grid = si_launch_grid;
}