evergreen_compute.c revision af69d88d
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
#include "radeon_llvm_util.h"
#endif
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images: RAT1...
for reading images: TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => at most 10 image bindings for writing.

From NVIDIA OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs.

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images also consume TEX slots, and VTX slots too because of linear indexing

*/
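/* Illustrative sketch only (not driver code): under the convention above, a
 * compute resource index i maps to fixed hardware slots.  This hypothetical
 * helper mirrors what evergreen_set_rat() and evergreen_cs_set_vertex_buffer()
 * are given later in this file: RAT0/VTX1 are reserved for the global pool,
 * CONST0/VTX0 for the parameters, so writable surface i lands in RAT(i+1)
 * and every bound surface i is fetched through VTX(i+2).
 */
#if 0
static void example_slots_for_resource(unsigned i, int writable,
				       unsigned *rat_id, unsigned *vtx_id)
{
	*vtx_id = 2 + i;			/* VTX0 = params, VTX1 = global pool */
	*rat_id = writable ? i + 1 : ~0u;	/* RAT0 = global pool */
}
#endif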

struct r600_resource* r600_compute_buffer_alloc_vram(
       struct r600_screen *screen,
       unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create(
		(struct pipe_screen*) screen,
		PIPE_BIND_CUSTOM,
		PIPE_USAGE_IMMUTABLE,
		size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(
	struct r600_pipe_compute *pipe,
	unsigned id,
	struct r600_resource* bo,
	int start,
	int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
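/* Worked example of the cb_target_mask update above (illustrative only, not
 * driver code): CB_TARGET_MASK holds one 4-bit channel write mask per color
 * buffer, so enabling all four channels of RAT "id" means setting nibble "id".
 */
#if 0
static unsigned example_cb_target_mask(void)
{
	unsigned mask = 0;
	mask |= 0xf << (0 * 4);	/* RAT0 (global pool)       -> 0x0000000f */
	mask |= 0xf << (3 * 4);	/* RAT3 (writable image 2)  -> 0x0000f00f */
	return mask;
}
#endif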

static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	state->atom.dirty = true;
}

static void evergreen_cs_set_constant_buffer(
	struct r600_context * rctx,
	unsigned cb_index,
	unsigned offset,
	unsigned size,
	struct pipe_resource * buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle,              /* get_handle */
	r600_compute_global_buffer_destroy,         /* resource_destroy */
	r600_compute_global_transfer_map,           /* transfer_map */
	r600_compute_global_transfer_flush_region,  /* transfer_flush_region */
	r600_compute_global_transfer_unmap,         /* transfer_unmap */
	r600_compute_global_transfer_inline_write   /* transfer_inline_write */
};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const unsigned char *code;
	unsigned i;

	shader->llvm_ctx = LLVMContextCreate();

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
							header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);

	for (i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
							code, header->num_bytes);
	}
#endif
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	if (!shader)
		return;

	FREE(shader->kernels);

#ifdef HAVE_OPENCL
	if (shader->llvm_ctx) {
		LLVMContextDispose(shader->llvm_ctx);
	}
#endif

	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for the implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx_->transfer_map(ctx_,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx_->transfer_unmap(ctx_, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}
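/* Illustrative sketch of the input buffer layout written above (hypothetical
 * struct, not used by the driver); it assumes uint is 32 bits, as the pointer
 * arithmetic in evergreen_compute_upload_input() does.
 */
#if 0
struct example_compute_input {
	uint32_t num_work_groups[3];	/* DWORDS 0-2: grid_layout[]            */
	uint32_t global_size[3];	/* DWORDS 3-5: grid * block per axis    */
	uint32_t local_size[3];		/* DWORDS 6-8: block_layout[]           */
	uint32_t kernel_params[];	/* DWORDS 9+ : shader->input_size bytes */
};
#endif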

static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords of lds.\n",
				num_pipes, num_waves, lds_size);

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}
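/* Worked example of the wave and LDS math above (illustrative numbers only):
 * with block_layout = {64, 2, 1} and num_pipes = 8, wave_divisor = 128 and
 * num_waves = ceil(128 / 128) = 1.  If the kernel declares 2048 bytes of
 * local memory and its bytecode needs 16 extra LDS dwords, then
 * lds_size = 2048 / 4 + 16 = 528 dwords, and SQ_LDS_ALLOC is programmed
 * with 528 | (1 << 14).
 */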

static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
	unsigned i;

	/* Make sure that the gfx ring is the only active one by flushing
	 * any pending DMA work. */
	if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RESOURCE_RW);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK.  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			R600_CONTEXT_INV_VERTEX_CACHE |
			R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, kernel->code_bo->gpu_address >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
					      kernel->code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_DATA));
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	struct r600_kernel *kernel = &shader->kernels[pc];

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

#ifdef HAVE_OPENCL

	if (!kernel->code_bo) {
		void *p;
		struct r600_bytecode *bc = &kernel->bc;
		LLVMModuleRef mod = kernel->llvm_module;
		boolean use_kill = false;
		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
		unsigned sb_disasm = use_sb ||
			(ctx->screen->b.debug_flags & DBG_SB_DISASM);

		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
			   ctx->screen->has_compressed_msaa_texturing);
		bc->type = TGSI_PROCESSOR_COMPUTE;
		bc->isa = ctx->isa;
		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);

		if (dump && !sb_disasm) {
			r600_bytecode_disasm(bc);
		} else if ((dump && sb_disasm) || use_sb) {
			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
				R600_ERR("r600_sb_bytecode_process failed!\n");
		}

		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
#endif
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (unsigned i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			/* XXX: Implement */
			assert(!"Compute samplers not implemented.");
			/* FETCH0 = VTX0 (param buffer),
			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
		}
	}
}


static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there. */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}
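/* Worked example of the handle patching above (illustrative numbers only):
 * if the state tracker stored a buffer-relative offset of 16 bytes in
 * handles[i], and the buffer's chunk starts at start_in_dw = 1024 in the
 * global pool, the value written back is 16 + 1024 * 4 = 4112, i.e. a byte
 * offset into the pool that the kernel can address through RAT0/VTX1, which
 * are bound to the whole pool just above.
 */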

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* Since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (ctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
					   ctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
					ctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (ctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (ctx->b.chip_class < CAYMAN) {
		/* Workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA |
			       S_0286E8_TGID_ENA |
			       S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that the hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware still uses this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
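	/* Field breakdown of the 0x1000FFF value above (sketch based on the
	 * r600-family ISA docs; treat the exact bit positions as an
	 * assumption rather than something this file defines):
	 *   bits  0-11: maximum count  = 0xFFF (4095)
	 *   bits 12-23: initial value  = 0x000
	 *   bits 24-31: increment      = 0x01
	 * i.e. 0x1000FFF == (1 << 24) | (0 << 12) | 0xfff.
	 */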
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->b.b.create_compute_state = evergreen_create_compute_state;
	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	ctx->b.b.set_global_binding = evergreen_set_global_binding;
	ctx->b.b.launch_grid = evergreen_launch_grid;
}

struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx_);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer = (struct r600_resource*)
					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx_, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called, it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}