/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 is for binding the smaller input parameter buffer and for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough; 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/

#ifdef HAVE_OPENCL
static void radeon_shader_binary_init(struct r600_shader_binary *b)
{
	memset(b, 0, sizeof(*b));
}

static void radeon_shader_binary_clean(struct r600_shader_binary *b)
{
	if (!b)
		return;
	FREE(b->code);
	FREE(b->config);
	FREE(b->rodata);
	FREE(b->global_symbol_offsets);
	FREE(b->relocs);
	FREE(b->disasm_string);
}
#endif

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}


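/* Bind a buffer as RAT `id`: attach it to the framebuffer state as color
 * buffer `id` with an R32_UINT surface and update the compute CB target
 * mask accordingly. */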
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

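/* Bind a buffer to one of the compute-shader vertex buffer slots and mark
 * the CS vertex buffer state dirty so it gets re-emitted. */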
static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

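/* Bind a range of a buffer as a compute-shader constant buffer. */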
static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850

#ifdef HAVE_OPENCL
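/* Collect the offsets (st_value) of all defined global symbols from the ELF
 * symbol table into binary->global_symbol_offsets, kept sorted in ascending
 * order, and update binary->global_symbol_count. */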
static void parse_symbol_table(Elf_Data *symbol_table_data,
				const GElf_Shdr *symbol_table_header,
				struct r600_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over-allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols.  The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
					symbol.st_value;

		/* Sort the list using bubble sort.  This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}


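/* Copy the .rel.text relocation entries (offset plus symbol name) into
 * binary->relocs. */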
static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			unsigned symbol_sh_link,
			struct r600_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
			sizeof(struct r600_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct r600_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

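/* Parse an in-memory ELF binary and fill in the r600_shader_binary fields:
 * code, config, rodata, disassembly, global symbols and relocations, plus
 * the per-symbol config size. */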
static void r600_elf_read(const char *elf_data, unsigned elf_size,
		 struct r600_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

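/* Return a pointer to the config words belonging to the kernel at the given
 * symbol offset; fall back to the start of the config section if the symbol
 * is not found. */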
static const unsigned char *r600_shader_binary_config_start(
	const struct r600_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}

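/* Walk the register/value pairs in the config section of a kernel and derive
 * the GPR count, stack size, LDS size and kill usage from them. */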
static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct r600_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

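/* Create a compute state object.  TGSI/NIR shaders go through the common
 * shader selector path; with HAVE_OPENCL, precompiled binaries are parsed
 * from ELF and their bytecode is uploaded to a VRAM buffer. */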
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
		bool compute_dirty;
		cstate->sel->ir_type = cstate->ir_type;
		if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
			R600_ERR("Failed to select compute shader\n");
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for the implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (!shader)
		return;
	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->buffer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the grid size (number of work groups) */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->buffer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

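/* Emit the VGT/SPI dispatch setup (thread group size, start coordinates and
 * LDS allocation) followed by the DISPATCH_DIRECT packet, using either the
 * direct grid or the grid read back from the indirect buffer. */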
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}

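/* Emit the color buffer state for the RATs bound through the framebuffer
 * state (only CB0-7 can be programmed here), invalidate the remaining CB
 * slots and set the CB target mask. */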
static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

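/* Build the command stream for one grid launch: select/update the compute
 * shader if needed, emit all compute-related state atoms and finally the
 * dispatch packet. */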
static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* Make sure the gfx ring is the only one active. */
	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
			R600_ERR("Failed to select compute shader\n");
			return;
		}

		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
			current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;

		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));

		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
		if (atomic_used_mask) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	} else
		r600_need_cs_space(rctx, 0, true, 0);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		use_kill = false;
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there. */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading; LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which SIMDs can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all SIMDs are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

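/* Map a global buffer.  Items that still live in the compute memory pool are
 * demoted to their own real buffer first, so the mapping only covers this
 * item rather than the whole pool. */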
void *r600_compute_global_transfer_map(struct pipe_context *ctx,
				      struct pipe_resource *resource,
				      unsigned level,
				      unsigned usage,
				      const struct pipe_box *box,
				      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	}
	else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_MAP_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
	CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	result->base.compute_global_bo = true;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}