/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2 and RAT1 are paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing,
because we reserve RAT0 for global bindings. With byte addressing enabled we
should reserve another one too => at most 10 image bindings for writing.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough; 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images also consume TEX slots, and VTX slots too because of linear indexing

*/
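
/* Summary of the fixed slots as actually programmed by the helpers below:
 *   RAT0        - global memory pool, writes  (evergreen_set_global_binding)
 *   RAT(i+1)    - the i-th writable surface   (evergreen_set_compute_resources)
 *   VTX1        - global memory pool, reads   (evergreen_set_global_binding)
 *   VTX2        - constants placed in the shader's text segment
 *   VTX3/CONST0 - kernel parameters           (evergreen_compute_upload_input;
 *                 VTX0 is reserved for them as well)
 *   VTX4+       - buffers bound through set_compute_resources
 */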

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
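	/* Each color buffer has a 4-bit component write mask in
	 * CB_TARGET_MASK, hence 0xf shifted by id * 4 below. */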
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * r600d.h alongside evergreend.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850

#ifdef HAVE_OPENCL
static void parse_symbol_table(Elf_Data *symbol_table_data,
				const GElf_Shdr *symbol_table_header,
				struct ac_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols.  The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
					symbol.st_value;

		/* Sort the list using bubble sort.  This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}


static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			unsigned symbol_sh_link,
			struct ac_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
			sizeof(struct ac_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct ac_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

static void r600_elf_read(const char *elf_data, unsigned elf_size,
		 struct ac_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

static const unsigned char *r600_shader_binary_config_start(
	const struct ac_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}

static void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct ac_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
	/* TODO: use util_memcpy_cpu_to_le32? */
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
		bool compute_dirty;

		r600_shader_select(ctx, cstate->sel, &compute_dirty);
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
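/* For example (hypothetical values), a 2x1x1 grid of 64x1x1 blocks with two
 * 4-byte kernel arguments would be uploaded as:
 *
 *   DWORDS 0-2: 2, 1, 1      (work groups per dimension)
 *   DWORDS 3-5: 128, 1, 1    (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1     (work items per group)
 *   DWORDS 9+ : the two user arguments
 */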
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (!shader)
		return;
	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
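	/* Each implicit section is 3 dwords (x, y, z), so each pointer below
	 * advances by 3 uint32_t elements: work-group count at dword 0,
	 * global size at dword 3, local size at dword 6 and the kernel
	 * arguments from dword 9 onwards. */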
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;
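	/* For example (hypothetical numbers), a 16x16x1 block on a part with
	 * num_pipes = 2 gives ceil(256 / 32) = 8 wavefronts per thread block. */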

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}
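	/* SQ_LDS_ALLOC packs the per-group LDS size (in dwords) into the low
	 * bits and the wavefront count starting at bit 14, matching the
	 * "lds_size | (num_waves << 14)" value emitted below. */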

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}

static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* make sure that the gfx ring is the only active one */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
		r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
			current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;

		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));

		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
		if (atomic_used_mask) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	} else
		r600_need_cs_space(rctx, 0, true, 0);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		use_kill = false;
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
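		/* (VTX3 and CONST0 hold the kernel parameters, with VTX0 also
		 * reserved for them, VTX1 the global memory pool for reads and
		 * VTX2 the constants in the text segment; see
		 * evergreen_set_global_binding() and
		 * evergreen_compute_upload_input().) */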
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

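	/* Patch each handle: the value passed in is an offset relative to the
	 * start of its buffer, and it is rewritten into a byte offset within
	 * the global memory pool (the chunk start is stored in dwords). */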
	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading; LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which SIMDs can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all SIMDs are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently
	 * use this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
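	/* 0x1000FFF below encodes increment = 1, initial value = 0 and a
	 * maximum count of 0xFFF (4095); this decode assumes the usual
	 * SQ_LOOP_CONST layout (count in bits 0-11, init in 12-23, increment
	 * in 24-31). */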
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global objects are not real resources, they
	 * just map to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable which calls r600_buffer_transfer_map() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						      struct pipe_transfer *transfer,
						      const struct pipe_box *box)
{
	assert(0 && "TODO");
}

static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
};

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

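	/* Round the size in bytes up to a whole number of dwords; the compute
	 * memory pool allocates in dword units. */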
	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}