1848b8605Smrg/*
2848b8605Smrg * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3848b8605Smrg *
4848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5848b8605Smrg * copy of this software and associated documentation files (the "Software"),
6848b8605Smrg * to deal in the Software without restriction, including without limitation
7848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub
8848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom
9848b8605Smrg * the Software is furnished to do so, subject to the following conditions:
10848b8605Smrg *
11848b8605Smrg * The above copyright notice and this permission notice (including the next
12848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the
13848b8605Smrg * Software.
14848b8605Smrg *
15848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18848b8605Smrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
22848b8605Smrg *
23848b8605Smrg * Authors:
24848b8605Smrg *      Adam Rak <adam.rak@streamnovation.com>
25848b8605Smrg */
26848b8605Smrg
27b8e80941Smrg#ifdef HAVE_OPENCL
28b8e80941Smrg#include <gelf.h>
29b8e80941Smrg#include <libelf.h>
30b8e80941Smrg#endif
31848b8605Smrg#include <stdio.h>
32848b8605Smrg#include <errno.h>
33848b8605Smrg#include "pipe/p_defines.h"
34848b8605Smrg#include "pipe/p_state.h"
35848b8605Smrg#include "pipe/p_context.h"
36848b8605Smrg#include "util/u_blitter.h"
37b8e80941Smrg#include "util/list.h"
38848b8605Smrg#include "util/u_transfer.h"
39848b8605Smrg#include "util/u_surface.h"
40848b8605Smrg#include "util/u_pack_color.h"
41848b8605Smrg#include "util/u_memory.h"
42848b8605Smrg#include "util/u_inlines.h"
43848b8605Smrg#include "util/u_framebuffer.h"
44b8e80941Smrg#include "tgsi/tgsi_parse.h"
45848b8605Smrg#include "pipebuffer/pb_buffer.h"
46848b8605Smrg#include "evergreend.h"
47848b8605Smrg#include "r600_shader.h"
48848b8605Smrg#include "r600_pipe.h"
49848b8605Smrg#include "r600_formats.h"
50848b8605Smrg#include "evergreen_compute.h"
51848b8605Smrg#include "evergreen_compute_internal.h"
52848b8605Smrg#include "compute_memory_pool.h"
53848b8605Smrg#include "sb/sb_public.h"
54848b8605Smrg#include <inttypes.h>
55848b8605Smrg
56848b8605Smrg/**
57848b8605SmrgRAT0 is for global binding write
58848b8605SmrgVTX1 is for global binding read
59848b8605Smrg
for writing images RAT1...
61848b8605Smrgfor reading images TEX2...
62848b8605Smrg  TEX2-RAT1 is paired
63848b8605Smrg
64848b8605SmrgTEX2... consumes the same fetch resources, that VTX2... would consume
65848b8605Smrg
66848b8605SmrgCONST0 and VTX0 is for parameters
67848b8605Smrg  CONST0 is binding smaller input parameter buffer, and for constant indexing,
68848b8605Smrg  also constant cached
69848b8605Smrg  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
70848b8605Smrg  the constant cache can handle
71848b8605Smrg
72848b8605SmrgRAT-s are limited to 12, so we can only bind at most 11 texture for writing
73848b8605Smrgbecause we reserve RAT0 for global bindings. With byteaddressing enabled,
we should reserve another one too => at most 10 image bindings for writing.
75848b8605Smrg
76848b8605Smrgfrom Nvidia OpenCL:
77848b8605Smrg  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
78848b8605Smrg  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
79848b8605Smrg
80848b8605Smrgso 10 for writing is enough. 176 is the max for reading according to the docs
81848b8605Smrg
82848b8605Smrgwritable images should be listed first < 10, so their id corresponds to RAT(id+1)
83848b8605Smrgwritable images will consume TEX slots, VTX slots too because of linear indexing
84848b8605Smrg
85848b8605Smrg*/
86848b8605Smrg
87b8e80941Smrgstruct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
88b8e80941Smrg						     unsigned size)
89848b8605Smrg{
90b8e80941Smrg	struct pipe_resource *buffer = NULL;
91848b8605Smrg	assert(size);
92848b8605Smrg
93b8e80941Smrg	buffer = pipe_buffer_create((struct pipe_screen*) screen,
94b8e80941Smrg				    0, PIPE_USAGE_IMMUTABLE, size);
95848b8605Smrg
96848b8605Smrg	return (struct r600_resource *)buffer;
97848b8605Smrg}
98848b8605Smrg
99848b8605Smrg
100b8e80941Smrgstatic void evergreen_set_rat(struct r600_pipe_compute *pipe,
101b8e80941Smrg			      unsigned id,
102b8e80941Smrg			      struct r600_resource *bo,
103b8e80941Smrg			      int start,
104b8e80941Smrg			      int size)
105848b8605Smrg{
106848b8605Smrg	struct pipe_surface rat_templ;
107848b8605Smrg	struct r600_surface *surf = NULL;
108848b8605Smrg	struct r600_context *rctx = NULL;
109848b8605Smrg
110848b8605Smrg	assert(id < 12);
111848b8605Smrg	assert((size & 3) == 0);
112848b8605Smrg	assert((start & 0xFF) == 0);
113848b8605Smrg
114848b8605Smrg	rctx = pipe->ctx;
115848b8605Smrg
116848b8605Smrg	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
117848b8605Smrg
118848b8605Smrg	/* Create the RAT surface */
119848b8605Smrg	memset(&rat_templ, 0, sizeof(rat_templ));
120848b8605Smrg	rat_templ.format = PIPE_FORMAT_R32_UINT;
121848b8605Smrg	rat_templ.u.tex.level = 0;
122848b8605Smrg	rat_templ.u.tex.first_layer = 0;
123848b8605Smrg	rat_templ.u.tex.last_layer = 0;
124848b8605Smrg
125b8e80941Smrg	/* Add the RAT the list of color buffers. Drop the old buffer first. */
126b8e80941Smrg	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
127848b8605Smrg	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
128848b8605Smrg		(struct pipe_context *)pipe->ctx,
129848b8605Smrg		(struct pipe_resource *)bo, &rat_templ);
130848b8605Smrg
131848b8605Smrg	/* Update the number of color buffers */
132848b8605Smrg	pipe->ctx->framebuffer.state.nr_cbufs =
133848b8605Smrg		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
134848b8605Smrg
135848b8605Smrg	/* Update the cb_target_mask
136848b8605Smrg	 * XXX: I think this is a potential spot for bugs once we start doing
137848b8605Smrg	 * GL interop.  cb_target_mask may be modified in the 3D sections
138848b8605Smrg	 * of this driver. */
139848b8605Smrg	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
140848b8605Smrg
141848b8605Smrg	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
142848b8605Smrg	evergreen_init_color_surface_rat(rctx, surf);
143848b8605Smrg}
144848b8605Smrg
145b8e80941Smrgstatic void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
146b8e80941Smrg					   unsigned vb_index,
147b8e80941Smrg					   unsigned offset,
148b8e80941Smrg					   struct pipe_resource *buffer)
149848b8605Smrg{
150848b8605Smrg	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
151848b8605Smrg	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
152848b8605Smrg	vb->stride = 1;
153848b8605Smrg	vb->buffer_offset = offset;
154b8e80941Smrg	vb->buffer.resource = buffer;
155b8e80941Smrg	vb->is_user_buffer = false;
156848b8605Smrg
157848b8605Smrg	/* The vertex instructions in the compute shaders use the texture cache,
158848b8605Smrg	 * so we need to invalidate it. */
159848b8605Smrg	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
160848b8605Smrg	state->enabled_mask |= 1 << vb_index;
161848b8605Smrg	state->dirty_mask |= 1 << vb_index;
162b8e80941Smrg	r600_mark_atom_dirty(rctx, &state->atom);
163848b8605Smrg}
164848b8605Smrg
165b8e80941Smrgstatic void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
166b8e80941Smrg					     unsigned cb_index,
167b8e80941Smrg					     unsigned offset,
168b8e80941Smrg					     unsigned size,
169b8e80941Smrg					     struct pipe_resource *buffer)
170848b8605Smrg{
171848b8605Smrg	struct pipe_constant_buffer cb;
172848b8605Smrg	cb.buffer_size = size;
173848b8605Smrg	cb.buffer_offset = offset;
174848b8605Smrg	cb.buffer = buffer;
175848b8605Smrg	cb.user_buffer = NULL;
176848b8605Smrg
177848b8605Smrg	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
178848b8605Smrg}
179848b8605Smrg
180b8e80941Smrg/* We need to define these R600 registers here, because we can't include
181b8e80941Smrg * evergreend.h and r600d.h.
182b8e80941Smrg */
183b8e80941Smrg#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
184b8e80941Smrg#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
185b8e80941Smrg
186b8e80941Smrg#ifdef HAVE_OPENCL
187b8e80941Smrgstatic void parse_symbol_table(Elf_Data *symbol_table_data,
188b8e80941Smrg				const GElf_Shdr *symbol_table_header,
189b8e80941Smrg				struct ac_shader_binary *binary)
190848b8605Smrg{
191b8e80941Smrg	GElf_Sym symbol;
192b8e80941Smrg	unsigned i = 0;
193b8e80941Smrg	unsigned symbol_count =
194b8e80941Smrg		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
195b8e80941Smrg
196b8e80941Smrg	/* We are over allocating this list, because symbol_count gives the
197b8e80941Smrg	 * total number of symbols, and we will only be filling the list
198b8e80941Smrg	 * with offsets of global symbols.  The memory savings from
199b8e80941Smrg	 * allocating the correct size of this list will be small, and
200b8e80941Smrg	 * I don't think it is worth the cost of pre-computing the number
201b8e80941Smrg	 * of global symbols.
202b8e80941Smrg	 */
203b8e80941Smrg	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
204b8e80941Smrg
205b8e80941Smrg	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
206b8e80941Smrg		unsigned i;
207b8e80941Smrg		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
208b8e80941Smrg		    symbol.st_shndx == 0 /* Undefined symbol */) {
209b8e80941Smrg			continue;
210b8e80941Smrg		}
211848b8605Smrg
212b8e80941Smrg		binary->global_symbol_offsets[binary->global_symbol_count] =
213b8e80941Smrg					symbol.st_value;
214848b8605Smrg
215b8e80941Smrg		/* Sort the list using bubble sort.  This list will usually
216b8e80941Smrg		 * be small. */
217b8e80941Smrg		for (i = binary->global_symbol_count; i > 0; --i) {
218b8e80941Smrg			uint64_t lhs = binary->global_symbol_offsets[i - 1];
219b8e80941Smrg			uint64_t rhs = binary->global_symbol_offsets[i];
220b8e80941Smrg			if (lhs < rhs) {
221b8e80941Smrg				break;
222b8e80941Smrg			}
223b8e80941Smrg			binary->global_symbol_offsets[i] = lhs;
224b8e80941Smrg			binary->global_symbol_offsets[i - 1] = rhs;
225b8e80941Smrg		}
226b8e80941Smrg		++binary->global_symbol_count;
227b8e80941Smrg	}
228b8e80941Smrg}
229b8e80941Smrg
230b8e80941Smrg
231b8e80941Smrgstatic void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
232b8e80941Smrg			unsigned symbol_sh_link,
233b8e80941Smrg			struct ac_shader_binary *binary)
234848b8605Smrg{
235b8e80941Smrg	unsigned i;
236848b8605Smrg
237b8e80941Smrg	if (!relocs || !symbols || !binary->reloc_count) {
238b8e80941Smrg		return;
239b8e80941Smrg	}
240b8e80941Smrg	binary->relocs = CALLOC(binary->reloc_count,
241b8e80941Smrg			sizeof(struct ac_shader_reloc));
242b8e80941Smrg	for (i = 0; i < binary->reloc_count; i++) {
243b8e80941Smrg		GElf_Sym symbol;
244b8e80941Smrg		GElf_Rel rel;
245b8e80941Smrg		char *symbol_name;
246b8e80941Smrg		struct ac_shader_reloc *reloc = &binary->relocs[i];
247b8e80941Smrg
248b8e80941Smrg		gelf_getrel(relocs, i, &rel);
249b8e80941Smrg		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
250b8e80941Smrg		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
251b8e80941Smrg
252b8e80941Smrg		reloc->offset = rel.r_offset;
253b8e80941Smrg		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
254b8e80941Smrg		reloc->name[sizeof(reloc->name)-1] = 0;
255b8e80941Smrg	}
256b8e80941Smrg}
257b8e80941Smrg
258b8e80941Smrgstatic void r600_elf_read(const char *elf_data, unsigned elf_size,
259b8e80941Smrg		 struct ac_shader_binary *binary)
260b8e80941Smrg{
261b8e80941Smrg	char *elf_buffer;
262b8e80941Smrg	Elf *elf;
263b8e80941Smrg	Elf_Scn *section = NULL;
264b8e80941Smrg	Elf_Data *symbols = NULL, *relocs = NULL;
265b8e80941Smrg	size_t section_str_index;
266b8e80941Smrg	unsigned symbol_sh_link = 0;
267b8e80941Smrg
268b8e80941Smrg	/* One of the libelf implementations
269b8e80941Smrg	 * (http://www.mr511.de/software/english.htm) requires calling
270b8e80941Smrg	 * elf_version() before elf_memory().
271b8e80941Smrg	 */
272b8e80941Smrg	elf_version(EV_CURRENT);
273b8e80941Smrg	elf_buffer = MALLOC(elf_size);
274b8e80941Smrg	memcpy(elf_buffer, elf_data, elf_size);
275b8e80941Smrg
276b8e80941Smrg	elf = elf_memory(elf_buffer, elf_size);
277b8e80941Smrg
278b8e80941Smrg	elf_getshdrstrndx(elf, &section_str_index);
279b8e80941Smrg
280b8e80941Smrg	while ((section = elf_nextscn(elf, section))) {
281b8e80941Smrg		const char *name;
282b8e80941Smrg		Elf_Data *section_data = NULL;
283b8e80941Smrg		GElf_Shdr section_header;
284b8e80941Smrg		if (gelf_getshdr(section, &section_header) != &section_header) {
285b8e80941Smrg			fprintf(stderr, "Failed to read ELF section header\n");
286b8e80941Smrg			return;
287b8e80941Smrg		}
288b8e80941Smrg		name = elf_strptr(elf, section_str_index, section_header.sh_name);
289b8e80941Smrg		if (!strcmp(name, ".text")) {
290b8e80941Smrg			section_data = elf_getdata(section, section_data);
291b8e80941Smrg			binary->code_size = section_data->d_size;
292b8e80941Smrg			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
293b8e80941Smrg			memcpy(binary->code, section_data->d_buf, binary->code_size);
294b8e80941Smrg		} else if (!strcmp(name, ".AMDGPU.config")) {
295b8e80941Smrg			section_data = elf_getdata(section, section_data);
296b8e80941Smrg			binary->config_size = section_data->d_size;
297b8e80941Smrg			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
298b8e80941Smrg			memcpy(binary->config, section_data->d_buf, binary->config_size);
299b8e80941Smrg		} else if (!strcmp(name, ".AMDGPU.disasm")) {
300b8e80941Smrg			/* Always read disassembly if it's available. */
301b8e80941Smrg			section_data = elf_getdata(section, section_data);
302b8e80941Smrg			binary->disasm_string = strndup(section_data->d_buf,
303b8e80941Smrg							section_data->d_size);
304b8e80941Smrg		} else if (!strncmp(name, ".rodata", 7)) {
305b8e80941Smrg			section_data = elf_getdata(section, section_data);
306b8e80941Smrg			binary->rodata_size = section_data->d_size;
307b8e80941Smrg			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
308b8e80941Smrg			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
309b8e80941Smrg		} else if (!strncmp(name, ".symtab", 7)) {
310b8e80941Smrg			symbols = elf_getdata(section, section_data);
311b8e80941Smrg			symbol_sh_link = section_header.sh_link;
312b8e80941Smrg			parse_symbol_table(symbols, &section_header, binary);
313b8e80941Smrg		} else if (!strcmp(name, ".rel.text")) {
314b8e80941Smrg			relocs = elf_getdata(section, section_data);
315b8e80941Smrg			binary->reloc_count = section_header.sh_size /
316b8e80941Smrg					section_header.sh_entsize;
317b8e80941Smrg		}
318b8e80941Smrg	}
319b8e80941Smrg
320b8e80941Smrg	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
321b8e80941Smrg
322b8e80941Smrg	if (elf){
323b8e80941Smrg		elf_end(elf);
324b8e80941Smrg	}
325b8e80941Smrg	FREE(elf_buffer);
326b8e80941Smrg
327b8e80941Smrg	/* Cache the config size per symbol */
328b8e80941Smrg	if (binary->global_symbol_count) {
329b8e80941Smrg		binary->config_size_per_symbol =
330b8e80941Smrg			binary->config_size / binary->global_symbol_count;
331b8e80941Smrg	} else {
332b8e80941Smrg		binary->global_symbol_count = 1;
333b8e80941Smrg		binary->config_size_per_symbol = binary->config_size;
334b8e80941Smrg	}
335b8e80941Smrg}
336b8e80941Smrg
337b8e80941Smrgstatic const unsigned char *r600_shader_binary_config_start(
338b8e80941Smrg	const struct ac_shader_binary *binary,
339b8e80941Smrg	uint64_t symbol_offset)
340b8e80941Smrg{
341848b8605Smrg	unsigned i;
342b8e80941Smrg	for (i = 0; i < binary->global_symbol_count; ++i) {
343b8e80941Smrg		if (binary->global_symbol_offsets[i] == symbol_offset) {
344b8e80941Smrg			unsigned offset = i * binary->config_size_per_symbol;
345b8e80941Smrg			return binary->config + offset;
346b8e80941Smrg		}
347b8e80941Smrg	}
348b8e80941Smrg	return binary->config;
349b8e80941Smrg}
350848b8605Smrg
351b8e80941Smrgstatic void r600_shader_binary_read_config(const struct ac_shader_binary *binary,
352b8e80941Smrg					   struct r600_bytecode *bc,
353b8e80941Smrg					   uint64_t symbol_offset,
354b8e80941Smrg					   boolean *use_kill)
355b8e80941Smrg{
356b8e80941Smrg       unsigned i;
357b8e80941Smrg       const unsigned char *config =
358b8e80941Smrg               r600_shader_binary_config_start(binary, symbol_offset);
359b8e80941Smrg
360b8e80941Smrg       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
361b8e80941Smrg               unsigned reg =
362b8e80941Smrg                       util_le32_to_cpu(*(uint32_t*)(config + i));
363b8e80941Smrg               unsigned value =
364b8e80941Smrg                       util_le32_to_cpu(*(uint32_t*)(config + i + 4));
365b8e80941Smrg               switch (reg) {
366b8e80941Smrg               /* R600 / R700 */
367b8e80941Smrg               case R_028850_SQ_PGM_RESOURCES_PS:
368b8e80941Smrg               case R_028868_SQ_PGM_RESOURCES_VS:
369b8e80941Smrg               /* Evergreen / Northern Islands */
370b8e80941Smrg               case R_028844_SQ_PGM_RESOURCES_PS:
371b8e80941Smrg               case R_028860_SQ_PGM_RESOURCES_VS:
372b8e80941Smrg               case R_0288D4_SQ_PGM_RESOURCES_LS:
373b8e80941Smrg                       bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
374b8e80941Smrg                       bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
375b8e80941Smrg                       break;
376b8e80941Smrg               case R_02880C_DB_SHADER_CONTROL:
377b8e80941Smrg                       *use_kill = G_02880C_KILL_ENABLE(value);
378b8e80941Smrg                       break;
379b8e80941Smrg               case R_0288E8_SQ_LDS_ALLOC:
380b8e80941Smrg                       bc->nlds_dw = value;
381b8e80941Smrg                       break;
382b8e80941Smrg               }
383b8e80941Smrg       }
384b8e80941Smrg}
385848b8605Smrg
386b8e80941Smrgstatic unsigned r600_create_shader(struct r600_bytecode *bc,
387b8e80941Smrg				   const struct ac_shader_binary *binary,
388b8e80941Smrg				   boolean *use_kill)
389b8e80941Smrg
390b8e80941Smrg{
391b8e80941Smrg	assert(binary->code_size % 4 == 0);
392b8e80941Smrg	bc->bytecode = CALLOC(1, binary->code_size);
393b8e80941Smrg	memcpy(bc->bytecode, binary->code, binary->code_size);
394b8e80941Smrg	bc->ndw = binary->code_size / 4;
395b8e80941Smrg
396b8e80941Smrg	r600_shader_binary_read_config(binary, bc, 0, use_kill);
397b8e80941Smrg	return 0;
398b8e80941Smrg}
399848b8605Smrg
400848b8605Smrg#endif
401848b8605Smrg
402b8e80941Smrgstatic void r600_destroy_shader(struct r600_bytecode *bc)
403b8e80941Smrg{
404b8e80941Smrg	FREE(bc->bytecode);
405b8e80941Smrg}
406b8e80941Smrg
/* pipe_context::create_compute_state implementation.
 *
 * For PIPE_SHADER_IR_TGSI the shader is wrapped in a shader selector and
 * compiled later at bind/dispatch time.  Otherwise (OpenCL/LLVM path) the
 * incoming blob is an ELF object: it is parsed, the bytecode is extracted
 * and immediately uploaded to a VRAM buffer (shader->code_bo).
 */
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	/* Memory requirements as declared by the state tracker. */
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
		/* TGSI path: defer compilation to the shader selector. */
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	/* The LLVM blob is a pipe_llvm_program_header followed by ELF data. */
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	return shader;
}
451848b8605Smrg
452b8e80941Smrgstatic void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
453848b8605Smrg{
454b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
455b8e80941Smrg	struct r600_pipe_compute *shader = state;
456b8e80941Smrg
457b8e80941Smrg	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
458848b8605Smrg
459848b8605Smrg	if (!shader)
460848b8605Smrg		return;
461848b8605Smrg
462b8e80941Smrg	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
463b8e80941Smrg		r600_delete_shader_selector(ctx, shader->sel);
464b8e80941Smrg	} else {
465848b8605Smrg#ifdef HAVE_OPENCL
466b8e80941Smrg		radeon_shader_binary_clean(&shader->binary);
467b8e80941Smrg		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
468b8e80941Smrg		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
469848b8605Smrg#endif
470b8e80941Smrg		r600_destroy_shader(&shader->bc);
471b8e80941Smrg	}
472848b8605Smrg	FREE(shader);
473848b8605Smrg}
474848b8605Smrg
475b8e80941Smrgstatic void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
476848b8605Smrg{
477b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
478b8e80941Smrg	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
479b8e80941Smrg	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
480b8e80941Smrg
481b8e80941Smrg	if (!state) {
482b8e80941Smrg		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
483b8e80941Smrg		return;
484b8e80941Smrg	}
485b8e80941Smrg
486b8e80941Smrg	if (cstate->ir_type == PIPE_SHADER_IR_TGSI) {
487b8e80941Smrg		bool compute_dirty;
488848b8605Smrg
489b8e80941Smrg		r600_shader_select(ctx, cstate->sel, &compute_dirty);
490b8e80941Smrg	}
491848b8605Smrg
492b8e80941Smrg	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
493848b8605Smrg}
494848b8605Smrg
/* The kernel parameters are stored in a vtx buffer (ID=0); besides the explicit
496848b8605Smrg * kernel parameters there are implicit parameters that need to be stored
497848b8605Smrg * in the vertex buffer as well.  Here is how these parameters are organized in
498848b8605Smrg * the buffer:
499848b8605Smrg *
500848b8605Smrg * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
501848b8605Smrg * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
502848b8605Smrg * DWORDS 6-8: Number of work items within each work group in each dimension
503848b8605Smrg *             (x,y,z)
504848b8605Smrg * DWORDS 9+ : Kernel parameters
505848b8605Smrg */
506b8e80941Smrgstatic void evergreen_compute_upload_input(struct pipe_context *ctx,
507b8e80941Smrg					   const struct pipe_grid_info *info)
508848b8605Smrg{
509b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
510b8e80941Smrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
511848b8605Smrg	unsigned i;
512848b8605Smrg	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
513848b8605Smrg	 * parameters.
514848b8605Smrg	 */
515b8e80941Smrg	unsigned input_size;
516b8e80941Smrg	uint32_t *num_work_groups_start;
517b8e80941Smrg	uint32_t *global_size_start;
518b8e80941Smrg	uint32_t *local_size_start;
519b8e80941Smrg	uint32_t *kernel_parameters_start;
520848b8605Smrg	struct pipe_box box;
521848b8605Smrg	struct pipe_transfer *transfer = NULL;
522848b8605Smrg
523b8e80941Smrg	if (!shader)
524b8e80941Smrg		return;
525848b8605Smrg	if (shader->input_size == 0) {
526848b8605Smrg		return;
527848b8605Smrg	}
528b8e80941Smrg	input_size = shader->input_size + 36;
529848b8605Smrg	if (!shader->kernel_param) {
530848b8605Smrg		/* Add space for the grid dimensions */
531848b8605Smrg		shader->kernel_param = (struct r600_resource *)
532b8e80941Smrg			pipe_buffer_create(ctx->screen, 0,
533848b8605Smrg					PIPE_USAGE_IMMUTABLE, input_size);
534848b8605Smrg	}
535848b8605Smrg
536848b8605Smrg	u_box_1d(0, input_size, &box);
537b8e80941Smrg	num_work_groups_start = ctx->transfer_map(ctx,
538848b8605Smrg			(struct pipe_resource*)shader->kernel_param,
539848b8605Smrg			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
540848b8605Smrg			&box, &transfer);
541848b8605Smrg	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
542848b8605Smrg	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
543848b8605Smrg	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
544848b8605Smrg
545848b8605Smrg	/* Copy the work group size */
546b8e80941Smrg	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
547848b8605Smrg
548848b8605Smrg	/* Copy the global size */
549848b8605Smrg	for (i = 0; i < 3; i++) {
550b8e80941Smrg		global_size_start[i] = info->grid[i] * info->block[i];
551848b8605Smrg	}
552848b8605Smrg
553848b8605Smrg	/* Copy the local dimensions */
554b8e80941Smrg	memcpy(local_size_start, info->block, 3 * sizeof(uint));
555848b8605Smrg
556848b8605Smrg	/* Copy the kernel inputs */
557b8e80941Smrg	memcpy(kernel_parameters_start, info->input, shader->input_size);
558848b8605Smrg
559848b8605Smrg	for (i = 0; i < (input_size / 4); i++) {
560b8e80941Smrg		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
561848b8605Smrg			((unsigned*)num_work_groups_start)[i]);
562848b8605Smrg	}
563848b8605Smrg
564b8e80941Smrg	ctx->transfer_unmap(ctx, transfer);
565848b8605Smrg
566b8e80941Smrg	/* ID=0 and ID=3 are reserved for the parameters.
567b8e80941Smrg	 * LLVM will preferably use ID=0, but it does not work for dynamic
568b8e80941Smrg	 * indices. */
569b8e80941Smrg	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
570b8e80941Smrg			(struct pipe_resource*)shader->kernel_param);
571b8e80941Smrg	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
572848b8605Smrg			(struct pipe_resource*)shader->kernel_param);
573848b8605Smrg}
574848b8605Smrg
/* Emit the VGT/SPI state and the DISPATCH packet for one compute launch.
 *
 * @indirect_grid holds the grid size read back from the indirect buffer;
 * it is only used when info->indirect is set.  The register write sequence
 * is order-dependent; do not reorder.
 */
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* LDS allocation is in dwords; local_size is in bytes. */
	unsigned lds_size = shader->local_size / 4;

	/* The LLVM path additionally reports its own LDS usage via the
	 * config section (SQ_LDS_ALLOC). */
	if (shader->ir_type != PIPE_SHADER_IR_TGSI)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	if (info->indirect) {
		/* Indirect dispatch: grid sizes come from @indirect_grid. */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}
656848b8605Smrg
/* Emit the color-buffer (RAT) register state for a compute dispatch:
 * program CB0-CB7 from the framebuffer state, invalidate the remaining
 * slots, and set CB_TARGET_MASK from compute_cb_target_mask. */
static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		/* Track the backing buffer so the kernel keeps it resident. */
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		/* Each CBn register block is 0x3C bytes apart. */
		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		/* Relocation NOPs so the winsys can patch the addresses. */
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	/* Mark unused slots as invalid (CB8-11 use a different stride). */
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}
697b8e80941Smrg
698b8e80941Smrgstatic void compute_emit_cs(struct r600_context *rctx,
699b8e80941Smrg			    const struct pipe_grid_info *info)
700b8e80941Smrg{
701b8e80941Smrg	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
702b8e80941Smrg	bool compute_dirty = false;
703b8e80941Smrg	struct r600_pipe_shader *current;
704b8e80941Smrg	struct r600_shader_atomic combined_atomics[8];
705b8e80941Smrg	uint8_t atomic_used_mask;
706b8e80941Smrg	uint32_t indirect_grid[3] = { 0, 0, 0 };
707b8e80941Smrg
708b8e80941Smrg	/* make sure that the gfx ring is only one active */
709b8e80941Smrg	if (radeon_emitted(rctx->b.dma.cs, 0)) {
710b8e80941Smrg		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
711b8e80941Smrg	}
712b8e80941Smrg
713b8e80941Smrg	r600_update_compressed_resource_state(rctx, true);
714b8e80941Smrg
715b8e80941Smrg	if (!rctx->cmd_buf_is_compute) {
716b8e80941Smrg		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
717b8e80941Smrg		rctx->cmd_buf_is_compute = true;
718b8e80941Smrg	}
719b8e80941Smrg
720b8e80941Smrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
721b8e80941Smrg		r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
722b8e80941Smrg		current = rctx->cs_shader_state.shader->sel->current;
723b8e80941Smrg		if (compute_dirty) {
724b8e80941Smrg			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
725b8e80941Smrg			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
726b8e80941Smrg			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
727b8e80941Smrg		}
728b8e80941Smrg
729b8e80941Smrg		bool need_buf_const = current->shader.uses_tex_buffers ||
730b8e80941Smrg			current->shader.has_txq_cube_array_z_comp;
731b8e80941Smrg
732b8e80941Smrg		if (info->indirect) {
733b8e80941Smrg			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
734b8e80941Smrg			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ);
735b8e80941Smrg			unsigned offset = info->indirect_offset / 4;
736b8e80941Smrg			indirect_grid[0] = data[offset];
737b8e80941Smrg			indirect_grid[1] = data[offset + 1];
738b8e80941Smrg			indirect_grid[2] = data[offset + 2];
739b8e80941Smrg		}
740b8e80941Smrg		for (int i = 0; i < 3; i++) {
741b8e80941Smrg			rctx->cs_block_grid_sizes[i] = info->block[i];
742b8e80941Smrg			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
743b8e80941Smrg		}
744b8e80941Smrg		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
745b8e80941Smrg		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
746b8e80941Smrg
747b8e80941Smrg		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
748b8e80941Smrg		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
749b8e80941Smrg
750b8e80941Smrg		if (need_buf_const) {
751b8e80941Smrg			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
752848b8605Smrg		}
753b8e80941Smrg		r600_update_driver_const_buffers(rctx, true);
754b8e80941Smrg
755b8e80941Smrg		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
756b8e80941Smrg		if (atomic_used_mask) {
757b8e80941Smrg			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
758b8e80941Smrg			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
759848b8605Smrg		}
760b8e80941Smrg	} else
761b8e80941Smrg		r600_need_cs_space(rctx, 0, true, 0);
762b8e80941Smrg
763b8e80941Smrg	/* Initialize all the compute-related registers.
764b8e80941Smrg	 *
765b8e80941Smrg	 * See evergreen_init_atom_start_compute_cs() in this file for the list
766b8e80941Smrg	 * of registers initialized by the start_compute_cs_cmd atom.
767b8e80941Smrg	 */
768b8e80941Smrg	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
769b8e80941Smrg
770b8e80941Smrg	/* emit config state */
771b8e80941Smrg	if (rctx->b.chip_class == EVERGREEN) {
772b8e80941Smrg		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
773b8e80941Smrg			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
774b8e80941Smrg			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
775b8e80941Smrg			radeon_emit(cs, 0);
776b8e80941Smrg			radeon_emit(cs, 0);
777b8e80941Smrg			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
778b8e80941Smrg		} else
779b8e80941Smrg			r600_emit_atom(rctx, &rctx->config_state.atom);
780848b8605Smrg	}
781848b8605Smrg
782b8e80941Smrg	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
783b8e80941Smrg	r600_flush_emit(rctx);
784848b8605Smrg
785b8e80941Smrg	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) {
786848b8605Smrg
787b8e80941Smrg		compute_setup_cbs(rctx);
788b8e80941Smrg
789b8e80941Smrg		/* Emit vertex buffer state */
790b8e80941Smrg		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
791b8e80941Smrg		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
792b8e80941Smrg	} else {
793b8e80941Smrg		uint32_t rat_mask;
794b8e80941Smrg
795b8e80941Smrg		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
796b8e80941Smrg		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
797b8e80941Smrg					       rat_mask);
798b8e80941Smrg	}
799b8e80941Smrg
800b8e80941Smrg	r600_emit_atom(rctx, &rctx->b.render_cond_atom);
801848b8605Smrg
802848b8605Smrg	/* Emit constant buffer state */
803b8e80941Smrg	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
804b8e80941Smrg
805b8e80941Smrg	/* Emit sampler state */
806b8e80941Smrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
807b8e80941Smrg
808b8e80941Smrg	/* Emit sampler view (texture resource) state */
809b8e80941Smrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
810b8e80941Smrg
811b8e80941Smrg	/* Emit images state */
812b8e80941Smrg	r600_emit_atom(rctx, &rctx->compute_images.atom);
813b8e80941Smrg
814b8e80941Smrg	/* Emit buffers state */
815b8e80941Smrg	r600_emit_atom(rctx, &rctx->compute_buffers.atom);
816848b8605Smrg
817b8e80941Smrg	/* Emit shader state */
818b8e80941Smrg	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
819848b8605Smrg
820848b8605Smrg	/* Emit dispatch state and dispatch packet */
821b8e80941Smrg	evergreen_emit_dispatch(rctx, info, indirect_grid);
822848b8605Smrg
823848b8605Smrg	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
824848b8605Smrg	 */
825b8e80941Smrg	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
826848b8605Smrg		      R600_CONTEXT_INV_VERTEX_CACHE |
827848b8605Smrg	              R600_CONTEXT_INV_TEX_CACHE;
828b8e80941Smrg	r600_flush_emit(rctx);
829b8e80941Smrg	rctx->b.flags = 0;
830848b8605Smrg
831b8e80941Smrg	if (rctx->b.chip_class >= CAYMAN) {
832b8e80941Smrg		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
833b8e80941Smrg		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
834848b8605Smrg		/* DEALLOC_STATE prevents the GPU from hanging when a
835848b8605Smrg		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
836848b8605Smrg		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
837848b8605Smrg		 */
838b8e80941Smrg		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
839b8e80941Smrg		radeon_emit(cs, 0);
840848b8605Smrg	}
841b8e80941Smrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI)
842b8e80941Smrg		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
843848b8605Smrg
844848b8605Smrg#if 0
845b8e80941Smrg	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
846848b8605Smrg	for (i = 0; i < cs->cdw; i++) {
847b8e80941Smrg		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
848848b8605Smrg	}
849848b8605Smrg#endif
850848b8605Smrg
851848b8605Smrg}
852848b8605Smrg
853848b8605Smrg
854848b8605Smrg/**
855848b8605Smrg * Emit function for r600_cs_shader_state atom
856848b8605Smrg */
857b8e80941Smrgvoid evergreen_emit_cs_shader(struct r600_context *rctx,
858b8e80941Smrg			      struct r600_atom *atom)
859848b8605Smrg{
860848b8605Smrg	struct r600_cs_shader_state *state =
861848b8605Smrg					(struct r600_cs_shader_state*)atom;
862848b8605Smrg	struct r600_pipe_compute *shader = state->shader;
863b8e80941Smrg	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
864b8e80941Smrg	uint64_t va;
865b8e80941Smrg	struct r600_resource *code_bo;
866b8e80941Smrg	unsigned ngpr, nstack;
867b8e80941Smrg
868b8e80941Smrg	if (shader->ir_type == PIPE_SHADER_IR_TGSI) {
869b8e80941Smrg		code_bo = shader->sel->current->bo;
870b8e80941Smrg		va = shader->sel->current->bo->gpu_address;
871b8e80941Smrg		ngpr = shader->sel->current->shader.bc.ngpr;
872b8e80941Smrg		nstack = shader->sel->current->shader.bc.nstack;
873b8e80941Smrg	} else {
874b8e80941Smrg		code_bo = shader->code_bo;
875b8e80941Smrg		va = shader->code_bo->gpu_address + state->pc;
876b8e80941Smrg		ngpr = shader->bc.ngpr;
877b8e80941Smrg		nstack = shader->bc.nstack;
878b8e80941Smrg	}
879848b8605Smrg
880b8e80941Smrg	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
881b8e80941Smrg	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
882848b8605Smrg	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
883b8e80941Smrg			S_0288D4_NUM_GPRS(ngpr) |
884b8e80941Smrg			S_0288D4_DX10_CLAMP(1) |
885b8e80941Smrg			S_0288D4_STACK_SIZE(nstack));
886848b8605Smrg	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
887848b8605Smrg
888848b8605Smrg	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
889b8e80941Smrg	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
890b8e80941Smrg					      code_bo, RADEON_USAGE_READ,
891b8e80941Smrg					      RADEON_PRIO_SHADER_BINARY));
892848b8605Smrg}
893848b8605Smrg
894b8e80941Smrgstatic void evergreen_launch_grid(struct pipe_context *ctx,
895b8e80941Smrg				  const struct pipe_grid_info *info)
896848b8605Smrg{
897b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
898848b8605Smrg#ifdef HAVE_OPENCL
899b8e80941Smrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
900b8e80941Smrg	boolean use_kill;
901848b8605Smrg
902b8e80941Smrg	if (shader->ir_type != PIPE_SHADER_IR_TGSI) {
903b8e80941Smrg		rctx->cs_shader_state.pc = info->pc;
904b8e80941Smrg		/* Get the config information for this kernel. */
905b8e80941Smrg		r600_shader_binary_read_config(&shader->binary, &shader->bc,
906b8e80941Smrg					       info->pc, &use_kill);
907b8e80941Smrg	} else {
908b8e80941Smrg		use_kill = false;
909b8e80941Smrg		rctx->cs_shader_state.pc = 0;
910848b8605Smrg	}
911848b8605Smrg#endif
912b8e80941Smrg
913b8e80941Smrg	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
914b8e80941Smrg
915b8e80941Smrg
916b8e80941Smrg	evergreen_compute_upload_input(ctx, info);
917b8e80941Smrg	compute_emit_cs(rctx, info);
918848b8605Smrg}
919848b8605Smrg
920b8e80941Smrgstatic void evergreen_set_compute_resources(struct pipe_context *ctx,
921b8e80941Smrg					    unsigned start, unsigned count,
922b8e80941Smrg					    struct pipe_surface **surfaces)
923848b8605Smrg{
924b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
925848b8605Smrg	struct r600_surface **resources = (struct r600_surface **)surfaces;
926848b8605Smrg
927b8e80941Smrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
928848b8605Smrg			start, count);
929848b8605Smrg
930848b8605Smrg	for (unsigned i = 0; i < count; i++) {
931b8e80941Smrg		/* The First four vertex buffers are reserved for parameters and
932848b8605Smrg		 * global buffers. */
933b8e80941Smrg		unsigned vtx_id = 4 + i;
934848b8605Smrg		if (resources[i]) {
935848b8605Smrg			struct r600_resource_global *buffer =
936848b8605Smrg				(struct r600_resource_global*)
937848b8605Smrg				resources[i]->base.texture;
938848b8605Smrg			if (resources[i]->base.writable) {
939848b8605Smrg				assert(i+1 < 12);
940848b8605Smrg
941b8e80941Smrg				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
942848b8605Smrg				(struct r600_resource *)resources[i]->base.texture,
943848b8605Smrg				buffer->chunk->start_in_dw*4,
944848b8605Smrg				resources[i]->base.texture->width0);
945848b8605Smrg			}
946848b8605Smrg
947b8e80941Smrg			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
948848b8605Smrg					buffer->chunk->start_in_dw * 4,
949848b8605Smrg					resources[i]->base.texture);
950848b8605Smrg		}
951848b8605Smrg	}
952848b8605Smrg}
953848b8605Smrg
954b8e80941Smrgstatic void evergreen_set_global_binding(struct pipe_context *ctx,
955b8e80941Smrg					 unsigned first, unsigned n,
956b8e80941Smrg					 struct pipe_resource **resources,
957b8e80941Smrg					 uint32_t **handles)
958848b8605Smrg{
959b8e80941Smrg	struct r600_context *rctx = (struct r600_context *)ctx;
960b8e80941Smrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
961848b8605Smrg	struct r600_resource_global **buffers =
962848b8605Smrg		(struct r600_resource_global **)resources;
963848b8605Smrg	unsigned i;
964848b8605Smrg
965b8e80941Smrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
966848b8605Smrg			first, n);
967848b8605Smrg
968848b8605Smrg	if (!resources) {
969848b8605Smrg		/* XXX: Unset */
970848b8605Smrg		return;
971848b8605Smrg	}
972848b8605Smrg
973848b8605Smrg	/* We mark these items for promotion to the pool if they
974848b8605Smrg	 * aren't already there */
975848b8605Smrg	for (i = first; i < first + n; i++) {
976848b8605Smrg		struct compute_memory_item *item = buffers[i]->chunk;
977848b8605Smrg
978848b8605Smrg		if (!is_item_in_pool(item))
979848b8605Smrg			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
980848b8605Smrg	}
981848b8605Smrg
982b8e80941Smrg	if (compute_memory_finalize_pending(pool, ctx) == -1) {
983848b8605Smrg		/* XXX: Unset */
984848b8605Smrg		return;
985848b8605Smrg	}
986848b8605Smrg
987848b8605Smrg	for (i = first; i < first + n; i++)
988848b8605Smrg	{
989848b8605Smrg		uint32_t buffer_offset;
990848b8605Smrg		uint32_t handle;
991848b8605Smrg		assert(resources[i]->target == PIPE_BUFFER);
992848b8605Smrg		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
993848b8605Smrg
994848b8605Smrg		buffer_offset = util_le32_to_cpu(*(handles[i]));
995848b8605Smrg		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
996848b8605Smrg
997848b8605Smrg		*(handles[i]) = util_cpu_to_le32(handle);
998848b8605Smrg	}
999848b8605Smrg
1000b8e80941Smrg	/* globals for writing */
1001b8e80941Smrg	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1002b8e80941Smrg	/* globals for reading */
1003b8e80941Smrg	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1004848b8605Smrg				(struct pipe_resource*)pool->bo);
1005b8e80941Smrg
1006b8e80941Smrg	/* constants for reading, LLVM puts them in text segment */
1007b8e80941Smrg	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1008b8e80941Smrg				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1009848b8605Smrg}
1010848b8605Smrg
1011848b8605Smrg/**
1012848b8605Smrg * This function initializes all the compute specific registers that need to
1013848b8605Smrg * be initialized for each compute command stream.  Registers that are common
1014848b8605Smrg * to both compute and 3D will be initialized at the beginning of each compute
1015848b8605Smrg * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1016848b8605Smrg * packet requires that the shader type bit be set, we must initialize all
1017848b8605Smrg * context registers needed for compute in this function.  The registers
1018b8e80941Smrg * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1019848b8605Smrg * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1020848b8605Smrg * on the GPU family.
1021848b8605Smrg */
1022b8e80941Smrgvoid evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1023848b8605Smrg{
1024b8e80941Smrg	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1025848b8605Smrg	int num_threads;
1026848b8605Smrg	int num_stack_entries;
1027848b8605Smrg
1028b8e80941Smrg	/* since all required registers are initialized in the
1029848b8605Smrg	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1030848b8605Smrg	 */
1031848b8605Smrg	r600_init_command_buffer(cb, 256);
1032848b8605Smrg	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1033848b8605Smrg
1034848b8605Smrg	/* We're setting config registers here. */
1035848b8605Smrg	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1036848b8605Smrg	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1037848b8605Smrg
1038b8e80941Smrg	switch (rctx->b.family) {
1039848b8605Smrg	case CHIP_CEDAR:
1040848b8605Smrg	default:
1041848b8605Smrg		num_threads = 128;
1042848b8605Smrg		num_stack_entries = 256;
1043848b8605Smrg		break;
1044848b8605Smrg	case CHIP_REDWOOD:
1045848b8605Smrg		num_threads = 128;
1046848b8605Smrg		num_stack_entries = 256;
1047848b8605Smrg		break;
1048848b8605Smrg	case CHIP_JUNIPER:
1049848b8605Smrg		num_threads = 128;
1050848b8605Smrg		num_stack_entries = 512;
1051848b8605Smrg		break;
1052848b8605Smrg	case CHIP_CYPRESS:
1053848b8605Smrg	case CHIP_HEMLOCK:
1054848b8605Smrg		num_threads = 128;
1055848b8605Smrg		num_stack_entries = 512;
1056848b8605Smrg		break;
1057848b8605Smrg	case CHIP_PALM:
1058848b8605Smrg		num_threads = 128;
1059848b8605Smrg		num_stack_entries = 256;
1060848b8605Smrg		break;
1061848b8605Smrg	case CHIP_SUMO:
1062848b8605Smrg		num_threads = 128;
1063848b8605Smrg		num_stack_entries = 256;
1064848b8605Smrg		break;
1065848b8605Smrg	case CHIP_SUMO2:
1066848b8605Smrg		num_threads = 128;
1067848b8605Smrg		num_stack_entries = 512;
1068848b8605Smrg		break;
1069848b8605Smrg	case CHIP_BARTS:
1070848b8605Smrg		num_threads = 128;
1071848b8605Smrg		num_stack_entries = 512;
1072848b8605Smrg		break;
1073848b8605Smrg	case CHIP_TURKS:
1074848b8605Smrg		num_threads = 128;
1075848b8605Smrg		num_stack_entries = 256;
1076848b8605Smrg		break;
1077848b8605Smrg	case CHIP_CAICOS:
1078848b8605Smrg		num_threads = 128;
1079848b8605Smrg		num_stack_entries = 256;
1080848b8605Smrg		break;
1081848b8605Smrg	}
1082848b8605Smrg
1083848b8605Smrg	/* The primitive type always needs to be POINTLIST for compute. */
1084848b8605Smrg	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1085848b8605Smrg						V_008958_DI_PT_POINTLIST);
1086848b8605Smrg
1087b8e80941Smrg	if (rctx->b.chip_class < CAYMAN) {
1088848b8605Smrg
1089848b8605Smrg		/* These registers control which simds can be used by each stage.
1090848b8605Smrg		 * The default for these registers is 0xffffffff, which means
1091848b8605Smrg		 * all simds are available for each stage.  It's possible we may
1092848b8605Smrg		 * want to play around with these in the future, but for now
1093848b8605Smrg		 * the default value is fine.
1094848b8605Smrg		 *
1095848b8605Smrg		 * R_008E20_SQ_STATIC_THREAD_MGMT1
1096848b8605Smrg		 * R_008E24_SQ_STATIC_THREAD_MGMT2
1097848b8605Smrg		 * R_008E28_SQ_STATIC_THREAD_MGMT3
1098848b8605Smrg		 */
1099848b8605Smrg
1100b8e80941Smrg		/* XXX: We may need to adjust the thread and stack resource
1101848b8605Smrg		 * values for 3D/compute interop */
1102848b8605Smrg
1103848b8605Smrg		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1104848b8605Smrg
1105848b8605Smrg		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1106848b8605Smrg		 * Set the number of threads used by the PS/VS/GS/ES stage to
1107848b8605Smrg		 * 0.
1108848b8605Smrg		 */
1109848b8605Smrg		r600_store_value(cb, 0);
1110848b8605Smrg
1111848b8605Smrg		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1112848b8605Smrg		 * Set the number of threads used by the CS (aka LS) stage to
1113848b8605Smrg		 * the maximum number of threads and set the number of threads
1114848b8605Smrg		 * for the HS stage to 0. */
1115848b8605Smrg		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1116848b8605Smrg
1117848b8605Smrg		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1118848b8605Smrg		 * Set the Control Flow stack entries to 0 for PS/VS stages */
1119848b8605Smrg		r600_store_value(cb, 0);
1120848b8605Smrg
1121848b8605Smrg		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1122848b8605Smrg		 * Set the Control Flow stack entries to 0 for GS/ES stages */
1123848b8605Smrg		r600_store_value(cb, 0);
1124848b8605Smrg
1125848b8605Smrg		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1126848b8605Smrg		 * Set the Contol Flow stack entries to 0 for the HS stage, and
1127848b8605Smrg		 * set it to the maximum value for the CS (aka LS) stage. */
1128848b8605Smrg		r600_store_value(cb,
1129848b8605Smrg			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1130848b8605Smrg	}
1131848b8605Smrg	/* Give the compute shader all the available LDS space.
1132848b8605Smrg	 * NOTE: This only sets the maximum number of dwords that a compute
1133848b8605Smrg	 * shader can allocate.  When a shader is executed, we still need to
1134848b8605Smrg	 * allocate the appropriate amount of LDS dwords using the
1135848b8605Smrg	 * CM_R_0288E8_SQ_LDS_ALLOC register.
1136848b8605Smrg	 */
1137b8e80941Smrg	if (rctx->b.chip_class < CAYMAN) {
1138848b8605Smrg		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1139848b8605Smrg			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1140848b8605Smrg	} else {
1141848b8605Smrg		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1142848b8605Smrg			S_0286FC_NUM_PS_LDS(0) |
1143848b8605Smrg			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1144848b8605Smrg	}
1145848b8605Smrg
1146848b8605Smrg	/* Context Registers */
1147848b8605Smrg
1148b8e80941Smrg	if (rctx->b.chip_class < CAYMAN) {
1149848b8605Smrg		/* workaround for hw issues with dyn gpr - must set all limits
1150848b8605Smrg		 * to 240 instead of 0, 0x1e == 240 / 8
1151848b8605Smrg		 */
1152848b8605Smrg		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1153848b8605Smrg				S_028838_PS_GPRS(0x1e) |
1154848b8605Smrg				S_028838_VS_GPRS(0x1e) |
1155848b8605Smrg				S_028838_GS_GPRS(0x1e) |
1156848b8605Smrg				S_028838_ES_GPRS(0x1e) |
1157848b8605Smrg				S_028838_HS_GPRS(0x1e) |
1158848b8605Smrg				S_028838_LS_GPRS(0x1e));
1159848b8605Smrg	}
1160848b8605Smrg
1161848b8605Smrg	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1162848b8605Smrg	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1163848b8605Smrg		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1164848b8605Smrg
1165848b8605Smrg	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1166848b8605Smrg
1167848b8605Smrg	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1168b8e80941Smrg			       S_0286E8_TID_IN_GROUP_ENA(1) |
1169b8e80941Smrg			       S_0286E8_TGID_ENA(1) |
1170b8e80941Smrg			       S_0286E8_DISABLE_INDEX_PACK(1));
1171848b8605Smrg
1172848b8605Smrg	/* The LOOP_CONST registers are an optimizations for loops that allows
1173848b8605Smrg	 * you to store the initial counter, increment value, and maximum
1174848b8605Smrg	 * counter value in a register so that hardware can calculate the
1175848b8605Smrg	 * correct number of iterations for the loop, so that you don't need
1176848b8605Smrg	 * to have the loop counter in your shader code.  We don't currently use
1177848b8605Smrg	 * this optimization, so we must keep track of the counter in the
1178848b8605Smrg	 * shader and use a break instruction to exit loops.  However, the
1179848b8605Smrg	 * hardware will still uses this register to determine when to exit a
1180848b8605Smrg	 * loop, so we need to initialize the counter to 0, set the increment
1181848b8605Smrg	 * value to 1 and the maximum counter value to the 4095 (0xfff) which
1182848b8605Smrg	 * is the maximum value allowed.  This gives us a maximum of 4096
1183848b8605Smrg	 * iterations for our loops, but hopefully our break instruction will
1184848b8605Smrg	 * execute before some time before the 4096th iteration.
1185848b8605Smrg	 */
1186848b8605Smrg	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1187848b8605Smrg}
1188848b8605Smrg
1189b8e80941Smrgvoid evergreen_init_compute_state_functions(struct r600_context *rctx)
1190848b8605Smrg{
1191b8e80941Smrg	rctx->b.b.create_compute_state = evergreen_create_compute_state;
1192b8e80941Smrg	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1193b8e80941Smrg	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1194b8e80941Smrg//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1195b8e80941Smrg	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1196b8e80941Smrg	rctx->b.b.set_global_binding = evergreen_set_global_binding;
1197b8e80941Smrg	rctx->b.b.launch_grid = evergreen_launch_grid;
1198848b8605Smrg
1199848b8605Smrg}
1200848b8605Smrg
1201b8e80941Smrgstatic void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1202b8e80941Smrg					      struct pipe_resource *resource,
1203b8e80941Smrg					      unsigned level,
1204b8e80941Smrg					      unsigned usage,
1205b8e80941Smrg					      const struct pipe_box *box,
1206b8e80941Smrg					      struct pipe_transfer **ptransfer)
1207848b8605Smrg{
1208b8e80941Smrg	struct r600_context *rctx = (struct r600_context*)ctx;
1209848b8605Smrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
1210848b8605Smrg	struct r600_resource_global* buffer =
1211848b8605Smrg		(struct r600_resource_global*)resource;
1212848b8605Smrg
1213848b8605Smrg	struct compute_memory_item *item = buffer->chunk;
1214848b8605Smrg	struct pipe_resource *dst = NULL;
1215848b8605Smrg	unsigned offset = box->x;
1216848b8605Smrg
1217848b8605Smrg	if (is_item_in_pool(item)) {
1218b8e80941Smrg		compute_memory_demote_item(pool, item, ctx);
1219848b8605Smrg	}
1220848b8605Smrg	else {
1221848b8605Smrg		if (item->real_buffer == NULL) {
1222b8e80941Smrg			item->real_buffer =
1223848b8605Smrg					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1224848b8605Smrg		}
1225848b8605Smrg	}
1226848b8605Smrg
1227848b8605Smrg	dst = (struct pipe_resource*)item->real_buffer;
1228848b8605Smrg
1229848b8605Smrg	if (usage & PIPE_TRANSFER_READ)
1230848b8605Smrg		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1231848b8605Smrg
1232848b8605Smrg	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1233848b8605Smrg			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1234848b8605Smrg			"width = %u, height = %u, depth = %u)\n", level, usage,
1235848b8605Smrg			box->x, box->y, box->z, box->width, box->height,
1236848b8605Smrg			box->depth);
1237848b8605Smrg	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1238848b8605Smrg		"%u (box.x)\n", item->id, box->x);
1239848b8605Smrg
1240848b8605Smrg
1241848b8605Smrg	assert(resource->target == PIPE_BUFFER);
1242848b8605Smrg	assert(resource->bind & PIPE_BIND_GLOBAL);
1243848b8605Smrg	assert(box->x >= 0);
1244848b8605Smrg	assert(box->y == 0);
1245848b8605Smrg	assert(box->z == 0);
1246848b8605Smrg
1247848b8605Smrg	///TODO: do it better, mapping is not possible if the pool is too big
1248b8e80941Smrg	return pipe_buffer_map_range(ctx, dst,
1249848b8605Smrg			offset, box->width, usage, ptransfer);
1250848b8605Smrg}
1251848b8605Smrg
1252b8e80941Smrgstatic void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1253b8e80941Smrg					       struct pipe_transfer *transfer)
1254848b8605Smrg{
1255848b8605Smrg	/* struct r600_resource_global are not real resources, they just map
1256848b8605Smrg	 * to an offset within the compute memory pool.  The function
1257848b8605Smrg	 * r600_compute_global_transfer_map() maps the memory pool
1258848b8605Smrg	 * resource rather than the struct r600_resource_global passed to
1259848b8605Smrg	 * it as an argument and then initalizes ptransfer->resource with
1260848b8605Smrg	 * the memory pool resource (via pipe_buffer_map_range).
1261848b8605Smrg	 * When transfer_unmap is called it uses the memory pool's
1262848b8605Smrg	 * vtable which calls r600_buffer_transfer_map() rather than
1263848b8605Smrg	 * this function.
1264848b8605Smrg	 */
1265848b8605Smrg	assert (!"This function should not be called");
1266848b8605Smrg}
1267848b8605Smrg
1268b8e80941Smrgstatic void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
1269b8e80941Smrg						      struct pipe_transfer *transfer,
1270b8e80941Smrg						      const struct pipe_box *box)
1271848b8605Smrg{
1272848b8605Smrg	assert(0 && "TODO");
1273848b8605Smrg}
1274848b8605Smrg
1275b8e80941Smrgstatic void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1276b8e80941Smrg					       struct pipe_resource *res)
1277848b8605Smrg{
1278b8e80941Smrg	struct r600_resource_global* buffer = NULL;
1279b8e80941Smrg	struct r600_screen* rscreen = NULL;
1280b8e80941Smrg
1281b8e80941Smrg	assert(res->target == PIPE_BUFFER);
1282b8e80941Smrg	assert(res->bind & PIPE_BIND_GLOBAL);
1283b8e80941Smrg
1284b8e80941Smrg	buffer = (struct r600_resource_global*)res;
1285b8e80941Smrg	rscreen = (struct r600_screen*)screen;
1286b8e80941Smrg
1287b8e80941Smrg	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1288b8e80941Smrg
1289b8e80941Smrg	buffer->chunk = NULL;
1290b8e80941Smrg	free(res);
1291b8e80941Smrg}
1292b8e80941Smrg
1293b8e80941Smrgstatic const struct u_resource_vtbl r600_global_buffer_vtbl =
1294b8e80941Smrg{
1295b8e80941Smrg	u_default_resource_get_handle, /* get_handle */
1296b8e80941Smrg	r600_compute_global_buffer_destroy, /* resource_destroy */
1297b8e80941Smrg	r600_compute_global_transfer_map, /* transfer_map */
1298b8e80941Smrg	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
1299b8e80941Smrg	r600_compute_global_transfer_unmap, /* transfer_unmap */
1300b8e80941Smrg};
1301b8e80941Smrg
1302b8e80941Smrgstruct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1303b8e80941Smrg							const struct pipe_resource *templ)
1304b8e80941Smrg{
1305b8e80941Smrg	struct r600_resource_global* result = NULL;
1306b8e80941Smrg	struct r600_screen* rscreen = NULL;
1307b8e80941Smrg	int size_in_dw = 0;
1308b8e80941Smrg
1309b8e80941Smrg	assert(templ->target == PIPE_BUFFER);
1310b8e80941Smrg	assert(templ->bind & PIPE_BIND_GLOBAL);
1311b8e80941Smrg	assert(templ->array_size == 1 || templ->array_size == 0);
1312b8e80941Smrg	assert(templ->depth0 == 1 || templ->depth0 == 0);
1313b8e80941Smrg	assert(templ->height0 == 1 || templ->height0 == 0);
1314b8e80941Smrg
1315b8e80941Smrg	result = (struct r600_resource_global*)
1316b8e80941Smrg	CALLOC(sizeof(struct r600_resource_global), 1);
1317b8e80941Smrg	rscreen = (struct r600_screen*)screen;
1318b8e80941Smrg
1319b8e80941Smrg	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1320b8e80941Smrg	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1321b8e80941Smrg			templ->array_size);
1322b8e80941Smrg
1323b8e80941Smrg	result->base.b.vtbl = &r600_global_buffer_vtbl;
1324b8e80941Smrg	result->base.b.b = *templ;
1325b8e80941Smrg	result->base.b.b.screen = screen;
1326b8e80941Smrg	pipe_reference_init(&result->base.b.b.reference, 1);
1327b8e80941Smrg
1328b8e80941Smrg	size_in_dw = (templ->width0+3) / 4;
1329b8e80941Smrg
1330b8e80941Smrg	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1331b8e80941Smrg
1332b8e80941Smrg	if (result->chunk == NULL)
1333b8e80941Smrg	{
1334b8e80941Smrg		free(result);
1335b8e80941Smrg		return NULL;
1336b8e80941Smrg	}
1337b8e80941Smrg
1338b8e80941Smrg	return &result->base.b.b;
1339848b8605Smrg}
1340