1af69d88dSmrg/*
2af69d88dSmrg * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3af69d88dSmrg *
4af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a
5af69d88dSmrg * copy of this software and associated documentation files (the "Software"),
6af69d88dSmrg * to deal in the Software without restriction, including without limitation
7af69d88dSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub
8af69d88dSmrg * license, and/or sell copies of the Software, and to permit persons to whom
9af69d88dSmrg * the Software is furnished to do so, subject to the following conditions:
10af69d88dSmrg *
11af69d88dSmrg * The above copyright notice and this permission notice (including the next
12af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the
13af69d88dSmrg * Software.
14af69d88dSmrg *
15af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18af69d88dSmrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19af69d88dSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20af69d88dSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21af69d88dSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
22af69d88dSmrg *
23af69d88dSmrg * Authors:
24af69d88dSmrg *      Adam Rak <adam.rak@streamnovation.com>
25af69d88dSmrg */
26af69d88dSmrg
277e995a2eSmrg#ifdef HAVE_OPENCL
287e995a2eSmrg#include <gelf.h>
297e995a2eSmrg#include <libelf.h>
307e995a2eSmrg#endif
31af69d88dSmrg#include <stdio.h>
32af69d88dSmrg#include <errno.h>
33af69d88dSmrg#include "pipe/p_defines.h"
34af69d88dSmrg#include "pipe/p_state.h"
35af69d88dSmrg#include "pipe/p_context.h"
36af69d88dSmrg#include "util/u_blitter.h"
377e995a2eSmrg#include "util/list.h"
38af69d88dSmrg#include "util/u_transfer.h"
39af69d88dSmrg#include "util/u_surface.h"
40af69d88dSmrg#include "util/u_pack_color.h"
41af69d88dSmrg#include "util/u_memory.h"
42af69d88dSmrg#include "util/u_inlines.h"
43af69d88dSmrg#include "util/u_framebuffer.h"
447e995a2eSmrg#include "tgsi/tgsi_parse.h"
45af69d88dSmrg#include "pipebuffer/pb_buffer.h"
46af69d88dSmrg#include "evergreend.h"
47af69d88dSmrg#include "r600_shader.h"
48af69d88dSmrg#include "r600_pipe.h"
49af69d88dSmrg#include "r600_formats.h"
50af69d88dSmrg#include "evergreen_compute.h"
51af69d88dSmrg#include "evergreen_compute_internal.h"
52af69d88dSmrg#include "compute_memory_pool.h"
53af69d88dSmrg#include "sb/sb_public.h"
54af69d88dSmrg#include <inttypes.h>
55af69d88dSmrg
/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too. => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, VTX slots too because of linear indexing

*/
86af69d88dSmrg
871463c08dSmrg#ifdef HAVE_OPENCL
881463c08dSmrgstatic void radeon_shader_binary_init(struct r600_shader_binary *b)
891463c08dSmrg{
901463c08dSmrg	memset(b, 0, sizeof(*b));
911463c08dSmrg}
921463c08dSmrg
931463c08dSmrgstatic void radeon_shader_binary_clean(struct r600_shader_binary *b)
941463c08dSmrg{
951463c08dSmrg	if (!b)
961463c08dSmrg		return;
971463c08dSmrg	FREE(b->code);
981463c08dSmrg	FREE(b->config);
991463c08dSmrg	FREE(b->rodata);
1001463c08dSmrg	FREE(b->global_symbol_offsets);
1011463c08dSmrg	FREE(b->relocs);
1021463c08dSmrg	FREE(b->disasm_string);
1031463c08dSmrg}
1041463c08dSmrg#endif
1051463c08dSmrg
1067e995a2eSmrgstruct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
1077e995a2eSmrg						     unsigned size)
108af69d88dSmrg{
1097e995a2eSmrg	struct pipe_resource *buffer = NULL;
110af69d88dSmrg	assert(size);
111af69d88dSmrg
1127e995a2eSmrg	buffer = pipe_buffer_create((struct pipe_screen*) screen,
1137e995a2eSmrg				    0, PIPE_USAGE_IMMUTABLE, size);
114af69d88dSmrg
115af69d88dSmrg	return (struct r600_resource *)buffer;
116af69d88dSmrg}
117af69d88dSmrg
118af69d88dSmrg
/* Bind 'bo' as RAT (Random Access Target) number 'id' by installing it as a
 * color buffer in the compute framebuffer state.  'start' and 'size' are only
 * validated here (size dword-aligned, start 256-byte aligned); the surface
 * itself always covers the whole resource as R32_UINT. */
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	/* The hardware exposes 12 RAT slots (see the file-header comment). */
	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	/* Fill in the hardware-specific surface fields for RAT usage. */
	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
163af69d88dSmrg
1647e995a2eSmrgstatic void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
1657e995a2eSmrg					   unsigned vb_index,
1667e995a2eSmrg					   unsigned offset,
1677e995a2eSmrg					   struct pipe_resource *buffer)
168af69d88dSmrg{
169af69d88dSmrg	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
170af69d88dSmrg	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
171af69d88dSmrg	vb->stride = 1;
172af69d88dSmrg	vb->buffer_offset = offset;
1737e995a2eSmrg	vb->buffer.resource = buffer;
1747e995a2eSmrg	vb->is_user_buffer = false;
175af69d88dSmrg
176af69d88dSmrg	/* The vertex instructions in the compute shaders use the texture cache,
177af69d88dSmrg	 * so we need to invalidate it. */
178af69d88dSmrg	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
179af69d88dSmrg	state->enabled_mask |= 1 << vb_index;
180af69d88dSmrg	state->dirty_mask |= 1 << vb_index;
1817e995a2eSmrg	r600_mark_atom_dirty(rctx, &state->atom);
182af69d88dSmrg}
183af69d88dSmrg
1847e995a2eSmrgstatic void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
1857e995a2eSmrg					     unsigned cb_index,
1867e995a2eSmrg					     unsigned offset,
1877e995a2eSmrg					     unsigned size,
1887e995a2eSmrg					     struct pipe_resource *buffer)
189af69d88dSmrg{
190af69d88dSmrg	struct pipe_constant_buffer cb;
191af69d88dSmrg	cb.buffer_size = size;
192af69d88dSmrg	cb.buffer_offset = offset;
193af69d88dSmrg	cb.buffer = buffer;
194af69d88dSmrg	cb.user_buffer = NULL;
195af69d88dSmrg
1961463c08dSmrg	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
197af69d88dSmrg}
198af69d88dSmrg
1997e995a2eSmrg/* We need to define these R600 registers here, because we can't include
2007e995a2eSmrg * evergreend.h and r600d.h.
2017e995a2eSmrg */
2027e995a2eSmrg#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
2037e995a2eSmrg#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
2047e995a2eSmrg
2057e995a2eSmrg#ifdef HAVE_OPENCL
2067e995a2eSmrgstatic void parse_symbol_table(Elf_Data *symbol_table_data,
2077e995a2eSmrg				const GElf_Shdr *symbol_table_header,
2081463c08dSmrg				struct r600_shader_binary *binary)
209af69d88dSmrg{
2107e995a2eSmrg	GElf_Sym symbol;
2117e995a2eSmrg	unsigned i = 0;
2127e995a2eSmrg	unsigned symbol_count =
2137e995a2eSmrg		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
2147e995a2eSmrg
2157e995a2eSmrg	/* We are over allocating this list, because symbol_count gives the
2167e995a2eSmrg	 * total number of symbols, and we will only be filling the list
2177e995a2eSmrg	 * with offsets of global symbols.  The memory savings from
2187e995a2eSmrg	 * allocating the correct size of this list will be small, and
2197e995a2eSmrg	 * I don't think it is worth the cost of pre-computing the number
2207e995a2eSmrg	 * of global symbols.
2217e995a2eSmrg	 */
2227e995a2eSmrg	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
2237e995a2eSmrg
2247e995a2eSmrg	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
2257e995a2eSmrg		unsigned i;
2267e995a2eSmrg		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
2277e995a2eSmrg		    symbol.st_shndx == 0 /* Undefined symbol */) {
2287e995a2eSmrg			continue;
2297e995a2eSmrg		}
230af69d88dSmrg
2317e995a2eSmrg		binary->global_symbol_offsets[binary->global_symbol_count] =
2327e995a2eSmrg					symbol.st_value;
233af69d88dSmrg
2347e995a2eSmrg		/* Sort the list using bubble sort.  This list will usually
2357e995a2eSmrg		 * be small. */
2367e995a2eSmrg		for (i = binary->global_symbol_count; i > 0; --i) {
2377e995a2eSmrg			uint64_t lhs = binary->global_symbol_offsets[i - 1];
2387e995a2eSmrg			uint64_t rhs = binary->global_symbol_offsets[i];
2397e995a2eSmrg			if (lhs < rhs) {
2407e995a2eSmrg				break;
2417e995a2eSmrg			}
2427e995a2eSmrg			binary->global_symbol_offsets[i] = lhs;
2437e995a2eSmrg			binary->global_symbol_offsets[i - 1] = rhs;
2447e995a2eSmrg		}
2457e995a2eSmrg		++binary->global_symbol_count;
2467e995a2eSmrg	}
2477e995a2eSmrg}
2487e995a2eSmrg
2497e995a2eSmrg
2507e995a2eSmrgstatic void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
2517e995a2eSmrg			unsigned symbol_sh_link,
2521463c08dSmrg			struct r600_shader_binary *binary)
253af69d88dSmrg{
2547e995a2eSmrg	unsigned i;
255af69d88dSmrg
2567e995a2eSmrg	if (!relocs || !symbols || !binary->reloc_count) {
2577e995a2eSmrg		return;
2587e995a2eSmrg	}
2597e995a2eSmrg	binary->relocs = CALLOC(binary->reloc_count,
2601463c08dSmrg			sizeof(struct r600_shader_reloc));
2617e995a2eSmrg	for (i = 0; i < binary->reloc_count; i++) {
2627e995a2eSmrg		GElf_Sym symbol;
2637e995a2eSmrg		GElf_Rel rel;
2647e995a2eSmrg		char *symbol_name;
2651463c08dSmrg		struct r600_shader_reloc *reloc = &binary->relocs[i];
2667e995a2eSmrg
2677e995a2eSmrg		gelf_getrel(relocs, i, &rel);
2687e995a2eSmrg		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
2697e995a2eSmrg		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
2707e995a2eSmrg
2717e995a2eSmrg		reloc->offset = rel.r_offset;
2727e995a2eSmrg		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
2737e995a2eSmrg		reloc->name[sizeof(reloc->name)-1] = 0;
2747e995a2eSmrg	}
2757e995a2eSmrg}
2767e995a2eSmrg
2777e995a2eSmrgstatic void r600_elf_read(const char *elf_data, unsigned elf_size,
2781463c08dSmrg		 struct r600_shader_binary *binary)
2797e995a2eSmrg{
2807e995a2eSmrg	char *elf_buffer;
2817e995a2eSmrg	Elf *elf;
2827e995a2eSmrg	Elf_Scn *section = NULL;
2837e995a2eSmrg	Elf_Data *symbols = NULL, *relocs = NULL;
2847e995a2eSmrg	size_t section_str_index;
2857e995a2eSmrg	unsigned symbol_sh_link = 0;
2867e995a2eSmrg
2877e995a2eSmrg	/* One of the libelf implementations
2887e995a2eSmrg	 * (http://www.mr511.de/software/english.htm) requires calling
2897e995a2eSmrg	 * elf_version() before elf_memory().
2907e995a2eSmrg	 */
2917e995a2eSmrg	elf_version(EV_CURRENT);
2927e995a2eSmrg	elf_buffer = MALLOC(elf_size);
2937e995a2eSmrg	memcpy(elf_buffer, elf_data, elf_size);
2947e995a2eSmrg
2957e995a2eSmrg	elf = elf_memory(elf_buffer, elf_size);
2967e995a2eSmrg
2977e995a2eSmrg	elf_getshdrstrndx(elf, &section_str_index);
2987e995a2eSmrg
2997e995a2eSmrg	while ((section = elf_nextscn(elf, section))) {
3007e995a2eSmrg		const char *name;
3017e995a2eSmrg		Elf_Data *section_data = NULL;
3027e995a2eSmrg		GElf_Shdr section_header;
3037e995a2eSmrg		if (gelf_getshdr(section, &section_header) != &section_header) {
3047e995a2eSmrg			fprintf(stderr, "Failed to read ELF section header\n");
3057e995a2eSmrg			return;
3067e995a2eSmrg		}
3077e995a2eSmrg		name = elf_strptr(elf, section_str_index, section_header.sh_name);
3087e995a2eSmrg		if (!strcmp(name, ".text")) {
3097e995a2eSmrg			section_data = elf_getdata(section, section_data);
3107e995a2eSmrg			binary->code_size = section_data->d_size;
3117e995a2eSmrg			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
3127e995a2eSmrg			memcpy(binary->code, section_data->d_buf, binary->code_size);
3137e995a2eSmrg		} else if (!strcmp(name, ".AMDGPU.config")) {
3147e995a2eSmrg			section_data = elf_getdata(section, section_data);
3157e995a2eSmrg			binary->config_size = section_data->d_size;
3167e995a2eSmrg			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
3177e995a2eSmrg			memcpy(binary->config, section_data->d_buf, binary->config_size);
3187e995a2eSmrg		} else if (!strcmp(name, ".AMDGPU.disasm")) {
3197e995a2eSmrg			/* Always read disassembly if it's available. */
3207e995a2eSmrg			section_data = elf_getdata(section, section_data);
3217e995a2eSmrg			binary->disasm_string = strndup(section_data->d_buf,
3227e995a2eSmrg							section_data->d_size);
3237e995a2eSmrg		} else if (!strncmp(name, ".rodata", 7)) {
3247e995a2eSmrg			section_data = elf_getdata(section, section_data);
3257e995a2eSmrg			binary->rodata_size = section_data->d_size;
3267e995a2eSmrg			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
3277e995a2eSmrg			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
3287e995a2eSmrg		} else if (!strncmp(name, ".symtab", 7)) {
3297e995a2eSmrg			symbols = elf_getdata(section, section_data);
3307e995a2eSmrg			symbol_sh_link = section_header.sh_link;
3317e995a2eSmrg			parse_symbol_table(symbols, &section_header, binary);
3327e995a2eSmrg		} else if (!strcmp(name, ".rel.text")) {
3337e995a2eSmrg			relocs = elf_getdata(section, section_data);
3347e995a2eSmrg			binary->reloc_count = section_header.sh_size /
3357e995a2eSmrg					section_header.sh_entsize;
3367e995a2eSmrg		}
3377e995a2eSmrg	}
3387e995a2eSmrg
3397e995a2eSmrg	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
3407e995a2eSmrg
3417e995a2eSmrg	if (elf){
3427e995a2eSmrg		elf_end(elf);
3437e995a2eSmrg	}
3447e995a2eSmrg	FREE(elf_buffer);
3457e995a2eSmrg
3467e995a2eSmrg	/* Cache the config size per symbol */
3477e995a2eSmrg	if (binary->global_symbol_count) {
3487e995a2eSmrg		binary->config_size_per_symbol =
3497e995a2eSmrg			binary->config_size / binary->global_symbol_count;
3507e995a2eSmrg	} else {
3517e995a2eSmrg		binary->global_symbol_count = 1;
3527e995a2eSmrg		binary->config_size_per_symbol = binary->config_size;
3537e995a2eSmrg	}
3547e995a2eSmrg}
3557e995a2eSmrg
3567e995a2eSmrgstatic const unsigned char *r600_shader_binary_config_start(
3571463c08dSmrg	const struct r600_shader_binary *binary,
3587e995a2eSmrg	uint64_t symbol_offset)
3597e995a2eSmrg{
360af69d88dSmrg	unsigned i;
3617e995a2eSmrg	for (i = 0; i < binary->global_symbol_count; ++i) {
3627e995a2eSmrg		if (binary->global_symbol_offsets[i] == symbol_offset) {
3637e995a2eSmrg			unsigned offset = i * binary->config_size_per_symbol;
3647e995a2eSmrg			return binary->config + offset;
3657e995a2eSmrg		}
3667e995a2eSmrg	}
3677e995a2eSmrg	return binary->config;
3687e995a2eSmrg}
369af69d88dSmrg
3701463c08dSmrgstatic void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
3717e995a2eSmrg					   struct r600_bytecode *bc,
3727e995a2eSmrg					   uint64_t symbol_offset,
3737e995a2eSmrg					   boolean *use_kill)
3747e995a2eSmrg{
3757e995a2eSmrg       unsigned i;
3767e995a2eSmrg       const unsigned char *config =
3777e995a2eSmrg               r600_shader_binary_config_start(binary, symbol_offset);
3787e995a2eSmrg
3797e995a2eSmrg       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
3807e995a2eSmrg               unsigned reg =
3817e995a2eSmrg                       util_le32_to_cpu(*(uint32_t*)(config + i));
3827e995a2eSmrg               unsigned value =
3837e995a2eSmrg                       util_le32_to_cpu(*(uint32_t*)(config + i + 4));
3847e995a2eSmrg               switch (reg) {
3857e995a2eSmrg               /* R600 / R700 */
3867e995a2eSmrg               case R_028850_SQ_PGM_RESOURCES_PS:
3877e995a2eSmrg               case R_028868_SQ_PGM_RESOURCES_VS:
3887e995a2eSmrg               /* Evergreen / Northern Islands */
3897e995a2eSmrg               case R_028844_SQ_PGM_RESOURCES_PS:
3907e995a2eSmrg               case R_028860_SQ_PGM_RESOURCES_VS:
3917e995a2eSmrg               case R_0288D4_SQ_PGM_RESOURCES_LS:
3927e995a2eSmrg                       bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
3937e995a2eSmrg                       bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
3947e995a2eSmrg                       break;
3957e995a2eSmrg               case R_02880C_DB_SHADER_CONTROL:
3967e995a2eSmrg                       *use_kill = G_02880C_KILL_ENABLE(value);
3977e995a2eSmrg                       break;
3987e995a2eSmrg               case R_0288E8_SQ_LDS_ALLOC:
3997e995a2eSmrg                       bc->nlds_dw = value;
4007e995a2eSmrg                       break;
4017e995a2eSmrg               }
4027e995a2eSmrg       }
4037e995a2eSmrg}
404af69d88dSmrg
4057e995a2eSmrgstatic unsigned r600_create_shader(struct r600_bytecode *bc,
4061463c08dSmrg				   const struct r600_shader_binary *binary,
4077e995a2eSmrg				   boolean *use_kill)
4087e995a2eSmrg
4097e995a2eSmrg{
4107e995a2eSmrg	assert(binary->code_size % 4 == 0);
4117e995a2eSmrg	bc->bytecode = CALLOC(1, binary->code_size);
4127e995a2eSmrg	memcpy(bc->bytecode, binary->code, binary->code_size);
4137e995a2eSmrg	bc->ndw = binary->code_size / 4;
4147e995a2eSmrg
4157e995a2eSmrg	r600_shader_binary_read_config(binary, bc, 0, use_kill);
4167e995a2eSmrg	return 0;
4177e995a2eSmrg}
418af69d88dSmrg
419af69d88dSmrg#endif
420af69d88dSmrg
4217e995a2eSmrgstatic void r600_destroy_shader(struct r600_bytecode *bc)
4227e995a2eSmrg{
4237e995a2eSmrg	FREE(bc->bytecode);
4247e995a2eSmrg}
4257e995a2eSmrg
/* pipe_context::create_compute_state implementation.
 * For TGSI/NIR the tokens are handed to the shader-selector machinery and
 * compiled later at bind time.  For the OpenCL (native binary) path the ELF
 * blob is parsed immediately and the code is uploaded to a VRAM buffer.
 * Returns an opaque r600_pipe_compute, freed by
 * evergreen_delete_compute_state(). */
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	boolean use_kill;
#endif

	/* Record the memory requirements declared by the state object. */
	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		/* Compilation is deferred; only a selector is created here. */
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}
469af69d88dSmrg
/* pipe_context::delete_compute_state implementation.  Tears down whichever
 * representation evergreen_create_compute_state() built: the shader selector
 * for TGSI/NIR, or the parsed ELF binary, code buffer and kernel-parameter
 * buffer for the OpenCL path. */
static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		/* Drop everything owned by the native-binary path; the
		 * reference drops also release the GPU buffers. */
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}
493af69d88dSmrg
4947e995a2eSmrgstatic void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
495af69d88dSmrg{
4967e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
4977e995a2eSmrg	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
4987e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
4997e995a2eSmrg
5007e995a2eSmrg	if (!state) {
5017e995a2eSmrg		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
5027e995a2eSmrg		return;
5037e995a2eSmrg	}
5047e995a2eSmrg
5051463c08dSmrg	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
5061463c08dSmrg	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
5077e995a2eSmrg		bool compute_dirty;
5081463c08dSmrg		cstate->sel->ir_type = cstate->ir_type;
5091463c08dSmrg		if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
5101463c08dSmrg			R600_ERR("Failed to select compute shader\n");
5117e995a2eSmrg	}
5121463c08dSmrg
5137e995a2eSmrg	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
514af69d88dSmrg}
515af69d88dSmrg
516af69d88dSmrg/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
517af69d88dSmrg * kernel parameters there are implicit parameters that need to be stored
518af69d88dSmrg * in the vertex buffer as well.  Here is how these parameters are organized in
519af69d88dSmrg * the buffer:
520af69d88dSmrg *
521af69d88dSmrg * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
522af69d88dSmrg * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
523af69d88dSmrg * DWORDS 6-8: Number of work items within each work group in each dimension
524af69d88dSmrg *             (x,y,z)
525af69d88dSmrg * DWORDS 9+ : Kernel parameters
526af69d88dSmrg */
5277e995a2eSmrgstatic void evergreen_compute_upload_input(struct pipe_context *ctx,
5287e995a2eSmrg					   const struct pipe_grid_info *info)
529af69d88dSmrg{
5307e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
5317e995a2eSmrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
532af69d88dSmrg	unsigned i;
533af69d88dSmrg	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
534af69d88dSmrg	 * parameters.
535af69d88dSmrg	 */
5367e995a2eSmrg	unsigned input_size;
5377e995a2eSmrg	uint32_t *num_work_groups_start;
5387e995a2eSmrg	uint32_t *global_size_start;
5397e995a2eSmrg	uint32_t *local_size_start;
5407e995a2eSmrg	uint32_t *kernel_parameters_start;
541af69d88dSmrg	struct pipe_box box;
542af69d88dSmrg	struct pipe_transfer *transfer = NULL;
543af69d88dSmrg
5447e995a2eSmrg	if (!shader)
5457e995a2eSmrg		return;
546af69d88dSmrg	if (shader->input_size == 0) {
547af69d88dSmrg		return;
548af69d88dSmrg	}
5497e995a2eSmrg	input_size = shader->input_size + 36;
550af69d88dSmrg	if (!shader->kernel_param) {
551af69d88dSmrg		/* Add space for the grid dimensions */
552af69d88dSmrg		shader->kernel_param = (struct r600_resource *)
5537e995a2eSmrg			pipe_buffer_create(ctx->screen, 0,
554af69d88dSmrg					PIPE_USAGE_IMMUTABLE, input_size);
555af69d88dSmrg	}
556af69d88dSmrg
557af69d88dSmrg	u_box_1d(0, input_size, &box);
5581463c08dSmrg	num_work_groups_start = ctx->buffer_map(ctx,
559af69d88dSmrg			(struct pipe_resource*)shader->kernel_param,
5601463c08dSmrg			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
561af69d88dSmrg			&box, &transfer);
562af69d88dSmrg	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
563af69d88dSmrg	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
564af69d88dSmrg	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
565af69d88dSmrg
566af69d88dSmrg	/* Copy the work group size */
5677e995a2eSmrg	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
568af69d88dSmrg
569af69d88dSmrg	/* Copy the global size */
570af69d88dSmrg	for (i = 0; i < 3; i++) {
5717e995a2eSmrg		global_size_start[i] = info->grid[i] * info->block[i];
572af69d88dSmrg	}
573af69d88dSmrg
574af69d88dSmrg	/* Copy the local dimensions */
5757e995a2eSmrg	memcpy(local_size_start, info->block, 3 * sizeof(uint));
576af69d88dSmrg
577af69d88dSmrg	/* Copy the kernel inputs */
5787e995a2eSmrg	memcpy(kernel_parameters_start, info->input, shader->input_size);
579af69d88dSmrg
580af69d88dSmrg	for (i = 0; i < (input_size / 4); i++) {
5817e995a2eSmrg		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
582af69d88dSmrg			((unsigned*)num_work_groups_start)[i]);
583af69d88dSmrg	}
584af69d88dSmrg
5851463c08dSmrg	ctx->buffer_unmap(ctx, transfer);
586af69d88dSmrg
5877e995a2eSmrg	/* ID=0 and ID=3 are reserved for the parameters.
5887e995a2eSmrg	 * LLVM will preferably use ID=0, but it does not work for dynamic
5897e995a2eSmrg	 * indices. */
5907e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
5917e995a2eSmrg			(struct pipe_resource*)shader->kernel_param);
5927e995a2eSmrg	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
593af69d88dSmrg			(struct pipe_resource*)shader->kernel_param);
594af69d88dSmrg}
595af69d88dSmrg
/* Emit the PM4 packets for one compute dispatch: thread-group geometry,
 * LDS allocation and the DISPATCH_DIRECT packet itself.  For indirect
 * dispatches the caller has already read the grid dimensions into
 * 'indirect_grid' and they are emitted as if direct. */
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;	/* NOTE(review): computed below but never used */
	unsigned lds_size = shader->local_size / 4;	/* bytes -> dwords */

	/* Native (OpenCL ELF) shaders carry an extra LDS requirement parsed
	 * from their config registers. */
	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	if (info->indirect) {
		/* Grid was read back from the indirect buffer beforehand, so
		 * a DIRECT packet with those values is emitted. */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}
678af69d88dSmrg
6797e995a2eSmrgstatic void compute_setup_cbs(struct r600_context *rctx)
680af69d88dSmrg{
6811463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
682af69d88dSmrg	unsigned i;
683af69d88dSmrg
684af69d88dSmrg	/* Emit colorbuffers. */
685af69d88dSmrg	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
6867e995a2eSmrg	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
6877e995a2eSmrg		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
6887e995a2eSmrg		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
689af69d88dSmrg						       (struct r600_resource*)cb->base.texture,
690af69d88dSmrg						       RADEON_USAGE_READWRITE,
6917e995a2eSmrg						       RADEON_PRIO_SHADER_RW_BUFFER);
692af69d88dSmrg
6937e995a2eSmrg		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
694af69d88dSmrg		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
695af69d88dSmrg		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
696af69d88dSmrg		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
697af69d88dSmrg		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
698af69d88dSmrg		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
699af69d88dSmrg		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
700af69d88dSmrg		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
701af69d88dSmrg
702af69d88dSmrg		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
703af69d88dSmrg		radeon_emit(cs, reloc);
704af69d88dSmrg
705af69d88dSmrg		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
706af69d88dSmrg		radeon_emit(cs, reloc);
707af69d88dSmrg	}
7087e995a2eSmrg	for (; i < 8 ; i++)
7097e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
7107e995a2eSmrg					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
7117e995a2eSmrg	for (; i < 12; i++)
7127e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
7137e995a2eSmrg					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
7147e995a2eSmrg
7157e995a2eSmrg	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
7167e995a2eSmrg	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
7177e995a2eSmrg				       rctx->compute_cb_target_mask);
7187e995a2eSmrg}
7197e995a2eSmrg
7207e995a2eSmrgstatic void compute_emit_cs(struct r600_context *rctx,
7217e995a2eSmrg			    const struct pipe_grid_info *info)
7227e995a2eSmrg{
7231463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
7247e995a2eSmrg	bool compute_dirty = false;
7257e995a2eSmrg	struct r600_pipe_shader *current;
7267e995a2eSmrg	struct r600_shader_atomic combined_atomics[8];
7277e995a2eSmrg	uint8_t atomic_used_mask;
7287e995a2eSmrg	uint32_t indirect_grid[3] = { 0, 0, 0 };
7297e995a2eSmrg
7307e995a2eSmrg	/* make sure that the gfx ring is only one active */
7311463c08dSmrg	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
7327e995a2eSmrg		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
7337e995a2eSmrg	}
7347e995a2eSmrg
7357e995a2eSmrg	r600_update_compressed_resource_state(rctx, true);
7367e995a2eSmrg
7377e995a2eSmrg	if (!rctx->cmd_buf_is_compute) {
7387e995a2eSmrg		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
7397e995a2eSmrg		rctx->cmd_buf_is_compute = true;
7407e995a2eSmrg	}
7417e995a2eSmrg
7421463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
7431463c08dSmrg	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
7441463c08dSmrg		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
7451463c08dSmrg			R600_ERR("Failed to select compute shader\n");
7461463c08dSmrg			return;
7471463c08dSmrg		}
7481463c08dSmrg
7497e995a2eSmrg		current = rctx->cs_shader_state.shader->sel->current;
7507e995a2eSmrg		if (compute_dirty) {
7517e995a2eSmrg			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
7527e995a2eSmrg			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
7537e995a2eSmrg			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
7547e995a2eSmrg		}
7557e995a2eSmrg
7567e995a2eSmrg		bool need_buf_const = current->shader.uses_tex_buffers ||
7577e995a2eSmrg			current->shader.has_txq_cube_array_z_comp;
7587e995a2eSmrg
7597e995a2eSmrg		if (info->indirect) {
7607e995a2eSmrg			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
7611463c08dSmrg			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
7627e995a2eSmrg			unsigned offset = info->indirect_offset / 4;
7637e995a2eSmrg			indirect_grid[0] = data[offset];
7647e995a2eSmrg			indirect_grid[1] = data[offset + 1];
7657e995a2eSmrg			indirect_grid[2] = data[offset + 2];
7667e995a2eSmrg		}
7677e995a2eSmrg		for (int i = 0; i < 3; i++) {
7687e995a2eSmrg			rctx->cs_block_grid_sizes[i] = info->block[i];
7697e995a2eSmrg			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
7707e995a2eSmrg		}
7717e995a2eSmrg		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
7727e995a2eSmrg		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
7737e995a2eSmrg
7747e995a2eSmrg		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
7757e995a2eSmrg		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
7767e995a2eSmrg
7777e995a2eSmrg		if (need_buf_const) {
7787e995a2eSmrg			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
779af69d88dSmrg		}
7807e995a2eSmrg		r600_update_driver_const_buffers(rctx, true);
7817e995a2eSmrg
7827e995a2eSmrg		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
7837e995a2eSmrg		if (atomic_used_mask) {
7847e995a2eSmrg			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
7857e995a2eSmrg			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
786af69d88dSmrg		}
7877e995a2eSmrg	} else
7887e995a2eSmrg		r600_need_cs_space(rctx, 0, true, 0);
7897e995a2eSmrg
7907e995a2eSmrg	/* Initialize all the compute-related registers.
7917e995a2eSmrg	 *
7927e995a2eSmrg	 * See evergreen_init_atom_start_compute_cs() in this file for the list
7937e995a2eSmrg	 * of registers initialized by the start_compute_cs_cmd atom.
7947e995a2eSmrg	 */
7957e995a2eSmrg	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
7967e995a2eSmrg
7977e995a2eSmrg	/* emit config state */
7987e995a2eSmrg	if (rctx->b.chip_class == EVERGREEN) {
7991463c08dSmrg		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
8001463c08dSmrg		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
8017e995a2eSmrg			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
8027e995a2eSmrg			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
8037e995a2eSmrg			radeon_emit(cs, 0);
8047e995a2eSmrg			radeon_emit(cs, 0);
8057e995a2eSmrg			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
8067e995a2eSmrg		} else
8077e995a2eSmrg			r600_emit_atom(rctx, &rctx->config_state.atom);
808af69d88dSmrg	}
809af69d88dSmrg
8107e995a2eSmrg	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
8117e995a2eSmrg	r600_flush_emit(rctx);
812af69d88dSmrg
8131463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
8141463c08dSmrg	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
815af69d88dSmrg
8167e995a2eSmrg		compute_setup_cbs(rctx);
8177e995a2eSmrg
8187e995a2eSmrg		/* Emit vertex buffer state */
8197e995a2eSmrg		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
8207e995a2eSmrg		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
8217e995a2eSmrg	} else {
8227e995a2eSmrg		uint32_t rat_mask;
8237e995a2eSmrg
8247e995a2eSmrg		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
8257e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
8267e995a2eSmrg					       rat_mask);
8277e995a2eSmrg	}
8287e995a2eSmrg
8297e995a2eSmrg	r600_emit_atom(rctx, &rctx->b.render_cond_atom);
830af69d88dSmrg
831af69d88dSmrg	/* Emit constant buffer state */
8327e995a2eSmrg	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
8337e995a2eSmrg
8347e995a2eSmrg	/* Emit sampler state */
8357e995a2eSmrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
8367e995a2eSmrg
8377e995a2eSmrg	/* Emit sampler view (texture resource) state */
8387e995a2eSmrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
8397e995a2eSmrg
8407e995a2eSmrg	/* Emit images state */
8417e995a2eSmrg	r600_emit_atom(rctx, &rctx->compute_images.atom);
8427e995a2eSmrg
8437e995a2eSmrg	/* Emit buffers state */
8447e995a2eSmrg	r600_emit_atom(rctx, &rctx->compute_buffers.atom);
845af69d88dSmrg
8467e995a2eSmrg	/* Emit shader state */
8477e995a2eSmrg	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
848af69d88dSmrg
849af69d88dSmrg	/* Emit dispatch state and dispatch packet */
8507e995a2eSmrg	evergreen_emit_dispatch(rctx, info, indirect_grid);
851af69d88dSmrg
852af69d88dSmrg	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
853af69d88dSmrg	 */
8547e995a2eSmrg	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
855af69d88dSmrg		      R600_CONTEXT_INV_VERTEX_CACHE |
856af69d88dSmrg	              R600_CONTEXT_INV_TEX_CACHE;
8577e995a2eSmrg	r600_flush_emit(rctx);
8587e995a2eSmrg	rctx->b.flags = 0;
859af69d88dSmrg
8607e995a2eSmrg	if (rctx->b.chip_class >= CAYMAN) {
8617e995a2eSmrg		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
8627e995a2eSmrg		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
863af69d88dSmrg		/* DEALLOC_STATE prevents the GPU from hanging when a
864af69d88dSmrg		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
865af69d88dSmrg		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
866af69d88dSmrg		 */
8677e995a2eSmrg		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
8687e995a2eSmrg		radeon_emit(cs, 0);
869af69d88dSmrg	}
8701463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
8711463c08dSmrg	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
8727e995a2eSmrg		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
873af69d88dSmrg
874af69d88dSmrg#if 0
8757e995a2eSmrg	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
876af69d88dSmrg	for (i = 0; i < cs->cdw; i++) {
8777e995a2eSmrg		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
878af69d88dSmrg	}
879af69d88dSmrg#endif
880af69d88dSmrg
881af69d88dSmrg}
882af69d88dSmrg
883af69d88dSmrg
884af69d88dSmrg/**
885af69d88dSmrg * Emit function for r600_cs_shader_state atom
886af69d88dSmrg */
8877e995a2eSmrgvoid evergreen_emit_cs_shader(struct r600_context *rctx,
8887e995a2eSmrg			      struct r600_atom *atom)
889af69d88dSmrg{
890af69d88dSmrg	struct r600_cs_shader_state *state =
891af69d88dSmrg					(struct r600_cs_shader_state*)atom;
892af69d88dSmrg	struct r600_pipe_compute *shader = state->shader;
8931463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
8947e995a2eSmrg	uint64_t va;
8957e995a2eSmrg	struct r600_resource *code_bo;
8967e995a2eSmrg	unsigned ngpr, nstack;
8977e995a2eSmrg
8981463c08dSmrg	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
8991463c08dSmrg	    shader->ir_type == PIPE_SHADER_IR_NIR) {
9007e995a2eSmrg		code_bo = shader->sel->current->bo;
9017e995a2eSmrg		va = shader->sel->current->bo->gpu_address;
9027e995a2eSmrg		ngpr = shader->sel->current->shader.bc.ngpr;
9037e995a2eSmrg		nstack = shader->sel->current->shader.bc.nstack;
9047e995a2eSmrg	} else {
9057e995a2eSmrg		code_bo = shader->code_bo;
9067e995a2eSmrg		va = shader->code_bo->gpu_address + state->pc;
9077e995a2eSmrg		ngpr = shader->bc.ngpr;
9087e995a2eSmrg		nstack = shader->bc.nstack;
9097e995a2eSmrg	}
910af69d88dSmrg
9117e995a2eSmrg	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
9127e995a2eSmrg	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
913af69d88dSmrg	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
9147e995a2eSmrg			S_0288D4_NUM_GPRS(ngpr) |
9157e995a2eSmrg			S_0288D4_DX10_CLAMP(1) |
9167e995a2eSmrg			S_0288D4_STACK_SIZE(nstack));
917af69d88dSmrg	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
918af69d88dSmrg
919af69d88dSmrg	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
9207e995a2eSmrg	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
9217e995a2eSmrg					      code_bo, RADEON_USAGE_READ,
9227e995a2eSmrg					      RADEON_PRIO_SHADER_BINARY));
923af69d88dSmrg}
924af69d88dSmrg
9257e995a2eSmrgstatic void evergreen_launch_grid(struct pipe_context *ctx,
9267e995a2eSmrg				  const struct pipe_grid_info *info)
927af69d88dSmrg{
9287e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
929af69d88dSmrg#ifdef HAVE_OPENCL
9307e995a2eSmrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
9317e995a2eSmrg	boolean use_kill;
932af69d88dSmrg
9331463c08dSmrg	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
9341463c08dSmrg	    shader->ir_type != PIPE_SHADER_IR_NIR) {
9357e995a2eSmrg		rctx->cs_shader_state.pc = info->pc;
9367e995a2eSmrg		/* Get the config information for this kernel. */
9377e995a2eSmrg		r600_shader_binary_read_config(&shader->binary, &shader->bc,
9387e995a2eSmrg					       info->pc, &use_kill);
9397e995a2eSmrg	} else {
9407e995a2eSmrg		use_kill = false;
9417e995a2eSmrg		rctx->cs_shader_state.pc = 0;
942af69d88dSmrg	}
943af69d88dSmrg#endif
9447e995a2eSmrg
9457e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
9467e995a2eSmrg
9477e995a2eSmrg
9487e995a2eSmrg	evergreen_compute_upload_input(ctx, info);
9497e995a2eSmrg	compute_emit_cs(rctx, info);
950af69d88dSmrg}
951af69d88dSmrg
9527e995a2eSmrgstatic void evergreen_set_compute_resources(struct pipe_context *ctx,
9537e995a2eSmrg					    unsigned start, unsigned count,
9547e995a2eSmrg					    struct pipe_surface **surfaces)
955af69d88dSmrg{
9567e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
957af69d88dSmrg	struct r600_surface **resources = (struct r600_surface **)surfaces;
958af69d88dSmrg
9597e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
960af69d88dSmrg			start, count);
961af69d88dSmrg
962af69d88dSmrg	for (unsigned i = 0; i < count; i++) {
9637e995a2eSmrg		/* The First four vertex buffers are reserved for parameters and
964af69d88dSmrg		 * global buffers. */
9657e995a2eSmrg		unsigned vtx_id = 4 + i;
966af69d88dSmrg		if (resources[i]) {
967af69d88dSmrg			struct r600_resource_global *buffer =
968af69d88dSmrg				(struct r600_resource_global*)
969af69d88dSmrg				resources[i]->base.texture;
970af69d88dSmrg			if (resources[i]->base.writable) {
971af69d88dSmrg				assert(i+1 < 12);
972af69d88dSmrg
9737e995a2eSmrg				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
974af69d88dSmrg				(struct r600_resource *)resources[i]->base.texture,
975af69d88dSmrg				buffer->chunk->start_in_dw*4,
976af69d88dSmrg				resources[i]->base.texture->width0);
977af69d88dSmrg			}
978af69d88dSmrg
9797e995a2eSmrg			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
980af69d88dSmrg					buffer->chunk->start_in_dw * 4,
981af69d88dSmrg					resources[i]->base.texture);
982af69d88dSmrg		}
983af69d88dSmrg	}
984af69d88dSmrg}
985af69d88dSmrg
9867e995a2eSmrgstatic void evergreen_set_global_binding(struct pipe_context *ctx,
9877e995a2eSmrg					 unsigned first, unsigned n,
9887e995a2eSmrg					 struct pipe_resource **resources,
9897e995a2eSmrg					 uint32_t **handles)
990af69d88dSmrg{
9917e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
9927e995a2eSmrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
993af69d88dSmrg	struct r600_resource_global **buffers =
994af69d88dSmrg		(struct r600_resource_global **)resources;
995af69d88dSmrg	unsigned i;
996af69d88dSmrg
9977e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
998af69d88dSmrg			first, n);
999af69d88dSmrg
1000af69d88dSmrg	if (!resources) {
1001af69d88dSmrg		/* XXX: Unset */
1002af69d88dSmrg		return;
1003af69d88dSmrg	}
1004af69d88dSmrg
1005af69d88dSmrg	/* We mark these items for promotion to the pool if they
1006af69d88dSmrg	 * aren't already there */
1007af69d88dSmrg	for (i = first; i < first + n; i++) {
1008af69d88dSmrg		struct compute_memory_item *item = buffers[i]->chunk;
1009af69d88dSmrg
1010af69d88dSmrg		if (!is_item_in_pool(item))
1011af69d88dSmrg			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
1012af69d88dSmrg	}
1013af69d88dSmrg
10147e995a2eSmrg	if (compute_memory_finalize_pending(pool, ctx) == -1) {
1015af69d88dSmrg		/* XXX: Unset */
1016af69d88dSmrg		return;
1017af69d88dSmrg	}
1018af69d88dSmrg
1019af69d88dSmrg	for (i = first; i < first + n; i++)
1020af69d88dSmrg	{
1021af69d88dSmrg		uint32_t buffer_offset;
1022af69d88dSmrg		uint32_t handle;
1023af69d88dSmrg		assert(resources[i]->target == PIPE_BUFFER);
1024af69d88dSmrg		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1025af69d88dSmrg
1026af69d88dSmrg		buffer_offset = util_le32_to_cpu(*(handles[i]));
1027af69d88dSmrg		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1028af69d88dSmrg
1029af69d88dSmrg		*(handles[i]) = util_cpu_to_le32(handle);
1030af69d88dSmrg	}
1031af69d88dSmrg
10327e995a2eSmrg	/* globals for writing */
10337e995a2eSmrg	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
10347e995a2eSmrg	/* globals for reading */
10357e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1036af69d88dSmrg				(struct pipe_resource*)pool->bo);
10377e995a2eSmrg
10387e995a2eSmrg	/* constants for reading, LLVM puts them in text segment */
10397e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
10407e995a2eSmrg				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1041af69d88dSmrg}
1042af69d88dSmrg
1043af69d88dSmrg/**
1044af69d88dSmrg * This function initializes all the compute specific registers that need to
1045af69d88dSmrg * be initialized for each compute command stream.  Registers that are common
1046af69d88dSmrg * to both compute and 3D will be initialized at the beginning of each compute
1047af69d88dSmrg * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1048af69d88dSmrg * packet requires that the shader type bit be set, we must initialize all
1049af69d88dSmrg * context registers needed for compute in this function.  The registers
10507e995a2eSmrg * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1051af69d88dSmrg * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1052af69d88dSmrg * on the GPU family.
1053af69d88dSmrg */
10547e995a2eSmrgvoid evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1055af69d88dSmrg{
10567e995a2eSmrg	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1057af69d88dSmrg	int num_threads;
1058af69d88dSmrg	int num_stack_entries;
1059af69d88dSmrg
10607e995a2eSmrg	/* since all required registers are initialized in the
1061af69d88dSmrg	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1062af69d88dSmrg	 */
1063af69d88dSmrg	r600_init_command_buffer(cb, 256);
1064af69d88dSmrg	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1065af69d88dSmrg
1066af69d88dSmrg	/* We're setting config registers here. */
1067af69d88dSmrg	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1068af69d88dSmrg	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1069af69d88dSmrg
10707e995a2eSmrg	switch (rctx->b.family) {
1071af69d88dSmrg	case CHIP_CEDAR:
1072af69d88dSmrg	default:
1073af69d88dSmrg		num_threads = 128;
1074af69d88dSmrg		num_stack_entries = 256;
1075af69d88dSmrg		break;
1076af69d88dSmrg	case CHIP_REDWOOD:
1077af69d88dSmrg		num_threads = 128;
1078af69d88dSmrg		num_stack_entries = 256;
1079af69d88dSmrg		break;
1080af69d88dSmrg	case CHIP_JUNIPER:
1081af69d88dSmrg		num_threads = 128;
1082af69d88dSmrg		num_stack_entries = 512;
1083af69d88dSmrg		break;
1084af69d88dSmrg	case CHIP_CYPRESS:
1085af69d88dSmrg	case CHIP_HEMLOCK:
1086af69d88dSmrg		num_threads = 128;
1087af69d88dSmrg		num_stack_entries = 512;
1088af69d88dSmrg		break;
1089af69d88dSmrg	case CHIP_PALM:
1090af69d88dSmrg		num_threads = 128;
1091af69d88dSmrg		num_stack_entries = 256;
1092af69d88dSmrg		break;
1093af69d88dSmrg	case CHIP_SUMO:
1094af69d88dSmrg		num_threads = 128;
1095af69d88dSmrg		num_stack_entries = 256;
1096af69d88dSmrg		break;
1097af69d88dSmrg	case CHIP_SUMO2:
1098af69d88dSmrg		num_threads = 128;
1099af69d88dSmrg		num_stack_entries = 512;
1100af69d88dSmrg		break;
1101af69d88dSmrg	case CHIP_BARTS:
1102af69d88dSmrg		num_threads = 128;
1103af69d88dSmrg		num_stack_entries = 512;
1104af69d88dSmrg		break;
1105af69d88dSmrg	case CHIP_TURKS:
1106af69d88dSmrg		num_threads = 128;
1107af69d88dSmrg		num_stack_entries = 256;
1108af69d88dSmrg		break;
1109af69d88dSmrg	case CHIP_CAICOS:
1110af69d88dSmrg		num_threads = 128;
1111af69d88dSmrg		num_stack_entries = 256;
1112af69d88dSmrg		break;
1113af69d88dSmrg	}
1114af69d88dSmrg
1115af69d88dSmrg	/* The primitive type always needs to be POINTLIST for compute. */
1116af69d88dSmrg	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1117af69d88dSmrg						V_008958_DI_PT_POINTLIST);
1118af69d88dSmrg
11197e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1120af69d88dSmrg
1121af69d88dSmrg		/* These registers control which simds can be used by each stage.
1122af69d88dSmrg		 * The default for these registers is 0xffffffff, which means
1123af69d88dSmrg		 * all simds are available for each stage.  It's possible we may
1124af69d88dSmrg		 * want to play around with these in the future, but for now
1125af69d88dSmrg		 * the default value is fine.
1126af69d88dSmrg		 *
1127af69d88dSmrg		 * R_008E20_SQ_STATIC_THREAD_MGMT1
1128af69d88dSmrg		 * R_008E24_SQ_STATIC_THREAD_MGMT2
1129af69d88dSmrg		 * R_008E28_SQ_STATIC_THREAD_MGMT3
1130af69d88dSmrg		 */
1131af69d88dSmrg
11327e995a2eSmrg		/* XXX: We may need to adjust the thread and stack resource
1133af69d88dSmrg		 * values for 3D/compute interop */
1134af69d88dSmrg
1135af69d88dSmrg		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1136af69d88dSmrg
1137af69d88dSmrg		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1138af69d88dSmrg		 * Set the number of threads used by the PS/VS/GS/ES stage to
1139af69d88dSmrg		 * 0.
1140af69d88dSmrg		 */
1141af69d88dSmrg		r600_store_value(cb, 0);
1142af69d88dSmrg
1143af69d88dSmrg		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1144af69d88dSmrg		 * Set the number of threads used by the CS (aka LS) stage to
1145af69d88dSmrg		 * the maximum number of threads and set the number of threads
1146af69d88dSmrg		 * for the HS stage to 0. */
1147af69d88dSmrg		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1148af69d88dSmrg
1149af69d88dSmrg		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1150af69d88dSmrg		 * Set the Control Flow stack entries to 0 for PS/VS stages */
1151af69d88dSmrg		r600_store_value(cb, 0);
1152af69d88dSmrg
1153af69d88dSmrg		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1154af69d88dSmrg		 * Set the Control Flow stack entries to 0 for GS/ES stages */
1155af69d88dSmrg		r600_store_value(cb, 0);
1156af69d88dSmrg
1157af69d88dSmrg		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1158af69d88dSmrg		 * Set the Contol Flow stack entries to 0 for the HS stage, and
1159af69d88dSmrg		 * set it to the maximum value for the CS (aka LS) stage. */
1160af69d88dSmrg		r600_store_value(cb,
1161af69d88dSmrg			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1162af69d88dSmrg	}
1163af69d88dSmrg	/* Give the compute shader all the available LDS space.
1164af69d88dSmrg	 * NOTE: This only sets the maximum number of dwords that a compute
1165af69d88dSmrg	 * shader can allocate.  When a shader is executed, we still need to
1166af69d88dSmrg	 * allocate the appropriate amount of LDS dwords using the
1167af69d88dSmrg	 * CM_R_0288E8_SQ_LDS_ALLOC register.
1168af69d88dSmrg	 */
11697e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1170af69d88dSmrg		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1171af69d88dSmrg			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1172af69d88dSmrg	} else {
1173af69d88dSmrg		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1174af69d88dSmrg			S_0286FC_NUM_PS_LDS(0) |
1175af69d88dSmrg			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1176af69d88dSmrg	}
1177af69d88dSmrg
1178af69d88dSmrg	/* Context Registers */
1179af69d88dSmrg
11807e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1181af69d88dSmrg		/* workaround for hw issues with dyn gpr - must set all limits
1182af69d88dSmrg		 * to 240 instead of 0, 0x1e == 240 / 8
1183af69d88dSmrg		 */
1184af69d88dSmrg		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1185af69d88dSmrg				S_028838_PS_GPRS(0x1e) |
1186af69d88dSmrg				S_028838_VS_GPRS(0x1e) |
1187af69d88dSmrg				S_028838_GS_GPRS(0x1e) |
1188af69d88dSmrg				S_028838_ES_GPRS(0x1e) |
1189af69d88dSmrg				S_028838_HS_GPRS(0x1e) |
1190af69d88dSmrg				S_028838_LS_GPRS(0x1e));
1191af69d88dSmrg	}
1192af69d88dSmrg
1193af69d88dSmrg	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1194af69d88dSmrg	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1195af69d88dSmrg		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1196af69d88dSmrg
1197af69d88dSmrg	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1198af69d88dSmrg
1199af69d88dSmrg	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
12007e995a2eSmrg			       S_0286E8_TID_IN_GROUP_ENA(1) |
12017e995a2eSmrg			       S_0286E8_TGID_ENA(1) |
12027e995a2eSmrg			       S_0286E8_DISABLE_INDEX_PACK(1));
1203af69d88dSmrg
1204af69d88dSmrg	/* The LOOP_CONST registers are an optimizations for loops that allows
1205af69d88dSmrg	 * you to store the initial counter, increment value, and maximum
1206af69d88dSmrg	 * counter value in a register so that hardware can calculate the
1207af69d88dSmrg	 * correct number of iterations for the loop, so that you don't need
1208af69d88dSmrg	 * to have the loop counter in your shader code.  We don't currently use
1209af69d88dSmrg	 * this optimization, so we must keep track of the counter in the
1210af69d88dSmrg	 * shader and use a break instruction to exit loops.  However, the
1211af69d88dSmrg	 * hardware will still uses this register to determine when to exit a
1212af69d88dSmrg	 * loop, so we need to initialize the counter to 0, set the increment
1213af69d88dSmrg	 * value to 1 and the maximum counter value to the 4095 (0xfff) which
1214af69d88dSmrg	 * is the maximum value allowed.  This gives us a maximum of 4096
1215af69d88dSmrg	 * iterations for our loops, but hopefully our break instruction will
1216af69d88dSmrg	 * execute before some time before the 4096th iteration.
1217af69d88dSmrg	 */
1218af69d88dSmrg	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1219af69d88dSmrg}
1220af69d88dSmrg
12217e995a2eSmrgvoid evergreen_init_compute_state_functions(struct r600_context *rctx)
1222af69d88dSmrg{
12237e995a2eSmrg	rctx->b.b.create_compute_state = evergreen_create_compute_state;
12247e995a2eSmrg	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
12257e995a2eSmrg	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
12267e995a2eSmrg//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
12277e995a2eSmrg	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
12287e995a2eSmrg	rctx->b.b.set_global_binding = evergreen_set_global_binding;
12297e995a2eSmrg	rctx->b.b.launch_grid = evergreen_launch_grid;
1230af69d88dSmrg
1231af69d88dSmrg}
1232af69d88dSmrg
12331463c08dSmrgvoid *r600_compute_global_transfer_map(struct pipe_context *ctx,
12341463c08dSmrg				      struct pipe_resource *resource,
12351463c08dSmrg				      unsigned level,
12361463c08dSmrg				      unsigned usage,
12371463c08dSmrg				      const struct pipe_box *box,
12381463c08dSmrg				      struct pipe_transfer **ptransfer)
1239af69d88dSmrg{
12407e995a2eSmrg	struct r600_context *rctx = (struct r600_context*)ctx;
1241af69d88dSmrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
1242af69d88dSmrg	struct r600_resource_global* buffer =
1243af69d88dSmrg		(struct r600_resource_global*)resource;
1244af69d88dSmrg
1245af69d88dSmrg	struct compute_memory_item *item = buffer->chunk;
1246af69d88dSmrg	struct pipe_resource *dst = NULL;
1247af69d88dSmrg	unsigned offset = box->x;
1248af69d88dSmrg
1249af69d88dSmrg	if (is_item_in_pool(item)) {
12507e995a2eSmrg		compute_memory_demote_item(pool, item, ctx);
1251af69d88dSmrg	}
1252af69d88dSmrg	else {
1253af69d88dSmrg		if (item->real_buffer == NULL) {
12547e995a2eSmrg			item->real_buffer =
1255af69d88dSmrg					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1256af69d88dSmrg		}
1257af69d88dSmrg	}
1258af69d88dSmrg
1259af69d88dSmrg	dst = (struct pipe_resource*)item->real_buffer;
1260af69d88dSmrg
12611463c08dSmrg	if (usage & PIPE_MAP_READ)
1262af69d88dSmrg		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1263af69d88dSmrg
1264af69d88dSmrg	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1265af69d88dSmrg			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1266af69d88dSmrg			"width = %u, height = %u, depth = %u)\n", level, usage,
1267af69d88dSmrg			box->x, box->y, box->z, box->width, box->height,
1268af69d88dSmrg			box->depth);
1269af69d88dSmrg	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1270af69d88dSmrg		"%u (box.x)\n", item->id, box->x);
1271af69d88dSmrg
1272af69d88dSmrg
1273af69d88dSmrg	assert(resource->target == PIPE_BUFFER);
1274af69d88dSmrg	assert(resource->bind & PIPE_BIND_GLOBAL);
1275af69d88dSmrg	assert(box->x >= 0);
1276af69d88dSmrg	assert(box->y == 0);
1277af69d88dSmrg	assert(box->z == 0);
1278af69d88dSmrg
1279af69d88dSmrg	///TODO: do it better, mapping is not possible if the pool is too big
12807e995a2eSmrg	return pipe_buffer_map_range(ctx, dst,
1281af69d88dSmrg			offset, box->width, usage, ptransfer);
1282af69d88dSmrg}
1283af69d88dSmrg
12841463c08dSmrgvoid r600_compute_global_transfer_unmap(struct pipe_context *ctx,
12851463c08dSmrg					struct pipe_transfer *transfer)
1286af69d88dSmrg{
1287af69d88dSmrg	/* struct r600_resource_global are not real resources, they just map
1288af69d88dSmrg	 * to an offset within the compute memory pool.  The function
1289af69d88dSmrg	 * r600_compute_global_transfer_map() maps the memory pool
1290af69d88dSmrg	 * resource rather than the struct r600_resource_global passed to
12911463c08dSmrg	 * it as an argument and then initializes ptransfer->resource with
1292af69d88dSmrg	 * the memory pool resource (via pipe_buffer_map_range).
1293af69d88dSmrg	 * When transfer_unmap is called it uses the memory pool's
1294af69d88dSmrg	 * vtable which calls r600_buffer_transfer_map() rather than
1295af69d88dSmrg	 * this function.
1296af69d88dSmrg	 */
1297af69d88dSmrg	assert (!"This function should not be called");
1298af69d88dSmrg}
1299af69d88dSmrg
13001463c08dSmrgvoid r600_compute_global_buffer_destroy(struct pipe_screen *screen,
13011463c08dSmrg					struct pipe_resource *res)
1302af69d88dSmrg{
13037e995a2eSmrg	struct r600_resource_global* buffer = NULL;
13047e995a2eSmrg	struct r600_screen* rscreen = NULL;
13057e995a2eSmrg
13067e995a2eSmrg	assert(res->target == PIPE_BUFFER);
13077e995a2eSmrg	assert(res->bind & PIPE_BIND_GLOBAL);
13087e995a2eSmrg
13097e995a2eSmrg	buffer = (struct r600_resource_global*)res;
13107e995a2eSmrg	rscreen = (struct r600_screen*)screen;
13117e995a2eSmrg
13127e995a2eSmrg	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
13137e995a2eSmrg
13147e995a2eSmrg	buffer->chunk = NULL;
13157e995a2eSmrg	free(res);
13167e995a2eSmrg}
13177e995a2eSmrg
13187e995a2eSmrgstruct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
13197e995a2eSmrg							const struct pipe_resource *templ)
13207e995a2eSmrg{
13217e995a2eSmrg	struct r600_resource_global* result = NULL;
13227e995a2eSmrg	struct r600_screen* rscreen = NULL;
13237e995a2eSmrg	int size_in_dw = 0;
13247e995a2eSmrg
13257e995a2eSmrg	assert(templ->target == PIPE_BUFFER);
13267e995a2eSmrg	assert(templ->bind & PIPE_BIND_GLOBAL);
13277e995a2eSmrg	assert(templ->array_size == 1 || templ->array_size == 0);
13287e995a2eSmrg	assert(templ->depth0 == 1 || templ->depth0 == 0);
13297e995a2eSmrg	assert(templ->height0 == 1 || templ->height0 == 0);
13307e995a2eSmrg
13317e995a2eSmrg	result = (struct r600_resource_global*)
13327e995a2eSmrg	CALLOC(sizeof(struct r600_resource_global), 1);
13337e995a2eSmrg	rscreen = (struct r600_screen*)screen;
13347e995a2eSmrg
13357e995a2eSmrg	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
13367e995a2eSmrg	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
13377e995a2eSmrg			templ->array_size);
13387e995a2eSmrg
13397e995a2eSmrg	result->base.b.b = *templ;
13407e995a2eSmrg	result->base.b.b.screen = screen;
13411463c08dSmrg	result->base.compute_global_bo = true;
13427e995a2eSmrg	pipe_reference_init(&result->base.b.b.reference, 1);
13437e995a2eSmrg
13447e995a2eSmrg	size_in_dw = (templ->width0+3) / 4;
13457e995a2eSmrg
13467e995a2eSmrg	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
13477e995a2eSmrg
13487e995a2eSmrg	if (result->chunk == NULL)
13497e995a2eSmrg	{
13507e995a2eSmrg		free(result);
13517e995a2eSmrg		return NULL;
13527e995a2eSmrg	}
13537e995a2eSmrg
13547e995a2eSmrg	return &result->base.b.b;
1355af69d88dSmrg}
1356