1af69d88dSmrg/*
2af69d88dSmrg * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3af69d88dSmrg *
4af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a
5af69d88dSmrg * copy of this software and associated documentation files (the "Software"),
6af69d88dSmrg * to deal in the Software without restriction, including without limitation
7af69d88dSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub
8af69d88dSmrg * license, and/or sell copies of the Software, and to permit persons to whom
9af69d88dSmrg * the Software is furnished to do so, subject to the following conditions:
10af69d88dSmrg *
11af69d88dSmrg * The above copyright notice and this permission notice (including the next
12af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the
13af69d88dSmrg * Software.
14af69d88dSmrg *
15af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18af69d88dSmrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19af69d88dSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20af69d88dSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21af69d88dSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
22af69d88dSmrg *
23af69d88dSmrg * Authors:
24af69d88dSmrg *      Adam Rak <adam.rak@streamnovation.com>
25af69d88dSmrg */
26af69d88dSmrg
277e995a2eSmrg#ifdef HAVE_OPENCL
287e995a2eSmrg#include <gelf.h>
297e995a2eSmrg#include <libelf.h>
307e995a2eSmrg#endif
31af69d88dSmrg#include <stdio.h>
32af69d88dSmrg#include <errno.h>
33af69d88dSmrg#include "pipe/p_defines.h"
34af69d88dSmrg#include "pipe/p_state.h"
35af69d88dSmrg#include "pipe/p_context.h"
36af69d88dSmrg#include "util/u_blitter.h"
377e995a2eSmrg#include "util/list.h"
38af69d88dSmrg#include "util/u_transfer.h"
39af69d88dSmrg#include "util/u_surface.h"
40af69d88dSmrg#include "util/u_pack_color.h"
41af69d88dSmrg#include "util/u_memory.h"
42af69d88dSmrg#include "util/u_inlines.h"
43af69d88dSmrg#include "util/u_framebuffer.h"
447e995a2eSmrg#include "tgsi/tgsi_parse.h"
45af69d88dSmrg#include "pipebuffer/pb_buffer.h"
46af69d88dSmrg#include "evergreend.h"
47af69d88dSmrg#include "r600_shader.h"
48af69d88dSmrg#include "r600_pipe.h"
49af69d88dSmrg#include "r600_formats.h"
50af69d88dSmrg#include "evergreen_compute.h"
51af69d88dSmrg#include "evergreen_compute_internal.h"
52af69d88dSmrg#include "compute_memory_pool.h"
53af69d88dSmrg#include "sb/sb_public.h"
54af69d88dSmrg#include <inttypes.h>
55af69d88dSmrg
/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too. => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, VTX slots too because of linear indexing

*/
86af69d88dSmrg
871463c08dSmrg#ifdef HAVE_OPENCL
881463c08dSmrgstatic void radeon_shader_binary_init(struct r600_shader_binary *b)
891463c08dSmrg{
901463c08dSmrg	memset(b, 0, sizeof(*b));
911463c08dSmrg}
921463c08dSmrg
931463c08dSmrgstatic void radeon_shader_binary_clean(struct r600_shader_binary *b)
941463c08dSmrg{
951463c08dSmrg	if (!b)
961463c08dSmrg		return;
971463c08dSmrg	FREE(b->code);
981463c08dSmrg	FREE(b->config);
991463c08dSmrg	FREE(b->rodata);
1001463c08dSmrg	FREE(b->global_symbol_offsets);
1011463c08dSmrg	FREE(b->relocs);
1021463c08dSmrg	FREE(b->disasm_string);
1031463c08dSmrg}
1041463c08dSmrg#endif
1051463c08dSmrg
1067e995a2eSmrgstruct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
1077e995a2eSmrg						     unsigned size)
108af69d88dSmrg{
1097e995a2eSmrg	struct pipe_resource *buffer = NULL;
110af69d88dSmrg	assert(size);
111af69d88dSmrg
1127e995a2eSmrg	buffer = pipe_buffer_create((struct pipe_screen*) screen,
1137e995a2eSmrg				    0, PIPE_USAGE_IMMUTABLE, size);
114af69d88dSmrg
115af69d88dSmrg	return (struct r600_resource *)buffer;
116af69d88dSmrg}
117af69d88dSmrg
118af69d88dSmrg
/* Bind 'bo' as RAT (Random Access Target) number 'id' by installing it as a
 * color buffer in the compute framebuffer state.  'start' and 'size' are only
 * validated here (size dword-aligned, start 256-byte aligned); the surface
 * itself always covers the whole resource as R32_UINT. */
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	/* The hardware exposes 12 RAT slots (see the file-header comment). */
	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	/* Fill in the hardware-specific surface fields for RAT usage. */
	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
163af69d88dSmrg
1647e995a2eSmrgstatic void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
1657e995a2eSmrg					   unsigned vb_index,
1667e995a2eSmrg					   unsigned offset,
1677e995a2eSmrg					   struct pipe_resource *buffer)
168af69d88dSmrg{
169af69d88dSmrg	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
170af69d88dSmrg	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
171af69d88dSmrg	vb->stride = 1;
172af69d88dSmrg	vb->buffer_offset = offset;
1737e995a2eSmrg	vb->buffer.resource = buffer;
1747e995a2eSmrg	vb->is_user_buffer = false;
175af69d88dSmrg
176af69d88dSmrg	/* The vertex instructions in the compute shaders use the texture cache,
177af69d88dSmrg	 * so we need to invalidate it. */
178af69d88dSmrg	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
179af69d88dSmrg	state->enabled_mask |= 1 << vb_index;
180af69d88dSmrg	state->dirty_mask |= 1 << vb_index;
1817e995a2eSmrg	r600_mark_atom_dirty(rctx, &state->atom);
182af69d88dSmrg}
183af69d88dSmrg
1847e995a2eSmrgstatic void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
1857e995a2eSmrg					     unsigned cb_index,
1867e995a2eSmrg					     unsigned offset,
1877e995a2eSmrg					     unsigned size,
1887e995a2eSmrg					     struct pipe_resource *buffer)
189af69d88dSmrg{
190af69d88dSmrg	struct pipe_constant_buffer cb;
191af69d88dSmrg	cb.buffer_size = size;
192af69d88dSmrg	cb.buffer_offset = offset;
193af69d88dSmrg	cb.buffer = buffer;
194af69d88dSmrg	cb.user_buffer = NULL;
195af69d88dSmrg
1961463c08dSmrg	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
197af69d88dSmrg}
198af69d88dSmrg
1997e995a2eSmrg/* We need to define these R600 registers here, because we can't include
2007e995a2eSmrg * evergreend.h and r600d.h.
2017e995a2eSmrg */
2027e995a2eSmrg#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
2037e995a2eSmrg#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
2047e995a2eSmrg
2057e995a2eSmrg#ifdef HAVE_OPENCL
2067e995a2eSmrgstatic void parse_symbol_table(Elf_Data *symbol_table_data,
2077e995a2eSmrg				const GElf_Shdr *symbol_table_header,
2081463c08dSmrg				struct r600_shader_binary *binary)
209af69d88dSmrg{
2107e995a2eSmrg	GElf_Sym symbol;
2117e995a2eSmrg	unsigned i = 0;
2127e995a2eSmrg	unsigned symbol_count =
2137e995a2eSmrg		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
2147e995a2eSmrg
2157e995a2eSmrg	/* We are over allocating this list, because symbol_count gives the
2167e995a2eSmrg	 * total number of symbols, and we will only be filling the list
2177e995a2eSmrg	 * with offsets of global symbols.  The memory savings from
2187e995a2eSmrg	 * allocating the correct size of this list will be small, and
2197e995a2eSmrg	 * I don't think it is worth the cost of pre-computing the number
2207e995a2eSmrg	 * of global symbols.
2217e995a2eSmrg	 */
2227e995a2eSmrg	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
2237e995a2eSmrg
2247e995a2eSmrg	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
2257e995a2eSmrg		unsigned i;
2267e995a2eSmrg		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
2277e995a2eSmrg		    symbol.st_shndx == 0 /* Undefined symbol */) {
2287e995a2eSmrg			continue;
2297e995a2eSmrg		}
230af69d88dSmrg
2317e995a2eSmrg		binary->global_symbol_offsets[binary->global_symbol_count] =
2327e995a2eSmrg					symbol.st_value;
233af69d88dSmrg
2347e995a2eSmrg		/* Sort the list using bubble sort.  This list will usually
2357e995a2eSmrg		 * be small. */
2367e995a2eSmrg		for (i = binary->global_symbol_count; i > 0; --i) {
2377e995a2eSmrg			uint64_t lhs = binary->global_symbol_offsets[i - 1];
2387e995a2eSmrg			uint64_t rhs = binary->global_symbol_offsets[i];
2397e995a2eSmrg			if (lhs < rhs) {
2407e995a2eSmrg				break;
2417e995a2eSmrg			}
2427e995a2eSmrg			binary->global_symbol_offsets[i] = lhs;
2437e995a2eSmrg			binary->global_symbol_offsets[i - 1] = rhs;
2447e995a2eSmrg		}
2457e995a2eSmrg		++binary->global_symbol_count;
2467e995a2eSmrg	}
2477e995a2eSmrg}
2487e995a2eSmrg
2497e995a2eSmrg
2507e995a2eSmrgstatic void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
2517e995a2eSmrg			unsigned symbol_sh_link,
2521463c08dSmrg			struct r600_shader_binary *binary)
253af69d88dSmrg{
2547e995a2eSmrg	unsigned i;
255af69d88dSmrg
2567e995a2eSmrg	if (!relocs || !symbols || !binary->reloc_count) {
2577e995a2eSmrg		return;
2587e995a2eSmrg	}
2597e995a2eSmrg	binary->relocs = CALLOC(binary->reloc_count,
2601463c08dSmrg			sizeof(struct r600_shader_reloc));
2617e995a2eSmrg	for (i = 0; i < binary->reloc_count; i++) {
2627e995a2eSmrg		GElf_Sym symbol;
2637e995a2eSmrg		GElf_Rel rel;
2647e995a2eSmrg		char *symbol_name;
2651463c08dSmrg		struct r600_shader_reloc *reloc = &binary->relocs[i];
2667e995a2eSmrg
2677e995a2eSmrg		gelf_getrel(relocs, i, &rel);
2687e995a2eSmrg		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
2697e995a2eSmrg		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
2707e995a2eSmrg
2717e995a2eSmrg		reloc->offset = rel.r_offset;
2727e995a2eSmrg		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
2737e995a2eSmrg		reloc->name[sizeof(reloc->name)-1] = 0;
2747e995a2eSmrg	}
2757e995a2eSmrg}
2767e995a2eSmrg
2777e995a2eSmrgstatic void r600_elf_read(const char *elf_data, unsigned elf_size,
2781463c08dSmrg		 struct r600_shader_binary *binary)
2797e995a2eSmrg{
2807e995a2eSmrg	char *elf_buffer;
2817e995a2eSmrg	Elf *elf;
2827e995a2eSmrg	Elf_Scn *section = NULL;
2837e995a2eSmrg	Elf_Data *symbols = NULL, *relocs = NULL;
2847e995a2eSmrg	size_t section_str_index;
2857e995a2eSmrg	unsigned symbol_sh_link = 0;
2867e995a2eSmrg
2877e995a2eSmrg	/* One of the libelf implementations
2887e995a2eSmrg	 * (http://www.mr511.de/software/english.htm) requires calling
2897e995a2eSmrg	 * elf_version() before elf_memory().
2907e995a2eSmrg	 */
2917e995a2eSmrg	elf_version(EV_CURRENT);
2927e995a2eSmrg	elf_buffer = MALLOC(elf_size);
2937e995a2eSmrg	memcpy(elf_buffer, elf_data, elf_size);
2947e995a2eSmrg
2957e995a2eSmrg	elf = elf_memory(elf_buffer, elf_size);
2967e995a2eSmrg
2977e995a2eSmrg	elf_getshdrstrndx(elf, &section_str_index);
2987e995a2eSmrg
2997e995a2eSmrg	while ((section = elf_nextscn(elf, section))) {
3007e995a2eSmrg		const char *name;
3017e995a2eSmrg		Elf_Data *section_data = NULL;
3027e995a2eSmrg		GElf_Shdr section_header;
3037e995a2eSmrg		if (gelf_getshdr(section, &section_header) != &section_header) {
3047e995a2eSmrg			fprintf(stderr, "Failed to read ELF section header\n");
3057e995a2eSmrg			return;
3067e995a2eSmrg		}
3077e995a2eSmrg		name = elf_strptr(elf, section_str_index, section_header.sh_name);
3087e995a2eSmrg		if (!strcmp(name, ".text")) {
3097e995a2eSmrg			section_data = elf_getdata(section, section_data);
3107e995a2eSmrg			binary->code_size = section_data->d_size;
3117e995a2eSmrg			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
3127e995a2eSmrg			memcpy(binary->code, section_data->d_buf, binary->code_size);
3137e995a2eSmrg		} else if (!strcmp(name, ".AMDGPU.config")) {
3147e995a2eSmrg			section_data = elf_getdata(section, section_data);
3157e995a2eSmrg			binary->config_size = section_data->d_size;
3167e995a2eSmrg			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
3177e995a2eSmrg			memcpy(binary->config, section_data->d_buf, binary->config_size);
3187e995a2eSmrg		} else if (!strcmp(name, ".AMDGPU.disasm")) {
3197e995a2eSmrg			/* Always read disassembly if it's available. */
3207e995a2eSmrg			section_data = elf_getdata(section, section_data);
3217e995a2eSmrg			binary->disasm_string = strndup(section_data->d_buf,
3227e995a2eSmrg							section_data->d_size);
3237e995a2eSmrg		} else if (!strncmp(name, ".rodata", 7)) {
3247e995a2eSmrg			section_data = elf_getdata(section, section_data);
3257e995a2eSmrg			binary->rodata_size = section_data->d_size;
3267e995a2eSmrg			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
3277e995a2eSmrg			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
3287e995a2eSmrg		} else if (!strncmp(name, ".symtab", 7)) {
3297e995a2eSmrg			symbols = elf_getdata(section, section_data);
3307e995a2eSmrg			symbol_sh_link = section_header.sh_link;
3317e995a2eSmrg			parse_symbol_table(symbols, &section_header, binary);
3327e995a2eSmrg		} else if (!strcmp(name, ".rel.text")) {
3337e995a2eSmrg			relocs = elf_getdata(section, section_data);
3347e995a2eSmrg			binary->reloc_count = section_header.sh_size /
3357e995a2eSmrg					section_header.sh_entsize;
3367e995a2eSmrg		}
3377e995a2eSmrg	}
3387e995a2eSmrg
3397e995a2eSmrg	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
3407e995a2eSmrg
3417e995a2eSmrg	if (elf){
3427e995a2eSmrg		elf_end(elf);
3437e995a2eSmrg	}
3447e995a2eSmrg	FREE(elf_buffer);
3457e995a2eSmrg
3467e995a2eSmrg	/* Cache the config size per symbol */
3477e995a2eSmrg	if (binary->global_symbol_count) {
3487e995a2eSmrg		binary->config_size_per_symbol =
3497e995a2eSmrg			binary->config_size / binary->global_symbol_count;
3507e995a2eSmrg	} else {
3517e995a2eSmrg		binary->global_symbol_count = 1;
3527e995a2eSmrg		binary->config_size_per_symbol = binary->config_size;
3537e995a2eSmrg	}
3547e995a2eSmrg}
3557e995a2eSmrg
3567e995a2eSmrgstatic const unsigned char *r600_shader_binary_config_start(
3571463c08dSmrg	const struct r600_shader_binary *binary,
3587e995a2eSmrg	uint64_t symbol_offset)
3597e995a2eSmrg{
360af69d88dSmrg	unsigned i;
3617e995a2eSmrg	for (i = 0; i < binary->global_symbol_count; ++i) {
3627e995a2eSmrg		if (binary->global_symbol_offsets[i] == symbol_offset) {
3637e995a2eSmrg			unsigned offset = i * binary->config_size_per_symbol;
3647e995a2eSmrg			return binary->config + offset;
3657e995a2eSmrg		}
3667e995a2eSmrg	}
3677e995a2eSmrg	return binary->config;
3687e995a2eSmrg}
369af69d88dSmrg
3701463c08dSmrgstatic void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
3717e995a2eSmrg					   struct r600_bytecode *bc,
3727e995a2eSmrg					   uint64_t symbol_offset,
3737e995a2eSmrg					   boolean *use_kill)
3747e995a2eSmrg{
3757e995a2eSmrg       unsigned i;
3767e995a2eSmrg       const unsigned char *config =
3777e995a2eSmrg               r600_shader_binary_config_start(binary, symbol_offset);
3787e995a2eSmrg
3797e995a2eSmrg       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
3807e995a2eSmrg               unsigned reg =
3817e995a2eSmrg                       util_le32_to_cpu(*(uint32_t*)(config + i));
3827e995a2eSmrg               unsigned value =
3837e995a2eSmrg                       util_le32_to_cpu(*(uint32_t*)(config + i + 4));
3847e995a2eSmrg               switch (reg) {
3857e995a2eSmrg               /* R600 / R700 */
3867e995a2eSmrg               case R_028850_SQ_PGM_RESOURCES_PS:
3877e995a2eSmrg               case R_028868_SQ_PGM_RESOURCES_VS:
3887e995a2eSmrg               /* Evergreen / Northern Islands */
3897e995a2eSmrg               case R_028844_SQ_PGM_RESOURCES_PS:
3907e995a2eSmrg               case R_028860_SQ_PGM_RESOURCES_VS:
3917e995a2eSmrg               case R_0288D4_SQ_PGM_RESOURCES_LS:
3927e995a2eSmrg                       bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
3937e995a2eSmrg                       bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
3947e995a2eSmrg                       break;
3957e995a2eSmrg               case R_02880C_DB_SHADER_CONTROL:
3967e995a2eSmrg                       *use_kill = G_02880C_KILL_ENABLE(value);
3977e995a2eSmrg                       break;
3987e995a2eSmrg               case R_0288E8_SQ_LDS_ALLOC:
3997e995a2eSmrg                       bc->nlds_dw = value;
4007e995a2eSmrg                       break;
4017e995a2eSmrg               }
4027e995a2eSmrg       }
4037e995a2eSmrg}
404af69d88dSmrg
4057e995a2eSmrgstatic unsigned r600_create_shader(struct r600_bytecode *bc,
4061463c08dSmrg				   const struct r600_shader_binary *binary,
4077e995a2eSmrg				   boolean *use_kill)
4087e995a2eSmrg
4097e995a2eSmrg{
4107e995a2eSmrg	assert(binary->code_size % 4 == 0);
4117e995a2eSmrg	bc->bytecode = CALLOC(1, binary->code_size);
4127e995a2eSmrg	memcpy(bc->bytecode, binary->code, binary->code_size);
4137e995a2eSmrg	bc->ndw = binary->code_size / 4;
4147e995a2eSmrg
4157e995a2eSmrg	r600_shader_binary_read_config(binary, bc, 0, use_kill);
4167e995a2eSmrg	return 0;
4177e995a2eSmrg}
418af69d88dSmrg
419af69d88dSmrg#endif
420af69d88dSmrg
4217e995a2eSmrgstatic void r600_destroy_shader(struct r600_bytecode *bc)
4227e995a2eSmrg{
4237e995a2eSmrg	FREE(bc->bytecode);
4247e995a2eSmrg}
4257e995a2eSmrg
/* pipe_context::create_compute_state implementation.
 * For TGSI/NIR the tokens are handed to the shader-selector machinery and
 * compiled later at bind time.  For the OpenCL (native binary) path the ELF
 * blob is parsed immediately and the code is uploaded to a VRAM buffer.
 * Returns an opaque r600_pipe_compute, freed by
 * evergreen_delete_compute_state(). */
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	boolean use_kill;
#endif

	/* Record the memory requirements declared by the state object. */
	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		/* Compilation is deferred; only a selector is created here. */
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}
469af69d88dSmrg
/* pipe_context::delete_compute_state implementation.  Tears down whichever
 * representation evergreen_create_compute_state() built: the shader selector
 * for TGSI/NIR, or the parsed ELF binary, code buffer and kernel-parameter
 * buffer for the OpenCL path. */
static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		/* Drop everything owned by the native-binary path; the
		 * reference drops also release the GPU buffers. */
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}
493af69d88dSmrg
4947e995a2eSmrgstatic void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
495af69d88dSmrg{
4967e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
4977e995a2eSmrg	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
4987e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
4997e995a2eSmrg
5007e995a2eSmrg	if (!state) {
5017e995a2eSmrg		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
5027e995a2eSmrg		return;
5037e995a2eSmrg	}
5047e995a2eSmrg
5051463c08dSmrg	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
5061463c08dSmrg	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
5077e995a2eSmrg		bool compute_dirty;
5081463c08dSmrg		cstate->sel->ir_type = cstate->ir_type;
5091463c08dSmrg		if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
5101463c08dSmrg			R600_ERR("Failed to select compute shader\n");
5117e995a2eSmrg	}
5121463c08dSmrg
5137e995a2eSmrg	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
514af69d88dSmrg}
515af69d88dSmrg
516af69d88dSmrg/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit
517af69d88dSmrg * kernel parameters there are implicit parameters that need to be stored
518af69d88dSmrg * in the vertex buffer as well.  Here is how these parameters are organized in
519af69d88dSmrg * the buffer:
520af69d88dSmrg *
521af69d88dSmrg * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
522af69d88dSmrg * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
523af69d88dSmrg * DWORDS 6-8: Number of work items within each work group in each dimension
524af69d88dSmrg *             (x,y,z)
525af69d88dSmrg * DWORDS 9+ : Kernel parameters
526af69d88dSmrg */
5277e995a2eSmrgstatic void evergreen_compute_upload_input(struct pipe_context *ctx,
5287e995a2eSmrg					   const struct pipe_grid_info *info)
529af69d88dSmrg{
5307e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
5317e995a2eSmrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
532af69d88dSmrg	unsigned i;
533af69d88dSmrg	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
534af69d88dSmrg	 * parameters.
535af69d88dSmrg	 */
5367e995a2eSmrg	unsigned input_size;
5377e995a2eSmrg	uint32_t *num_work_groups_start;
5387e995a2eSmrg	uint32_t *global_size_start;
5397e995a2eSmrg	uint32_t *local_size_start;
5407e995a2eSmrg	uint32_t *kernel_parameters_start;
541af69d88dSmrg	struct pipe_box box;
542af69d88dSmrg	struct pipe_transfer *transfer = NULL;
543af69d88dSmrg
5447e995a2eSmrg	if (!shader)
5457e995a2eSmrg		return;
546af69d88dSmrg	if (shader->input_size == 0) {
547af69d88dSmrg		return;
548af69d88dSmrg	}
5497e995a2eSmrg	input_size = shader->input_size + 36;
550af69d88dSmrg	if (!shader->kernel_param) {
551af69d88dSmrg		/* Add space for the grid dimensions */
552af69d88dSmrg		shader->kernel_param = (struct r600_resource *)
5537e995a2eSmrg			pipe_buffer_create(ctx->screen, 0,
554af69d88dSmrg					PIPE_USAGE_IMMUTABLE, input_size);
555af69d88dSmrg	}
556af69d88dSmrg
557af69d88dSmrg	u_box_1d(0, input_size, &box);
5581463c08dSmrg	num_work_groups_start = ctx->buffer_map(ctx,
559af69d88dSmrg			(struct pipe_resource*)shader->kernel_param,
5601463c08dSmrg			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
561af69d88dSmrg			&box, &transfer);
562af69d88dSmrg	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
563af69d88dSmrg	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
564af69d88dSmrg	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
565af69d88dSmrg
566af69d88dSmrg	/* Copy the work group size */
5677e995a2eSmrg	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
568af69d88dSmrg
569af69d88dSmrg	/* Copy the global size */
570af69d88dSmrg	for (i = 0; i < 3; i++) {
5717e995a2eSmrg		global_size_start[i] = info->grid[i] * info->block[i];
572af69d88dSmrg	}
573af69d88dSmrg
574af69d88dSmrg	/* Copy the local dimensions */
5757e995a2eSmrg	memcpy(local_size_start, info->block, 3 * sizeof(uint));
576af69d88dSmrg
577af69d88dSmrg	/* Copy the kernel inputs */
5787e995a2eSmrg	memcpy(kernel_parameters_start, info->input, shader->input_size);
579af69d88dSmrg
580af69d88dSmrg	for (i = 0; i < (input_size / 4); i++) {
5817e995a2eSmrg		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
582af69d88dSmrg			((unsigned*)num_work_groups_start)[i]);
583af69d88dSmrg	}
584af69d88dSmrg
5851463c08dSmrg	ctx->buffer_unmap(ctx, transfer);
586af69d88dSmrg
5877e995a2eSmrg	/* ID=0 and ID=3 are reserved for the parameters.
5887e995a2eSmrg	 * LLVM will preferably use ID=0, but it does not work for dynamic
5897e995a2eSmrg	 * indices. */
5907e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
5917e995a2eSmrg			(struct pipe_resource*)shader->kernel_param);
5927e995a2eSmrg	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
593af69d88dSmrg			(struct pipe_resource*)shader->kernel_param);
594af69d88dSmrg}
595af69d88dSmrg
/* Emit the PM4 packets for one compute dispatch: thread-group geometry,
 * LDS allocation and the DISPATCH_DIRECT packet itself.  For indirect
 * dispatches the caller has already read the grid dimensions into
 * 'indirect_grid' and they are emitted as if direct. */
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;	/* NOTE(review): computed below but never used */
	unsigned lds_size = shader->local_size / 4;	/* bytes -> dwords */

	/* Native (OpenCL ELF) shaders carry an extra LDS requirement parsed
	 * from their config registers. */
	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	if (info->indirect) {
		/* Grid was read back from the indirect buffer beforehand, so
		 * a DIRECT packet with those values is emitted. */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}
678af69d88dSmrg
6797e995a2eSmrgstatic void compute_setup_cbs(struct r600_context *rctx)
680af69d88dSmrg{
6811463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
682af69d88dSmrg	unsigned i;
683af69d88dSmrg
684af69d88dSmrg	/* Emit colorbuffers. */
685af69d88dSmrg	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
6867e995a2eSmrg	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
6877e995a2eSmrg		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
6887e995a2eSmrg		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
689af69d88dSmrg						       (struct r600_resource*)cb->base.texture,
690af69d88dSmrg						       RADEON_USAGE_READWRITE,
6917e995a2eSmrg						       RADEON_PRIO_SHADER_RW_BUFFER);
692af69d88dSmrg
6937e995a2eSmrg		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
694af69d88dSmrg		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
695af69d88dSmrg		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
696af69d88dSmrg		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
697af69d88dSmrg		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
698af69d88dSmrg		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
699af69d88dSmrg		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
700af69d88dSmrg		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
701af69d88dSmrg
702af69d88dSmrg		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
703af69d88dSmrg		radeon_emit(cs, reloc);
704af69d88dSmrg
705af69d88dSmrg		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
706af69d88dSmrg		radeon_emit(cs, reloc);
707af69d88dSmrg	}
7087e995a2eSmrg	for (; i < 8 ; i++)
7097e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
7107e995a2eSmrg					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
7117e995a2eSmrg	for (; i < 12; i++)
7127e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
7137e995a2eSmrg					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
7147e995a2eSmrg
7157e995a2eSmrg	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
7167e995a2eSmrg	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
7177e995a2eSmrg				       rctx->compute_cb_target_mask);
7187e995a2eSmrg}
7197e995a2eSmrg
7207e995a2eSmrgstatic void compute_emit_cs(struct r600_context *rctx,
7217e995a2eSmrg			    const struct pipe_grid_info *info)
7227e995a2eSmrg{
7231463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
7247e995a2eSmrg	bool compute_dirty = false;
7257e995a2eSmrg	struct r600_pipe_shader *current;
7267e995a2eSmrg	struct r600_shader_atomic combined_atomics[8];
7277e995a2eSmrg	uint8_t atomic_used_mask;
7287e995a2eSmrg	uint32_t indirect_grid[3] = { 0, 0, 0 };
7297e995a2eSmrg
7307e995a2eSmrg	/* make sure that the gfx ring is only one active */
7311463c08dSmrg	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
7327e995a2eSmrg		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
7337e995a2eSmrg	}
7347e995a2eSmrg
7357e995a2eSmrg	r600_update_compressed_resource_state(rctx, true);
7367e995a2eSmrg
7377e995a2eSmrg	if (!rctx->cmd_buf_is_compute) {
7387e995a2eSmrg		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
7397e995a2eSmrg		rctx->cmd_buf_is_compute = true;
7407e995a2eSmrg	}
7417e995a2eSmrg
7421463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
7431463c08dSmrg	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
7441463c08dSmrg		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
7451463c08dSmrg			R600_ERR("Failed to select compute shader\n");
7461463c08dSmrg			return;
7471463c08dSmrg		}
7481463c08dSmrg
7497e995a2eSmrg		current = rctx->cs_shader_state.shader->sel->current;
7507e995a2eSmrg		if (compute_dirty) {
7517e995a2eSmrg			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
7527e995a2eSmrg			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
7537e995a2eSmrg			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
7547e995a2eSmrg		}
7557e995a2eSmrg
7567e995a2eSmrg		bool need_buf_const = current->shader.uses_tex_buffers ||
7577e995a2eSmrg			current->shader.has_txq_cube_array_z_comp;
7587e995a2eSmrg
7597e995a2eSmrg		if (info->indirect) {
7607e995a2eSmrg			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
7611463c08dSmrg			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
7627e995a2eSmrg			unsigned offset = info->indirect_offset / 4;
7637e995a2eSmrg			indirect_grid[0] = data[offset];
7647e995a2eSmrg			indirect_grid[1] = data[offset + 1];
7657e995a2eSmrg			indirect_grid[2] = data[offset + 2];
7667e995a2eSmrg		}
7677e995a2eSmrg		for (int i = 0; i < 3; i++) {
7687e995a2eSmrg			rctx->cs_block_grid_sizes[i] = info->block[i];
7697e995a2eSmrg			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
7707e995a2eSmrg		}
7717e995a2eSmrg		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
7727e995a2eSmrg		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
7737e995a2eSmrg
7747e995a2eSmrg		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
7757e995a2eSmrg		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
7767e995a2eSmrg
7777e995a2eSmrg		if (need_buf_const) {
7787e995a2eSmrg			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
779af69d88dSmrg		}
7807e995a2eSmrg		r600_update_driver_const_buffers(rctx, true);
7817e995a2eSmrg
7827e995a2eSmrg		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
7837e995a2eSmrg		if (atomic_used_mask) {
7847e995a2eSmrg			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
7857e995a2eSmrg			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
786af69d88dSmrg		}
7877e995a2eSmrg	} else
7887e995a2eSmrg		r600_need_cs_space(rctx, 0, true, 0);
7897e995a2eSmrg
7907e995a2eSmrg	/* Initialize all the compute-related registers.
7917e995a2eSmrg	 *
7927e995a2eSmrg	 * See evergreen_init_atom_start_compute_cs() in this file for the list
7937e995a2eSmrg	 * of registers initialized by the start_compute_cs_cmd atom.
7947e995a2eSmrg	 */
7957e995a2eSmrg	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
7967e995a2eSmrg
7977e995a2eSmrg	/* emit config state */
7987e995a2eSmrg	if (rctx->b.chip_class == EVERGREEN) {
7991463c08dSmrg		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
8001463c08dSmrg		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
8017e995a2eSmrg			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
8027e995a2eSmrg			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
8037e995a2eSmrg			radeon_emit(cs, 0);
8047e995a2eSmrg			radeon_emit(cs, 0);
8057e995a2eSmrg			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
8067e995a2eSmrg		} else
8077e995a2eSmrg			r600_emit_atom(rctx, &rctx->config_state.atom);
808af69d88dSmrg	}
809af69d88dSmrg
8107e995a2eSmrg	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
8117e995a2eSmrg	r600_flush_emit(rctx);
812af69d88dSmrg
8131463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
8141463c08dSmrg	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
815af69d88dSmrg
8167e995a2eSmrg		compute_setup_cbs(rctx);
8177e995a2eSmrg
8187e995a2eSmrg		/* Emit vertex buffer state */
8197e995a2eSmrg		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
8207e995a2eSmrg		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
8217e995a2eSmrg	} else {
8227e995a2eSmrg		uint32_t rat_mask;
8237e995a2eSmrg
8247e995a2eSmrg		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
8257e995a2eSmrg		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
8267e995a2eSmrg					       rat_mask);
8277e995a2eSmrg	}
8287e995a2eSmrg
8297e995a2eSmrg	r600_emit_atom(rctx, &rctx->b.render_cond_atom);
830af69d88dSmrg
831af69d88dSmrg	/* Emit constant buffer state */
8327e995a2eSmrg	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
8337e995a2eSmrg
8347e995a2eSmrg	/* Emit sampler state */
8357e995a2eSmrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
8367e995a2eSmrg
8377e995a2eSmrg	/* Emit sampler view (texture resource) state */
8387e995a2eSmrg	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
8397e995a2eSmrg
8407e995a2eSmrg	/* Emit images state */
8417e995a2eSmrg	r600_emit_atom(rctx, &rctx->compute_images.atom);
8427e995a2eSmrg
8437e995a2eSmrg	/* Emit buffers state */
8447e995a2eSmrg	r600_emit_atom(rctx, &rctx->compute_buffers.atom);
845af69d88dSmrg
8467e995a2eSmrg	/* Emit shader state */
8477e995a2eSmrg	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
848af69d88dSmrg
849af69d88dSmrg	/* Emit dispatch state and dispatch packet */
8507e995a2eSmrg	evergreen_emit_dispatch(rctx, info, indirect_grid);
851af69d88dSmrg
852af69d88dSmrg	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
853af69d88dSmrg	 */
8547e995a2eSmrg	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
855af69d88dSmrg		      R600_CONTEXT_INV_VERTEX_CACHE |
856af69d88dSmrg	              R600_CONTEXT_INV_TEX_CACHE;
8577e995a2eSmrg	r600_flush_emit(rctx);
8587e995a2eSmrg	rctx->b.flags = 0;
859af69d88dSmrg
8607e995a2eSmrg	if (rctx->b.chip_class >= CAYMAN) {
8617e995a2eSmrg		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
8627e995a2eSmrg		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
863af69d88dSmrg		/* DEALLOC_STATE prevents the GPU from hanging when a
864af69d88dSmrg		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
865af69d88dSmrg		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
866af69d88dSmrg		 */
8677e995a2eSmrg		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
8687e995a2eSmrg		radeon_emit(cs, 0);
869af69d88dSmrg	}
8701463c08dSmrg	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
8711463c08dSmrg	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
8727e995a2eSmrg		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
873af69d88dSmrg
874af69d88dSmrg#if 0
8757e995a2eSmrg	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
876af69d88dSmrg	for (i = 0; i < cs->cdw; i++) {
8777e995a2eSmrg		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
878af69d88dSmrg	}
879af69d88dSmrg#endif
880af69d88dSmrg
881af69d88dSmrg}
882af69d88dSmrg
883af69d88dSmrg
884af69d88dSmrg/**
885af69d88dSmrg * Emit function for r600_cs_shader_state atom
886af69d88dSmrg */
8877e995a2eSmrgvoid evergreen_emit_cs_shader(struct r600_context *rctx,
8887e995a2eSmrg			      struct r600_atom *atom)
889af69d88dSmrg{
890af69d88dSmrg	struct r600_cs_shader_state *state =
891af69d88dSmrg					(struct r600_cs_shader_state*)atom;
892af69d88dSmrg	struct r600_pipe_compute *shader = state->shader;
8931463c08dSmrg	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
8947e995a2eSmrg	uint64_t va;
8957e995a2eSmrg	struct r600_resource *code_bo;
8967e995a2eSmrg	unsigned ngpr, nstack;
8977e995a2eSmrg
8981463c08dSmrg	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
8991463c08dSmrg	    shader->ir_type == PIPE_SHADER_IR_NIR) {
9007e995a2eSmrg		code_bo = shader->sel->current->bo;
9017e995a2eSmrg		va = shader->sel->current->bo->gpu_address;
9027e995a2eSmrg		ngpr = shader->sel->current->shader.bc.ngpr;
9037e995a2eSmrg		nstack = shader->sel->current->shader.bc.nstack;
9047e995a2eSmrg	} else {
9057e995a2eSmrg		code_bo = shader->code_bo;
9067e995a2eSmrg		va = shader->code_bo->gpu_address + state->pc;
9077e995a2eSmrg		ngpr = shader->bc.ngpr;
9087e995a2eSmrg		nstack = shader->bc.nstack;
9097e995a2eSmrg	}
910af69d88dSmrg
9117e995a2eSmrg	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
9127e995a2eSmrg	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
913af69d88dSmrg	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
9147e995a2eSmrg			S_0288D4_NUM_GPRS(ngpr) |
9157e995a2eSmrg			S_0288D4_DX10_CLAMP(1) |
9167e995a2eSmrg			S_0288D4_STACK_SIZE(nstack));
917af69d88dSmrg	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
918af69d88dSmrg
919af69d88dSmrg	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
9207e995a2eSmrg	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
9217e995a2eSmrg					      code_bo, RADEON_USAGE_READ,
9227e995a2eSmrg					      RADEON_PRIO_SHADER_BINARY));
923af69d88dSmrg}
924af69d88dSmrg
9257e995a2eSmrgstatic void evergreen_launch_grid(struct pipe_context *ctx,
9267e995a2eSmrg				  const struct pipe_grid_info *info)
927af69d88dSmrg{
9287e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
929af69d88dSmrg#ifdef HAVE_OPENCL
9307e995a2eSmrg	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
9317e995a2eSmrg	boolean use_kill;
932af69d88dSmrg
9331463c08dSmrg	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
9341463c08dSmrg	    shader->ir_type != PIPE_SHADER_IR_NIR) {
9357e995a2eSmrg		rctx->cs_shader_state.pc = info->pc;
9367e995a2eSmrg		/* Get the config information for this kernel. */
9377e995a2eSmrg		r600_shader_binary_read_config(&shader->binary, &shader->bc,
9387e995a2eSmrg					       info->pc, &use_kill);
9397e995a2eSmrg	} else {
9407e995a2eSmrg		use_kill = false;
9417e995a2eSmrg		rctx->cs_shader_state.pc = 0;
942af69d88dSmrg	}
943af69d88dSmrg#endif
9447e995a2eSmrg
9457e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
9467e995a2eSmrg
9477e995a2eSmrg
9487e995a2eSmrg	evergreen_compute_upload_input(ctx, info);
9497e995a2eSmrg	compute_emit_cs(rctx, info);
950af69d88dSmrg}
951af69d88dSmrg
9527e995a2eSmrgstatic void evergreen_set_compute_resources(struct pipe_context *ctx,
9537e995a2eSmrg					    unsigned start, unsigned count,
9547e995a2eSmrg					    struct pipe_surface **surfaces)
955af69d88dSmrg{
9567e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
957af69d88dSmrg	struct r600_surface **resources = (struct r600_surface **)surfaces;
958af69d88dSmrg
9597e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
960af69d88dSmrg			start, count);
961af69d88dSmrg
962af69d88dSmrg	for (unsigned i = 0; i < count; i++) {
9637e995a2eSmrg		/* The First four vertex buffers are reserved for parameters and
964af69d88dSmrg		 * global buffers. */
9657e995a2eSmrg		unsigned vtx_id = 4 + i;
966af69d88dSmrg		if (resources[i]) {
967af69d88dSmrg			struct r600_resource_global *buffer =
968af69d88dSmrg				(struct r600_resource_global*)
969af69d88dSmrg				resources[i]->base.texture;
970af69d88dSmrg			if (resources[i]->base.writable) {
971af69d88dSmrg				assert(i+1 < 12);
972af69d88dSmrg
9737e995a2eSmrg				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
974af69d88dSmrg				(struct r600_resource *)resources[i]->base.texture,
975af69d88dSmrg				buffer->chunk->start_in_dw*4,
976af69d88dSmrg				resources[i]->base.texture->width0);
977af69d88dSmrg			}
978af69d88dSmrg
9797e995a2eSmrg			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
980af69d88dSmrg					buffer->chunk->start_in_dw * 4,
981af69d88dSmrg					resources[i]->base.texture);
982af69d88dSmrg		}
983af69d88dSmrg	}
984af69d88dSmrg}
985af69d88dSmrg
9867e995a2eSmrgstatic void evergreen_set_global_binding(struct pipe_context *ctx,
9877e995a2eSmrg					 unsigned first, unsigned n,
9887e995a2eSmrg					 struct pipe_resource **resources,
9897e995a2eSmrg					 uint32_t **handles)
990af69d88dSmrg{
9917e995a2eSmrg	struct r600_context *rctx = (struct r600_context *)ctx;
9927e995a2eSmrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
993af69d88dSmrg	struct r600_resource_global **buffers =
994af69d88dSmrg		(struct r600_resource_global **)resources;
995af69d88dSmrg	unsigned i;
996af69d88dSmrg
9977e995a2eSmrg	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
998af69d88dSmrg			first, n);
999af69d88dSmrg
1000af69d88dSmrg	if (!resources) {
1001af69d88dSmrg		/* XXX: Unset */
1002af69d88dSmrg		return;
1003af69d88dSmrg	}
1004af69d88dSmrg
1005af69d88dSmrg	/* We mark these items for promotion to the pool if they
1006af69d88dSmrg	 * aren't already there */
1007af69d88dSmrg	for (i = first; i < first + n; i++) {
1008af69d88dSmrg		struct compute_memory_item *item = buffers[i]->chunk;
1009af69d88dSmrg
1010af69d88dSmrg		if (!is_item_in_pool(item))
1011af69d88dSmrg			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
1012af69d88dSmrg	}
1013af69d88dSmrg
10147e995a2eSmrg	if (compute_memory_finalize_pending(pool, ctx) == -1) {
1015af69d88dSmrg		/* XXX: Unset */
1016af69d88dSmrg		return;
1017af69d88dSmrg	}
1018af69d88dSmrg
1019af69d88dSmrg	for (i = first; i < first + n; i++)
1020af69d88dSmrg	{
1021af69d88dSmrg		uint32_t buffer_offset;
1022af69d88dSmrg		uint32_t handle;
1023af69d88dSmrg		assert(resources[i]->target == PIPE_BUFFER);
1024af69d88dSmrg		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1025af69d88dSmrg
1026af69d88dSmrg		buffer_offset = util_le32_to_cpu(*(handles[i]));
1027af69d88dSmrg		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1028af69d88dSmrg
1029af69d88dSmrg		*(handles[i]) = util_cpu_to_le32(handle);
1030af69d88dSmrg	}
1031af69d88dSmrg
10327e995a2eSmrg	/* globals for writing */
10337e995a2eSmrg	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
10347e995a2eSmrg	/* globals for reading */
10357e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1036af69d88dSmrg				(struct pipe_resource*)pool->bo);
10377e995a2eSmrg
10387e995a2eSmrg	/* constants for reading, LLVM puts them in text segment */
10397e995a2eSmrg	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
10407e995a2eSmrg				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1041af69d88dSmrg}
1042af69d88dSmrg
1043af69d88dSmrg/**
1044af69d88dSmrg * This function initializes all the compute specific registers that need to
1045af69d88dSmrg * be initialized for each compute command stream.  Registers that are common
1046af69d88dSmrg * to both compute and 3D will be initialized at the beginning of each compute
1047af69d88dSmrg * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1048af69d88dSmrg * packet requires that the shader type bit be set, we must initialize all
1049af69d88dSmrg * context registers needed for compute in this function.  The registers
10507e995a2eSmrg * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1051af69d88dSmrg * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1052af69d88dSmrg * on the GPU family.
1053af69d88dSmrg */
10547e995a2eSmrgvoid evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1055af69d88dSmrg{
10567e995a2eSmrg	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1057af69d88dSmrg	int num_threads;
1058af69d88dSmrg	int num_stack_entries;
1059af69d88dSmrg
10607e995a2eSmrg	/* since all required registers are initialized in the
1061af69d88dSmrg	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1062af69d88dSmrg	 */
1063af69d88dSmrg	r600_init_command_buffer(cb, 256);
1064af69d88dSmrg	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1065af69d88dSmrg
1066af69d88dSmrg	/* We're setting config registers here. */
1067af69d88dSmrg	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1068af69d88dSmrg	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1069af69d88dSmrg
10707e995a2eSmrg	switch (rctx->b.family) {
1071af69d88dSmrg	case CHIP_CEDAR:
1072af69d88dSmrg	default:
1073af69d88dSmrg		num_threads = 128;
1074af69d88dSmrg		num_stack_entries = 256;
1075af69d88dSmrg		break;
1076af69d88dSmrg	case CHIP_REDWOOD:
1077af69d88dSmrg		num_threads = 128;
1078af69d88dSmrg		num_stack_entries = 256;
1079af69d88dSmrg		break;
1080af69d88dSmrg	case CHIP_JUNIPER:
1081af69d88dSmrg		num_threads = 128;
1082af69d88dSmrg		num_stack_entries = 512;
1083af69d88dSmrg		break;
1084af69d88dSmrg	case CHIP_CYPRESS:
1085af69d88dSmrg	case CHIP_HEMLOCK:
1086af69d88dSmrg		num_threads = 128;
1087af69d88dSmrg		num_stack_entries = 512;
1088af69d88dSmrg		break;
1089af69d88dSmrg	case CHIP_PALM:
1090af69d88dSmrg		num_threads = 128;
1091af69d88dSmrg		num_stack_entries = 256;
1092af69d88dSmrg		break;
1093af69d88dSmrg	case CHIP_SUMO:
1094af69d88dSmrg		num_threads = 128;
1095af69d88dSmrg		num_stack_entries = 256;
1096af69d88dSmrg		break;
1097af69d88dSmrg	case CHIP_SUMO2:
1098af69d88dSmrg		num_threads = 128;
1099af69d88dSmrg		num_stack_entries = 512;
1100af69d88dSmrg		break;
1101af69d88dSmrg	case CHIP_BARTS:
1102af69d88dSmrg		num_threads = 128;
1103af69d88dSmrg		num_stack_entries = 512;
1104af69d88dSmrg		break;
1105af69d88dSmrg	case CHIP_TURKS:
1106af69d88dSmrg		num_threads = 128;
1107af69d88dSmrg		num_stack_entries = 256;
1108af69d88dSmrg		break;
1109af69d88dSmrg	case CHIP_CAICOS:
1110af69d88dSmrg		num_threads = 128;
1111af69d88dSmrg		num_stack_entries = 256;
1112af69d88dSmrg		break;
1113af69d88dSmrg	}
1114af69d88dSmrg
1115af69d88dSmrg	/* The primitive type always needs to be POINTLIST for compute. */
1116af69d88dSmrg	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1117af69d88dSmrg						V_008958_DI_PT_POINTLIST);
1118af69d88dSmrg
11197e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1120af69d88dSmrg
1121af69d88dSmrg		/* These registers control which simds can be used by each stage.
1122af69d88dSmrg		 * The default for these registers is 0xffffffff, which means
1123af69d88dSmrg		 * all simds are available for each stage.  It's possible we may
1124af69d88dSmrg		 * want to play around with these in the future, but for now
1125af69d88dSmrg		 * the default value is fine.
1126af69d88dSmrg		 *
1127af69d88dSmrg		 * R_008E20_SQ_STATIC_THREAD_MGMT1
1128af69d88dSmrg		 * R_008E24_SQ_STATIC_THREAD_MGMT2
1129af69d88dSmrg		 * R_008E28_SQ_STATIC_THREAD_MGMT3
1130af69d88dSmrg		 */
1131af69d88dSmrg
11327e995a2eSmrg		/* XXX: We may need to adjust the thread and stack resource
1133af69d88dSmrg		 * values for 3D/compute interop */
1134af69d88dSmrg
1135af69d88dSmrg		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1136af69d88dSmrg
1137af69d88dSmrg		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1138af69d88dSmrg		 * Set the number of threads used by the PS/VS/GS/ES stage to
1139af69d88dSmrg		 * 0.
1140af69d88dSmrg		 */
1141af69d88dSmrg		r600_store_value(cb, 0);
1142af69d88dSmrg
1143af69d88dSmrg		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1144af69d88dSmrg		 * Set the number of threads used by the CS (aka LS) stage to
1145af69d88dSmrg		 * the maximum number of threads and set the number of threads
1146af69d88dSmrg		 * for the HS stage to 0. */
1147af69d88dSmrg		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1148af69d88dSmrg
1149af69d88dSmrg		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1150af69d88dSmrg		 * Set the Control Flow stack entries to 0 for PS/VS stages */
1151af69d88dSmrg		r600_store_value(cb, 0);
1152af69d88dSmrg
1153af69d88dSmrg		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1154af69d88dSmrg		 * Set the Control Flow stack entries to 0 for GS/ES stages */
1155af69d88dSmrg		r600_store_value(cb, 0);
1156af69d88dSmrg
1157af69d88dSmrg		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1158af69d88dSmrg		 * Set the Contol Flow stack entries to 0 for the HS stage, and
1159af69d88dSmrg		 * set it to the maximum value for the CS (aka LS) stage. */
1160af69d88dSmrg		r600_store_value(cb,
1161af69d88dSmrg			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1162af69d88dSmrg	}
1163af69d88dSmrg	/* Give the compute shader all the available LDS space.
1164af69d88dSmrg	 * NOTE: This only sets the maximum number of dwords that a compute
1165af69d88dSmrg	 * shader can allocate.  When a shader is executed, we still need to
1166af69d88dSmrg	 * allocate the appropriate amount of LDS dwords using the
1167af69d88dSmrg	 * CM_R_0288E8_SQ_LDS_ALLOC register.
1168af69d88dSmrg	 */
11697e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1170af69d88dSmrg		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1171af69d88dSmrg			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1172af69d88dSmrg	} else {
1173af69d88dSmrg		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1174af69d88dSmrg			S_0286FC_NUM_PS_LDS(0) |
1175af69d88dSmrg			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1176af69d88dSmrg	}
1177af69d88dSmrg
1178af69d88dSmrg	/* Context Registers */
1179af69d88dSmrg
11807e995a2eSmrg	if (rctx->b.chip_class < CAYMAN) {
1181af69d88dSmrg		/* workaround for hw issues with dyn gpr - must set all limits
1182af69d88dSmrg		 * to 240 instead of 0, 0x1e == 240 / 8
1183af69d88dSmrg		 */
1184af69d88dSmrg		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1185af69d88dSmrg				S_028838_PS_GPRS(0x1e) |
1186af69d88dSmrg				S_028838_VS_GPRS(0x1e) |
1187af69d88dSmrg				S_028838_GS_GPRS(0x1e) |
1188af69d88dSmrg				S_028838_ES_GPRS(0x1e) |
1189af69d88dSmrg				S_028838_HS_GPRS(0x1e) |
1190af69d88dSmrg				S_028838_LS_GPRS(0x1e));
1191af69d88dSmrg	}
1192af69d88dSmrg
1193af69d88dSmrg	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1194af69d88dSmrg	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1195af69d88dSmrg		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1196af69d88dSmrg
1197af69d88dSmrg	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1198af69d88dSmrg
1199af69d88dSmrg	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
12007e995a2eSmrg			       S_0286E8_TID_IN_GROUP_ENA(1) |
12017e995a2eSmrg			       S_0286E8_TGID_ENA(1) |
12027e995a2eSmrg			       S_0286E8_DISABLE_INDEX_PACK(1));
1203af69d88dSmrg
1204af69d88dSmrg	/* The LOOP_CONST registers are an optimizations for loops that allows
1205af69d88dSmrg	 * you to store the initial counter, increment value, and maximum
1206af69d88dSmrg	 * counter value in a register so that hardware can calculate the
1207af69d88dSmrg	 * correct number of iterations for the loop, so that you don't need
1208af69d88dSmrg	 * to have the loop counter in your shader code.  We don't currently use
1209af69d88dSmrg	 * this optimization, so we must keep track of the counter in the
1210af69d88dSmrg	 * shader and use a break instruction to exit loops.  However, the
1211af69d88dSmrg	 * hardware will still uses this register to determine when to exit a
1212af69d88dSmrg	 * loop, so we need to initialize the counter to 0, set the increment
1213af69d88dSmrg	 * value to 1 and the maximum counter value to the 4095 (0xfff) which
1214af69d88dSmrg	 * is the maximum value allowed.  This gives us a maximum of 4096
1215af69d88dSmrg	 * iterations for our loops, but hopefully our break instruction will
1216af69d88dSmrg	 * execute before some time before the 4096th iteration.
1217af69d88dSmrg	 */
1218af69d88dSmrg	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1219af69d88dSmrg}
1220af69d88dSmrg
12217e995a2eSmrgvoid evergreen_init_compute_state_functions(struct r600_context *rctx)
1222af69d88dSmrg{
12237e995a2eSmrg	rctx->b.b.create_compute_state = evergreen_create_compute_state;
12247e995a2eSmrg	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
12257e995a2eSmrg	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
12267e995a2eSmrg//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
12277e995a2eSmrg	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
12287e995a2eSmrg	rctx->b.b.set_global_binding = evergreen_set_global_binding;
12297e995a2eSmrg	rctx->b.b.launch_grid = evergreen_launch_grid;
1230af69d88dSmrg
1231af69d88dSmrg}
1232af69d88dSmrg
12331463c08dSmrgvoid *r600_compute_global_transfer_map(struct pipe_context *ctx,
12341463c08dSmrg				      struct pipe_resource *resource,
12351463c08dSmrg				      unsigned level,
12361463c08dSmrg				      unsigned usage,
12371463c08dSmrg				      const struct pipe_box *box,
12381463c08dSmrg				      struct pipe_transfer **ptransfer)
1239af69d88dSmrg{
12407e995a2eSmrg	struct r600_context *rctx = (struct r600_context*)ctx;
1241af69d88dSmrg	struct compute_memory_pool *pool = rctx->screen->global_pool;
1242af69d88dSmrg	struct r600_resource_global* buffer =
1243af69d88dSmrg		(struct r600_resource_global*)resource;
1244af69d88dSmrg
1245af69d88dSmrg	struct compute_memory_item *item = buffer->chunk;
1246af69d88dSmrg	struct pipe_resource *dst = NULL;
1247af69d88dSmrg	unsigned offset = box->x;
1248af69d88dSmrg
1249af69d88dSmrg	if (is_item_in_pool(item)) {
12507e995a2eSmrg		compute_memory_demote_item(pool, item, ctx);
1251af69d88dSmrg	}
1252af69d88dSmrg	else {
1253af69d88dSmrg		if (item->real_buffer == NULL) {
12547e995a2eSmrg			item->real_buffer =
1255af69d88dSmrg					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1256af69d88dSmrg		}
1257af69d88dSmrg	}
1258af69d88dSmrg
1259af69d88dSmrg	dst = (struct pipe_resource*)item->real_buffer;
1260af69d88dSmrg
12611463c08dSmrg	if (usage & PIPE_MAP_READ)
1262af69d88dSmrg		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1263af69d88dSmrg
1264af69d88dSmrg	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1265af69d88dSmrg			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1266af69d88dSmrg			"width = %u, height = %u, depth = %u)\n", level, usage,
1267af69d88dSmrg			box->x, box->y, box->z, box->width, box->height,
1268af69d88dSmrg			box->depth);
1269af69d88dSmrg	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1270af69d88dSmrg		"%u (box.x)\n", item->id, box->x);
1271af69d88dSmrg
1272af69d88dSmrg
1273af69d88dSmrg	assert(resource->target == PIPE_BUFFER);
1274af69d88dSmrg	assert(resource->bind & PIPE_BIND_GLOBAL);
1275af69d88dSmrg	assert(box->x >= 0);
1276af69d88dSmrg	assert(box->y == 0);
1277af69d88dSmrg	assert(box->z == 0);
1278af69d88dSmrg
1279af69d88dSmrg	///TODO: do it better, mapping is not possible if the pool is too big
12807e995a2eSmrg	return pipe_buffer_map_range(ctx, dst,
1281af69d88dSmrg			offset, box->width, usage, ptransfer);
1282af69d88dSmrg}
1283af69d88dSmrg
12841463c08dSmrgvoid r600_compute_global_transfer_unmap(struct pipe_context *ctx,
12851463c08dSmrg					struct pipe_transfer *transfer)
1286af69d88dSmrg{
1287af69d88dSmrg	/* struct r600_resource_global are not real resources, they just map
1288af69d88dSmrg	 * to an offset within the compute memory pool.  The function
1289af69d88dSmrg	 * r600_compute_global_transfer_map() maps the memory pool
1290af69d88dSmrg	 * resource rather than the struct r600_resource_global passed to
12911463c08dSmrg	 * it as an argument and then initializes ptransfer->resource with
1292af69d88dSmrg	 * the memory pool resource (via pipe_buffer_map_range).
1293af69d88dSmrg	 * When transfer_unmap is called it uses the memory pool's
1294af69d88dSmrg	 * vtable which calls r600_buffer_transfer_map() rather than
1295af69d88dSmrg	 * this function.
1296af69d88dSmrg	 */
1297af69d88dSmrg	assert (!"This function should not be called");
1298af69d88dSmrg}
1299af69d88dSmrg
13001463c08dSmrgvoid r600_compute_global_buffer_destroy(struct pipe_screen *screen,
13011463c08dSmrg					struct pipe_resource *res)
1302af69d88dSmrg{
13037e995a2eSmrg	struct r600_resource_global* buffer = NULL;
13047e995a2eSmrg	struct r600_screen* rscreen = NULL;
13057e995a2eSmrg
13067e995a2eSmrg	assert(res->target == PIPE_BUFFER);
13077e995a2eSmrg	assert(res->bind & PIPE_BIND_GLOBAL);
13087e995a2eSmrg
13097e995a2eSmrg	buffer = (struct r600_resource_global*)res;
13107e995a2eSmrg	rscreen = (struct r600_screen*)screen;
13117e995a2eSmrg
13127e995a2eSmrg	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
13137e995a2eSmrg
13147e995a2eSmrg	buffer->chunk = NULL;
13157e995a2eSmrg	free(res);
13167e995a2eSmrg}
13177e995a2eSmrg
13187e995a2eSmrgstruct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
13197e995a2eSmrg							const struct pipe_resource *templ)
13207e995a2eSmrg{
13217e995a2eSmrg	struct r600_resource_global* result = NULL;
13227e995a2eSmrg	struct r600_screen* rscreen = NULL;
13237e995a2eSmrg	int size_in_dw = 0;
13247e995a2eSmrg
13257e995a2eSmrg	assert(templ->target == PIPE_BUFFER);
13267e995a2eSmrg	assert(templ->bind & PIPE_BIND_GLOBAL);
13277e995a2eSmrg	assert(templ->array_size == 1 || templ->array_size == 0);
13287e995a2eSmrg	assert(templ->depth0 == 1 || templ->depth0 == 0);
13297e995a2eSmrg	assert(templ->height0 == 1 || templ->height0 == 0);
13307e995a2eSmrg
13317e995a2eSmrg	result = (struct r600_resource_global*)
13327e995a2eSmrg	CALLOC(sizeof(struct r600_resource_global), 1);
13337e995a2eSmrg	rscreen = (struct r600_screen*)screen;
13347e995a2eSmrg
13357e995a2eSmrg	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
13367e995a2eSmrg	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
13377e995a2eSmrg			templ->array_size);
13387e995a2eSmrg
13397e995a2eSmrg	result->base.b.b = *templ;
13407e995a2eSmrg	result->base.b.b.screen = screen;
13411463c08dSmrg	result->base.compute_global_bo = true;
13427e995a2eSmrg	pipe_reference_init(&result->base.b.b.reference, 1);
13437e995a2eSmrg
13447e995a2eSmrg	size_in_dw = (templ->width0+3) / 4;
13457e995a2eSmrg
13467e995a2eSmrg	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
13477e995a2eSmrg
13487e995a2eSmrg	if (result->chunk == NULL)
13497e995a2eSmrg	{
13507e995a2eSmrg		free(result);
13517e995a2eSmrg		return NULL;
13527e995a2eSmrg	}
13537e995a2eSmrg
13547e995a2eSmrg	return &result->base.b.b;
1355af69d88dSmrg}
1356