/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources, that VTX2...
would consume 65af69d88dSmrg 66af69d88dSmrgCONST0 and VTX0 is for parameters 67af69d88dSmrg CONST0 is binding smaller input parameter buffer, and for constant indexing, 68af69d88dSmrg also constant cached 69af69d88dSmrg VTX0 is for indirect/non-constant indexing, or if the input is bigger than 70af69d88dSmrg the constant cache can handle 71af69d88dSmrg 72af69d88dSmrgRAT-s are limited to 12, so we can only bind at most 11 texture for writing 73af69d88dSmrgbecause we reserve RAT0 for global bindings. With byteaddressing enabled, 74af69d88dSmrgwe should reserve another one too.=> 10 image binding for writing max. 75af69d88dSmrg 76af69d88dSmrgfrom Nvidia OpenCL: 77af69d88dSmrg CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 78af69d88dSmrg CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 79af69d88dSmrg 80af69d88dSmrgso 10 for writing is enough. 176 is the max for reading according to the docs 81af69d88dSmrg 82af69d88dSmrgwritable images should be listed first < 10, so their id corresponds to RAT(id+1) 83af69d88dSmrgwritable images will consume TEX slots, VTX slots too because of linear indexing 84af69d88dSmrg 85af69d88dSmrg*/ 86af69d88dSmrg 871463c08dSmrg#ifdef HAVE_OPENCL 881463c08dSmrgstatic void radeon_shader_binary_init(struct r600_shader_binary *b) 891463c08dSmrg{ 901463c08dSmrg memset(b, 0, sizeof(*b)); 911463c08dSmrg} 921463c08dSmrg 931463c08dSmrgstatic void radeon_shader_binary_clean(struct r600_shader_binary *b) 941463c08dSmrg{ 951463c08dSmrg if (!b) 961463c08dSmrg return; 971463c08dSmrg FREE(b->code); 981463c08dSmrg FREE(b->config); 991463c08dSmrg FREE(b->rodata); 1001463c08dSmrg FREE(b->global_symbol_offsets); 1011463c08dSmrg FREE(b->relocs); 1021463c08dSmrg FREE(b->disasm_string); 1031463c08dSmrg} 1041463c08dSmrg#endif 1051463c08dSmrg 1067e995a2eSmrgstruct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen, 1077e995a2eSmrg unsigned size) 108af69d88dSmrg{ 1097e995a2eSmrg struct pipe_resource *buffer = NULL; 110af69d88dSmrg assert(size); 111af69d88dSmrg 
1127e995a2eSmrg buffer = pipe_buffer_create((struct pipe_screen*) screen, 1137e995a2eSmrg 0, PIPE_USAGE_IMMUTABLE, size); 114af69d88dSmrg 115af69d88dSmrg return (struct r600_resource *)buffer; 116af69d88dSmrg} 117af69d88dSmrg 118af69d88dSmrg 1197e995a2eSmrgstatic void evergreen_set_rat(struct r600_pipe_compute *pipe, 1207e995a2eSmrg unsigned id, 1217e995a2eSmrg struct r600_resource *bo, 1227e995a2eSmrg int start, 1237e995a2eSmrg int size) 124af69d88dSmrg{ 125af69d88dSmrg struct pipe_surface rat_templ; 126af69d88dSmrg struct r600_surface *surf = NULL; 127af69d88dSmrg struct r600_context *rctx = NULL; 128af69d88dSmrg 129af69d88dSmrg assert(id < 12); 130af69d88dSmrg assert((size & 3) == 0); 131af69d88dSmrg assert((start & 0xFF) == 0); 132af69d88dSmrg 133af69d88dSmrg rctx = pipe->ctx; 134af69d88dSmrg 135af69d88dSmrg COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id); 136af69d88dSmrg 137af69d88dSmrg /* Create the RAT surface */ 138af69d88dSmrg memset(&rat_templ, 0, sizeof(rat_templ)); 139af69d88dSmrg rat_templ.format = PIPE_FORMAT_R32_UINT; 140af69d88dSmrg rat_templ.u.tex.level = 0; 141af69d88dSmrg rat_templ.u.tex.first_layer = 0; 142af69d88dSmrg rat_templ.u.tex.last_layer = 0; 143af69d88dSmrg 1447e995a2eSmrg /* Add the RAT the list of color buffers. Drop the old buffer first. */ 1457e995a2eSmrg pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL); 146af69d88dSmrg pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface( 147af69d88dSmrg (struct pipe_context *)pipe->ctx, 148af69d88dSmrg (struct pipe_resource *)bo, &rat_templ); 149af69d88dSmrg 150af69d88dSmrg /* Update the number of color buffers */ 151af69d88dSmrg pipe->ctx->framebuffer.state.nr_cbufs = 152af69d88dSmrg MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs); 153af69d88dSmrg 154af69d88dSmrg /* Update the cb_target_mask 155af69d88dSmrg * XXX: I think this is a potential spot for bugs once we start doing 156af69d88dSmrg * GL interop. 
cb_target_mask may be modified in the 3D sections 157af69d88dSmrg * of this driver. */ 158af69d88dSmrg pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4)); 159af69d88dSmrg 160af69d88dSmrg surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id]; 161af69d88dSmrg evergreen_init_color_surface_rat(rctx, surf); 162af69d88dSmrg} 163af69d88dSmrg 1647e995a2eSmrgstatic void evergreen_cs_set_vertex_buffer(struct r600_context *rctx, 1657e995a2eSmrg unsigned vb_index, 1667e995a2eSmrg unsigned offset, 1677e995a2eSmrg struct pipe_resource *buffer) 168af69d88dSmrg{ 169af69d88dSmrg struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state; 170af69d88dSmrg struct pipe_vertex_buffer *vb = &state->vb[vb_index]; 171af69d88dSmrg vb->stride = 1; 172af69d88dSmrg vb->buffer_offset = offset; 1737e995a2eSmrg vb->buffer.resource = buffer; 1747e995a2eSmrg vb->is_user_buffer = false; 175af69d88dSmrg 176af69d88dSmrg /* The vertex instructions in the compute shaders use the texture cache, 177af69d88dSmrg * so we need to invalidate it. 
*/ 178af69d88dSmrg rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; 179af69d88dSmrg state->enabled_mask |= 1 << vb_index; 180af69d88dSmrg state->dirty_mask |= 1 << vb_index; 1817e995a2eSmrg r600_mark_atom_dirty(rctx, &state->atom); 182af69d88dSmrg} 183af69d88dSmrg 1847e995a2eSmrgstatic void evergreen_cs_set_constant_buffer(struct r600_context *rctx, 1857e995a2eSmrg unsigned cb_index, 1867e995a2eSmrg unsigned offset, 1877e995a2eSmrg unsigned size, 1887e995a2eSmrg struct pipe_resource *buffer) 189af69d88dSmrg{ 190af69d88dSmrg struct pipe_constant_buffer cb; 191af69d88dSmrg cb.buffer_size = size; 192af69d88dSmrg cb.buffer_offset = offset; 193af69d88dSmrg cb.buffer = buffer; 194af69d88dSmrg cb.user_buffer = NULL; 195af69d88dSmrg 1961463c08dSmrg rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb); 197af69d88dSmrg} 198af69d88dSmrg 1997e995a2eSmrg/* We need to define these R600 registers here, because we can't include 2007e995a2eSmrg * evergreend.h and r600d.h. 2017e995a2eSmrg */ 2027e995a2eSmrg#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 2037e995a2eSmrg#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 2047e995a2eSmrg 2057e995a2eSmrg#ifdef HAVE_OPENCL 2067e995a2eSmrgstatic void parse_symbol_table(Elf_Data *symbol_table_data, 2077e995a2eSmrg const GElf_Shdr *symbol_table_header, 2081463c08dSmrg struct r600_shader_binary *binary) 209af69d88dSmrg{ 2107e995a2eSmrg GElf_Sym symbol; 2117e995a2eSmrg unsigned i = 0; 2127e995a2eSmrg unsigned symbol_count = 2137e995a2eSmrg symbol_table_header->sh_size / symbol_table_header->sh_entsize; 2147e995a2eSmrg 2157e995a2eSmrg /* We are over allocating this list, because symbol_count gives the 2167e995a2eSmrg * total number of symbols, and we will only be filling the list 2177e995a2eSmrg * with offsets of global symbols. 
The memory savings from 2187e995a2eSmrg * allocating the correct size of this list will be small, and 2197e995a2eSmrg * I don't think it is worth the cost of pre-computing the number 2207e995a2eSmrg * of global symbols. 2217e995a2eSmrg */ 2227e995a2eSmrg binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t)); 2237e995a2eSmrg 2247e995a2eSmrg while (gelf_getsym(symbol_table_data, i++, &symbol)) { 2257e995a2eSmrg unsigned i; 2267e995a2eSmrg if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL || 2277e995a2eSmrg symbol.st_shndx == 0 /* Undefined symbol */) { 2287e995a2eSmrg continue; 2297e995a2eSmrg } 230af69d88dSmrg 2317e995a2eSmrg binary->global_symbol_offsets[binary->global_symbol_count] = 2327e995a2eSmrg symbol.st_value; 233af69d88dSmrg 2347e995a2eSmrg /* Sort the list using bubble sort. This list will usually 2357e995a2eSmrg * be small. */ 2367e995a2eSmrg for (i = binary->global_symbol_count; i > 0; --i) { 2377e995a2eSmrg uint64_t lhs = binary->global_symbol_offsets[i - 1]; 2387e995a2eSmrg uint64_t rhs = binary->global_symbol_offsets[i]; 2397e995a2eSmrg if (lhs < rhs) { 2407e995a2eSmrg break; 2417e995a2eSmrg } 2427e995a2eSmrg binary->global_symbol_offsets[i] = lhs; 2437e995a2eSmrg binary->global_symbol_offsets[i - 1] = rhs; 2447e995a2eSmrg } 2457e995a2eSmrg ++binary->global_symbol_count; 2467e995a2eSmrg } 2477e995a2eSmrg} 2487e995a2eSmrg 2497e995a2eSmrg 2507e995a2eSmrgstatic void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, 2517e995a2eSmrg unsigned symbol_sh_link, 2521463c08dSmrg struct r600_shader_binary *binary) 253af69d88dSmrg{ 2547e995a2eSmrg unsigned i; 255af69d88dSmrg 2567e995a2eSmrg if (!relocs || !symbols || !binary->reloc_count) { 2577e995a2eSmrg return; 2587e995a2eSmrg } 2597e995a2eSmrg binary->relocs = CALLOC(binary->reloc_count, 2601463c08dSmrg sizeof(struct r600_shader_reloc)); 2617e995a2eSmrg for (i = 0; i < binary->reloc_count; i++) { 2627e995a2eSmrg GElf_Sym symbol; 2637e995a2eSmrg GElf_Rel rel; 2647e995a2eSmrg char 
*symbol_name; 2651463c08dSmrg struct r600_shader_reloc *reloc = &binary->relocs[i]; 2667e995a2eSmrg 2677e995a2eSmrg gelf_getrel(relocs, i, &rel); 2687e995a2eSmrg gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol); 2697e995a2eSmrg symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name); 2707e995a2eSmrg 2717e995a2eSmrg reloc->offset = rel.r_offset; 2727e995a2eSmrg strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1); 2737e995a2eSmrg reloc->name[sizeof(reloc->name)-1] = 0; 2747e995a2eSmrg } 2757e995a2eSmrg} 2767e995a2eSmrg 2777e995a2eSmrgstatic void r600_elf_read(const char *elf_data, unsigned elf_size, 2781463c08dSmrg struct r600_shader_binary *binary) 2797e995a2eSmrg{ 2807e995a2eSmrg char *elf_buffer; 2817e995a2eSmrg Elf *elf; 2827e995a2eSmrg Elf_Scn *section = NULL; 2837e995a2eSmrg Elf_Data *symbols = NULL, *relocs = NULL; 2847e995a2eSmrg size_t section_str_index; 2857e995a2eSmrg unsigned symbol_sh_link = 0; 2867e995a2eSmrg 2877e995a2eSmrg /* One of the libelf implementations 2887e995a2eSmrg * (http://www.mr511.de/software/english.htm) requires calling 2897e995a2eSmrg * elf_version() before elf_memory(). 
2907e995a2eSmrg */ 2917e995a2eSmrg elf_version(EV_CURRENT); 2927e995a2eSmrg elf_buffer = MALLOC(elf_size); 2937e995a2eSmrg memcpy(elf_buffer, elf_data, elf_size); 2947e995a2eSmrg 2957e995a2eSmrg elf = elf_memory(elf_buffer, elf_size); 2967e995a2eSmrg 2977e995a2eSmrg elf_getshdrstrndx(elf, §ion_str_index); 2987e995a2eSmrg 2997e995a2eSmrg while ((section = elf_nextscn(elf, section))) { 3007e995a2eSmrg const char *name; 3017e995a2eSmrg Elf_Data *section_data = NULL; 3027e995a2eSmrg GElf_Shdr section_header; 3037e995a2eSmrg if (gelf_getshdr(section, §ion_header) != §ion_header) { 3047e995a2eSmrg fprintf(stderr, "Failed to read ELF section header\n"); 3057e995a2eSmrg return; 3067e995a2eSmrg } 3077e995a2eSmrg name = elf_strptr(elf, section_str_index, section_header.sh_name); 3087e995a2eSmrg if (!strcmp(name, ".text")) { 3097e995a2eSmrg section_data = elf_getdata(section, section_data); 3107e995a2eSmrg binary->code_size = section_data->d_size; 3117e995a2eSmrg binary->code = MALLOC(binary->code_size * sizeof(unsigned char)); 3127e995a2eSmrg memcpy(binary->code, section_data->d_buf, binary->code_size); 3137e995a2eSmrg } else if (!strcmp(name, ".AMDGPU.config")) { 3147e995a2eSmrg section_data = elf_getdata(section, section_data); 3157e995a2eSmrg binary->config_size = section_data->d_size; 3167e995a2eSmrg binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); 3177e995a2eSmrg memcpy(binary->config, section_data->d_buf, binary->config_size); 3187e995a2eSmrg } else if (!strcmp(name, ".AMDGPU.disasm")) { 3197e995a2eSmrg /* Always read disassembly if it's available. 
*/ 3207e995a2eSmrg section_data = elf_getdata(section, section_data); 3217e995a2eSmrg binary->disasm_string = strndup(section_data->d_buf, 3227e995a2eSmrg section_data->d_size); 3237e995a2eSmrg } else if (!strncmp(name, ".rodata", 7)) { 3247e995a2eSmrg section_data = elf_getdata(section, section_data); 3257e995a2eSmrg binary->rodata_size = section_data->d_size; 3267e995a2eSmrg binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char)); 3277e995a2eSmrg memcpy(binary->rodata, section_data->d_buf, binary->rodata_size); 3287e995a2eSmrg } else if (!strncmp(name, ".symtab", 7)) { 3297e995a2eSmrg symbols = elf_getdata(section, section_data); 3307e995a2eSmrg symbol_sh_link = section_header.sh_link; 3317e995a2eSmrg parse_symbol_table(symbols, §ion_header, binary); 3327e995a2eSmrg } else if (!strcmp(name, ".rel.text")) { 3337e995a2eSmrg relocs = elf_getdata(section, section_data); 3347e995a2eSmrg binary->reloc_count = section_header.sh_size / 3357e995a2eSmrg section_header.sh_entsize; 3367e995a2eSmrg } 3377e995a2eSmrg } 3387e995a2eSmrg 3397e995a2eSmrg parse_relocs(elf, relocs, symbols, symbol_sh_link, binary); 3407e995a2eSmrg 3417e995a2eSmrg if (elf){ 3427e995a2eSmrg elf_end(elf); 3437e995a2eSmrg } 3447e995a2eSmrg FREE(elf_buffer); 3457e995a2eSmrg 3467e995a2eSmrg /* Cache the config size per symbol */ 3477e995a2eSmrg if (binary->global_symbol_count) { 3487e995a2eSmrg binary->config_size_per_symbol = 3497e995a2eSmrg binary->config_size / binary->global_symbol_count; 3507e995a2eSmrg } else { 3517e995a2eSmrg binary->global_symbol_count = 1; 3527e995a2eSmrg binary->config_size_per_symbol = binary->config_size; 3537e995a2eSmrg } 3547e995a2eSmrg} 3557e995a2eSmrg 3567e995a2eSmrgstatic const unsigned char *r600_shader_binary_config_start( 3571463c08dSmrg const struct r600_shader_binary *binary, 3587e995a2eSmrg uint64_t symbol_offset) 3597e995a2eSmrg{ 360af69d88dSmrg unsigned i; 3617e995a2eSmrg for (i = 0; i < binary->global_symbol_count; ++i) { 3627e995a2eSmrg if 
(binary->global_symbol_offsets[i] == symbol_offset) { 3637e995a2eSmrg unsigned offset = i * binary->config_size_per_symbol; 3647e995a2eSmrg return binary->config + offset; 3657e995a2eSmrg } 3667e995a2eSmrg } 3677e995a2eSmrg return binary->config; 3687e995a2eSmrg} 369af69d88dSmrg 3701463c08dSmrgstatic void r600_shader_binary_read_config(const struct r600_shader_binary *binary, 3717e995a2eSmrg struct r600_bytecode *bc, 3727e995a2eSmrg uint64_t symbol_offset, 3737e995a2eSmrg boolean *use_kill) 3747e995a2eSmrg{ 3757e995a2eSmrg unsigned i; 3767e995a2eSmrg const unsigned char *config = 3777e995a2eSmrg r600_shader_binary_config_start(binary, symbol_offset); 3787e995a2eSmrg 3797e995a2eSmrg for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 3807e995a2eSmrg unsigned reg = 3817e995a2eSmrg util_le32_to_cpu(*(uint32_t*)(config + i)); 3827e995a2eSmrg unsigned value = 3837e995a2eSmrg util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 3847e995a2eSmrg switch (reg) { 3857e995a2eSmrg /* R600 / R700 */ 3867e995a2eSmrg case R_028850_SQ_PGM_RESOURCES_PS: 3877e995a2eSmrg case R_028868_SQ_PGM_RESOURCES_VS: 3887e995a2eSmrg /* Evergreen / Northern Islands */ 3897e995a2eSmrg case R_028844_SQ_PGM_RESOURCES_PS: 3907e995a2eSmrg case R_028860_SQ_PGM_RESOURCES_VS: 3917e995a2eSmrg case R_0288D4_SQ_PGM_RESOURCES_LS: 3927e995a2eSmrg bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value)); 3937e995a2eSmrg bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value)); 3947e995a2eSmrg break; 3957e995a2eSmrg case R_02880C_DB_SHADER_CONTROL: 3967e995a2eSmrg *use_kill = G_02880C_KILL_ENABLE(value); 3977e995a2eSmrg break; 3987e995a2eSmrg case R_0288E8_SQ_LDS_ALLOC: 3997e995a2eSmrg bc->nlds_dw = value; 4007e995a2eSmrg break; 4017e995a2eSmrg } 4027e995a2eSmrg } 4037e995a2eSmrg} 404af69d88dSmrg 4057e995a2eSmrgstatic unsigned r600_create_shader(struct r600_bytecode *bc, 4061463c08dSmrg const struct r600_shader_binary *binary, 4077e995a2eSmrg boolean *use_kill) 4087e995a2eSmrg 4097e995a2eSmrg{ 4107e995a2eSmrg 
assert(binary->code_size % 4 == 0); 4117e995a2eSmrg bc->bytecode = CALLOC(1, binary->code_size); 4127e995a2eSmrg memcpy(bc->bytecode, binary->code, binary->code_size); 4137e995a2eSmrg bc->ndw = binary->code_size / 4; 4147e995a2eSmrg 4157e995a2eSmrg r600_shader_binary_read_config(binary, bc, 0, use_kill); 4167e995a2eSmrg return 0; 4177e995a2eSmrg} 418af69d88dSmrg 419af69d88dSmrg#endif 420af69d88dSmrg 4217e995a2eSmrgstatic void r600_destroy_shader(struct r600_bytecode *bc) 4227e995a2eSmrg{ 4237e995a2eSmrg FREE(bc->bytecode); 4247e995a2eSmrg} 4257e995a2eSmrg 4267e995a2eSmrgstatic void *evergreen_create_compute_state(struct pipe_context *ctx, 4277e995a2eSmrg const struct pipe_compute_state *cso) 4287e995a2eSmrg{ 4297e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 4307e995a2eSmrg struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); 4317e995a2eSmrg#ifdef HAVE_OPENCL 4321463c08dSmrg const struct pipe_binary_program_header *header; 4337e995a2eSmrg void *p; 4347e995a2eSmrg boolean use_kill; 4357e995a2eSmrg#endif 4367e995a2eSmrg 4377e995a2eSmrg shader->ctx = rctx; 438af69d88dSmrg shader->local_size = cso->req_local_mem; 439af69d88dSmrg shader->private_size = cso->req_private_mem; 440af69d88dSmrg shader->input_size = cso->req_input_mem; 441af69d88dSmrg 4427e995a2eSmrg shader->ir_type = cso->ir_type; 443af69d88dSmrg 4441463c08dSmrg if (shader->ir_type == PIPE_SHADER_IR_TGSI || 4451463c08dSmrg shader->ir_type == PIPE_SHADER_IR_NIR) { 4461463c08dSmrg shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE); 4477e995a2eSmrg return shader; 448af69d88dSmrg } 4497e995a2eSmrg#ifdef HAVE_OPENCL 4507e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n"); 4517e995a2eSmrg header = cso->prog; 4527e995a2eSmrg radeon_shader_binary_init(&shader->binary); 4531463c08dSmrg r600_elf_read(header->blob, header->num_bytes, &shader->binary); 4547e995a2eSmrg r600_create_shader(&shader->bc, 
&shader->binary, &use_kill); 4557e995a2eSmrg 4567e995a2eSmrg /* Upload code + ROdata */ 4577e995a2eSmrg shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen, 4587e995a2eSmrg shader->bc.ndw * 4); 459d8407755Smaya p = r600_buffer_map_sync_with_rings( 460d8407755Smaya &rctx->b, shader->code_bo, 4611463c08dSmrg PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); 4627e995a2eSmrg //TODO: use util_memcpy_cpu_to_le32 ? 4637e995a2eSmrg memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4); 4641463c08dSmrg rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf); 465af69d88dSmrg#endif 4667e995a2eSmrg 467af69d88dSmrg return shader; 468af69d88dSmrg} 469af69d88dSmrg 4707e995a2eSmrgstatic void evergreen_delete_compute_state(struct pipe_context *ctx, void *state) 471af69d88dSmrg{ 4727e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 4737e995a2eSmrg struct r600_pipe_compute *shader = state; 4747e995a2eSmrg 4757e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n"); 476af69d88dSmrg 477af69d88dSmrg if (!shader) 478af69d88dSmrg return; 479af69d88dSmrg 4801463c08dSmrg if (shader->ir_type == PIPE_SHADER_IR_TGSI || 4811463c08dSmrg shader->ir_type == PIPE_SHADER_IR_NIR) { 4827e995a2eSmrg r600_delete_shader_selector(ctx, shader->sel); 4837e995a2eSmrg } else { 484af69d88dSmrg#ifdef HAVE_OPENCL 4857e995a2eSmrg radeon_shader_binary_clean(&shader->binary); 486d8407755Smaya pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL); 487d8407755Smaya pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL); 488af69d88dSmrg#endif 4897e995a2eSmrg r600_destroy_shader(&shader->bc); 4907e995a2eSmrg } 491af69d88dSmrg FREE(shader); 492af69d88dSmrg} 493af69d88dSmrg 4947e995a2eSmrgstatic void evergreen_bind_compute_state(struct pipe_context *ctx, void *state) 495af69d88dSmrg{ 4967e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 4977e995a2eSmrg struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state; 
4987e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n"); 4997e995a2eSmrg 5007e995a2eSmrg if (!state) { 5017e995a2eSmrg rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 5027e995a2eSmrg return; 5037e995a2eSmrg } 5047e995a2eSmrg 5051463c08dSmrg if (cstate->ir_type == PIPE_SHADER_IR_TGSI || 5061463c08dSmrg cstate->ir_type == PIPE_SHADER_IR_NIR) { 5077e995a2eSmrg bool compute_dirty; 5081463c08dSmrg cstate->sel->ir_type = cstate->ir_type; 5091463c08dSmrg if (r600_shader_select(ctx, cstate->sel, &compute_dirty)) 5101463c08dSmrg R600_ERR("Failed to select compute shader\n"); 5117e995a2eSmrg } 5121463c08dSmrg 5137e995a2eSmrg rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 514af69d88dSmrg} 515af69d88dSmrg 516af69d88dSmrg/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit 517af69d88dSmrg * kernel parameters there are implicit parameters that need to be stored 518af69d88dSmrg * in the vertex buffer as well. Here is how these parameters are organized in 519af69d88dSmrg * the buffer: 520af69d88dSmrg * 521af69d88dSmrg * DWORDS 0-2: Number of work groups in each dimension (x,y,z) 522af69d88dSmrg * DWORDS 3-5: Number of global work items in each dimension (x,y,z) 523af69d88dSmrg * DWORDS 6-8: Number of work items within each work group in each dimension 524af69d88dSmrg * (x,y,z) 525af69d88dSmrg * DWORDS 9+ : Kernel parameters 526af69d88dSmrg */ 5277e995a2eSmrgstatic void evergreen_compute_upload_input(struct pipe_context *ctx, 5287e995a2eSmrg const struct pipe_grid_info *info) 529af69d88dSmrg{ 5307e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 5317e995a2eSmrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 532af69d88dSmrg unsigned i; 533af69d88dSmrg /* We need to reserve 9 dwords (36 bytes) for implicit kernel 534af69d88dSmrg * parameters. 
535af69d88dSmrg */ 5367e995a2eSmrg unsigned input_size; 5377e995a2eSmrg uint32_t *num_work_groups_start; 5387e995a2eSmrg uint32_t *global_size_start; 5397e995a2eSmrg uint32_t *local_size_start; 5407e995a2eSmrg uint32_t *kernel_parameters_start; 541af69d88dSmrg struct pipe_box box; 542af69d88dSmrg struct pipe_transfer *transfer = NULL; 543af69d88dSmrg 5447e995a2eSmrg if (!shader) 5457e995a2eSmrg return; 546af69d88dSmrg if (shader->input_size == 0) { 547af69d88dSmrg return; 548af69d88dSmrg } 5497e995a2eSmrg input_size = shader->input_size + 36; 550af69d88dSmrg if (!shader->kernel_param) { 551af69d88dSmrg /* Add space for the grid dimensions */ 552af69d88dSmrg shader->kernel_param = (struct r600_resource *) 5537e995a2eSmrg pipe_buffer_create(ctx->screen, 0, 554af69d88dSmrg PIPE_USAGE_IMMUTABLE, input_size); 555af69d88dSmrg } 556af69d88dSmrg 557af69d88dSmrg u_box_1d(0, input_size, &box); 5581463c08dSmrg num_work_groups_start = ctx->buffer_map(ctx, 559af69d88dSmrg (struct pipe_resource*)shader->kernel_param, 5601463c08dSmrg 0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE, 561af69d88dSmrg &box, &transfer); 562af69d88dSmrg global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4)); 563af69d88dSmrg local_size_start = global_size_start + (3 * (sizeof(uint)) / 4); 564af69d88dSmrg kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4); 565af69d88dSmrg 566af69d88dSmrg /* Copy the work group size */ 5677e995a2eSmrg memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint)); 568af69d88dSmrg 569af69d88dSmrg /* Copy the global size */ 570af69d88dSmrg for (i = 0; i < 3; i++) { 5717e995a2eSmrg global_size_start[i] = info->grid[i] * info->block[i]; 572af69d88dSmrg } 573af69d88dSmrg 574af69d88dSmrg /* Copy the local dimensions */ 5757e995a2eSmrg memcpy(local_size_start, info->block, 3 * sizeof(uint)); 576af69d88dSmrg 577af69d88dSmrg /* Copy the kernel inputs */ 5787e995a2eSmrg memcpy(kernel_parameters_start, info->input, shader->input_size); 579af69d88dSmrg 
580af69d88dSmrg for (i = 0; i < (input_size / 4); i++) { 5817e995a2eSmrg COMPUTE_DBG(rctx->screen, "input %i : %u\n", i, 582af69d88dSmrg ((unsigned*)num_work_groups_start)[i]); 583af69d88dSmrg } 584af69d88dSmrg 5851463c08dSmrg ctx->buffer_unmap(ctx, transfer); 586af69d88dSmrg 5877e995a2eSmrg /* ID=0 and ID=3 are reserved for the parameters. 5887e995a2eSmrg * LLVM will preferably use ID=0, but it does not work for dynamic 5897e995a2eSmrg * indices. */ 5907e995a2eSmrg evergreen_cs_set_vertex_buffer(rctx, 3, 0, 5917e995a2eSmrg (struct pipe_resource*)shader->kernel_param); 5927e995a2eSmrg evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size, 593af69d88dSmrg (struct pipe_resource*)shader->kernel_param); 594af69d88dSmrg} 595af69d88dSmrg 5967e995a2eSmrgstatic void evergreen_emit_dispatch(struct r600_context *rctx, 5977e995a2eSmrg const struct pipe_grid_info *info, 5987e995a2eSmrg uint32_t indirect_grid[3]) 599af69d88dSmrg{ 600af69d88dSmrg int i; 6011463c08dSmrg struct radeon_cmdbuf *cs = &rctx->b.gfx.cs; 602af69d88dSmrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 6037e995a2eSmrg bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; 604af69d88dSmrg unsigned num_waves; 6057e995a2eSmrg unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes; 606af69d88dSmrg unsigned wave_divisor = (16 * num_pipes); 607af69d88dSmrg int group_size = 1; 608af69d88dSmrg int grid_size = 1; 6097e995a2eSmrg unsigned lds_size = shader->local_size / 4; 6107e995a2eSmrg 6111463c08dSmrg if (shader->ir_type != PIPE_SHADER_IR_TGSI && 6121463c08dSmrg shader->ir_type != PIPE_SHADER_IR_NIR) 6137e995a2eSmrg lds_size += shader->bc.nlds_dw; 6141463c08dSmrg 615af69d88dSmrg /* Calculate group_size/grid_size */ 616af69d88dSmrg for (i = 0; i < 3; i++) { 6177e995a2eSmrg group_size *= info->block[i]; 618af69d88dSmrg } 619af69d88dSmrg 620af69d88dSmrg for (i = 0; i < 3; i++) { 6217e995a2eSmrg grid_size *= info->grid[i]; 622af69d88dSmrg } 623af69d88dSmrg 
624af69d88dSmrg /* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */ 6257e995a2eSmrg num_waves = (info->block[0] * info->block[1] * info->block[2] + 626af69d88dSmrg wave_divisor - 1) / wave_divisor; 627af69d88dSmrg 628af69d88dSmrg COMPUTE_DBG(rctx->screen, "Using %u pipes, " 629af69d88dSmrg "%u wavefronts per thread block, " 630af69d88dSmrg "allocating %u dwords lds.\n", 631af69d88dSmrg num_pipes, num_waves, lds_size); 632af69d88dSmrg 6337e995a2eSmrg radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); 634af69d88dSmrg 6357e995a2eSmrg radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); 636af69d88dSmrg radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ 637af69d88dSmrg radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ 638af69d88dSmrg radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ 639af69d88dSmrg 6407e995a2eSmrg radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, 641af69d88dSmrg group_size); 642af69d88dSmrg 6437e995a2eSmrg radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); 6447e995a2eSmrg radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ 6457e995a2eSmrg radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ 6467e995a2eSmrg radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ 647af69d88dSmrg 648af69d88dSmrg if (rctx->b.chip_class < CAYMAN) { 649af69d88dSmrg assert(lds_size <= 8192); 650af69d88dSmrg } else { 651af69d88dSmrg /* Cayman appears to have a slightly smaller limit, see the 652af69d88dSmrg * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ 653af69d88dSmrg assert(lds_size <= 8160); 654af69d88dSmrg } 655af69d88dSmrg 6567e995a2eSmrg radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, 657af69d88dSmrg lds_size | (num_waves << 14)); 658af69d88dSmrg 6597e995a2eSmrg if (info->indirect) { 6607e995a2eSmrg radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit)); 6617e995a2eSmrg radeon_emit(cs, 
indirect_grid[0]); 6627e995a2eSmrg radeon_emit(cs, indirect_grid[1]); 6637e995a2eSmrg radeon_emit(cs, indirect_grid[2]); 6647e995a2eSmrg radeon_emit(cs, 1); 6657e995a2eSmrg } else { 6667e995a2eSmrg /* Dispatch packet */ 6677e995a2eSmrg radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit)); 6687e995a2eSmrg radeon_emit(cs, info->grid[0]); 6697e995a2eSmrg radeon_emit(cs, info->grid[1]); 6707e995a2eSmrg radeon_emit(cs, info->grid[2]); 6717e995a2eSmrg /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ 6727e995a2eSmrg radeon_emit(cs, 1); 6737e995a2eSmrg } 6747e995a2eSmrg 6757e995a2eSmrg if (rctx->is_debug) 6767e995a2eSmrg eg_trace_emit(rctx); 677af69d88dSmrg} 678af69d88dSmrg 6797e995a2eSmrgstatic void compute_setup_cbs(struct r600_context *rctx) 680af69d88dSmrg{ 6811463c08dSmrg struct radeon_cmdbuf *cs = &rctx->b.gfx.cs; 682af69d88dSmrg unsigned i; 683af69d88dSmrg 684af69d88dSmrg /* Emit colorbuffers. */ 685af69d88dSmrg /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ 6867e995a2eSmrg for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) { 6877e995a2eSmrg struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i]; 6887e995a2eSmrg unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 689af69d88dSmrg (struct r600_resource*)cb->base.texture, 690af69d88dSmrg RADEON_USAGE_READWRITE, 6917e995a2eSmrg RADEON_PRIO_SHADER_RW_BUFFER); 692af69d88dSmrg 6937e995a2eSmrg radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); 694af69d88dSmrg radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ 695af69d88dSmrg radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ 696af69d88dSmrg radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ 697af69d88dSmrg radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ 698af69d88dSmrg radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ 699af69d88dSmrg radeon_emit(cs, 
cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */ 700af69d88dSmrg radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ 701af69d88dSmrg 702af69d88dSmrg radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ 703af69d88dSmrg radeon_emit(cs, reloc); 704af69d88dSmrg 705af69d88dSmrg radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ 706af69d88dSmrg radeon_emit(cs, reloc); 707af69d88dSmrg } 7087e995a2eSmrg for (; i < 8 ; i++) 7097e995a2eSmrg radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 7107e995a2eSmrg S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 7117e995a2eSmrg for (; i < 12; i++) 7127e995a2eSmrg radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, 7137e995a2eSmrg S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 7147e995a2eSmrg 7157e995a2eSmrg /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ 7167e995a2eSmrg radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, 7177e995a2eSmrg rctx->compute_cb_target_mask); 7187e995a2eSmrg} 7197e995a2eSmrg 7207e995a2eSmrgstatic void compute_emit_cs(struct r600_context *rctx, 7217e995a2eSmrg const struct pipe_grid_info *info) 7227e995a2eSmrg{ 7231463c08dSmrg struct radeon_cmdbuf *cs = &rctx->b.gfx.cs; 7247e995a2eSmrg bool compute_dirty = false; 7257e995a2eSmrg struct r600_pipe_shader *current; 7267e995a2eSmrg struct r600_shader_atomic combined_atomics[8]; 7277e995a2eSmrg uint8_t atomic_used_mask; 7287e995a2eSmrg uint32_t indirect_grid[3] = { 0, 0, 0 }; 7297e995a2eSmrg 7307e995a2eSmrg /* make sure that the gfx ring is only one active */ 7311463c08dSmrg if (radeon_emitted(&rctx->b.dma.cs, 0)) { 7327e995a2eSmrg rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL); 7337e995a2eSmrg } 7347e995a2eSmrg 7357e995a2eSmrg r600_update_compressed_resource_state(rctx, true); 7367e995a2eSmrg 7377e995a2eSmrg if (!rctx->cmd_buf_is_compute) { 7387e995a2eSmrg rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL); 7397e995a2eSmrg rctx->cmd_buf_is_compute = true; 7407e995a2eSmrg } 
7417e995a2eSmrg 7421463c08dSmrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI|| 7431463c08dSmrg rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) { 7441463c08dSmrg if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) { 7451463c08dSmrg R600_ERR("Failed to select compute shader\n"); 7461463c08dSmrg return; 7471463c08dSmrg } 7481463c08dSmrg 7497e995a2eSmrg current = rctx->cs_shader_state.shader->sel->current; 7507e995a2eSmrg if (compute_dirty) { 7517e995a2eSmrg rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw; 7527e995a2eSmrg r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo); 7537e995a2eSmrg r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true); 7547e995a2eSmrg } 7557e995a2eSmrg 7567e995a2eSmrg bool need_buf_const = current->shader.uses_tex_buffers || 7577e995a2eSmrg current->shader.has_txq_cube_array_z_comp; 7587e995a2eSmrg 7597e995a2eSmrg if (info->indirect) { 7607e995a2eSmrg struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect; 7611463c08dSmrg unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ); 7627e995a2eSmrg unsigned offset = info->indirect_offset / 4; 7637e995a2eSmrg indirect_grid[0] = data[offset]; 7647e995a2eSmrg indirect_grid[1] = data[offset + 1]; 7657e995a2eSmrg indirect_grid[2] = data[offset + 2]; 7667e995a2eSmrg } 7677e995a2eSmrg for (int i = 0; i < 3; i++) { 7687e995a2eSmrg rctx->cs_block_grid_sizes[i] = info->block[i]; 7697e995a2eSmrg rctx->cs_block_grid_sizes[i + 4] = info->indirect ? 
indirect_grid[i] : info->grid[i]; 7707e995a2eSmrg } 7717e995a2eSmrg rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0; 7727e995a2eSmrg rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true; 7737e995a2eSmrg 7747e995a2eSmrg evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask); 7757e995a2eSmrg r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask)); 7767e995a2eSmrg 7777e995a2eSmrg if (need_buf_const) { 7787e995a2eSmrg eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE); 779af69d88dSmrg } 7807e995a2eSmrg r600_update_driver_const_buffers(rctx, true); 7817e995a2eSmrg 7827e995a2eSmrg evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask); 7837e995a2eSmrg if (atomic_used_mask) { 7847e995a2eSmrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 7857e995a2eSmrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 786af69d88dSmrg } 7877e995a2eSmrg } else 7887e995a2eSmrg r600_need_cs_space(rctx, 0, true, 0); 7897e995a2eSmrg 7907e995a2eSmrg /* Initialize all the compute-related registers. 7917e995a2eSmrg * 7927e995a2eSmrg * See evergreen_init_atom_start_compute_cs() in this file for the list 7937e995a2eSmrg * of registers initialized by the start_compute_cs_cmd atom. 
7947e995a2eSmrg */ 7957e995a2eSmrg r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd); 7967e995a2eSmrg 7977e995a2eSmrg /* emit config state */ 7987e995a2eSmrg if (rctx->b.chip_class == EVERGREEN) { 7991463c08dSmrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI|| 8001463c08dSmrg rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) { 8017e995a2eSmrg radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3); 8027e995a2eSmrg radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs)); 8037e995a2eSmrg radeon_emit(cs, 0); 8047e995a2eSmrg radeon_emit(cs, 0); 8057e995a2eSmrg radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8)); 8067e995a2eSmrg } else 8077e995a2eSmrg r600_emit_atom(rctx, &rctx->config_state.atom); 808af69d88dSmrg } 809af69d88dSmrg 8107e995a2eSmrg rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; 8117e995a2eSmrg r600_flush_emit(rctx); 812af69d88dSmrg 8131463c08dSmrg if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI && 8141463c08dSmrg rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) { 815af69d88dSmrg 8167e995a2eSmrg compute_setup_cbs(rctx); 8177e995a2eSmrg 8187e995a2eSmrg /* Emit vertex buffer state */ 8197e995a2eSmrg rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask); 8207e995a2eSmrg r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom); 8217e995a2eSmrg } else { 8227e995a2eSmrg uint32_t rat_mask; 8237e995a2eSmrg 8247e995a2eSmrg rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0); 8257e995a2eSmrg radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, 8267e995a2eSmrg rat_mask); 8277e995a2eSmrg } 8287e995a2eSmrg 8297e995a2eSmrg r600_emit_atom(rctx, &rctx->b.render_cond_atom); 830af69d88dSmrg 831af69d88dSmrg /* Emit constant buffer state */ 8327e995a2eSmrg r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); 8337e995a2eSmrg 
8347e995a2eSmrg /* Emit sampler state */ 8357e995a2eSmrg r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom); 8367e995a2eSmrg 8377e995a2eSmrg /* Emit sampler view (texture resource) state */ 8387e995a2eSmrg r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom); 8397e995a2eSmrg 8407e995a2eSmrg /* Emit images state */ 8417e995a2eSmrg r600_emit_atom(rctx, &rctx->compute_images.atom); 8427e995a2eSmrg 8437e995a2eSmrg /* Emit buffers state */ 8447e995a2eSmrg r600_emit_atom(rctx, &rctx->compute_buffers.atom); 845af69d88dSmrg 8467e995a2eSmrg /* Emit shader state */ 8477e995a2eSmrg r600_emit_atom(rctx, &rctx->cs_shader_state.atom); 848af69d88dSmrg 849af69d88dSmrg /* Emit dispatch state and dispatch packet */ 8507e995a2eSmrg evergreen_emit_dispatch(rctx, info, indirect_grid); 851af69d88dSmrg 852af69d88dSmrg /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff 853af69d88dSmrg */ 8547e995a2eSmrg rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | 855af69d88dSmrg R600_CONTEXT_INV_VERTEX_CACHE | 856af69d88dSmrg R600_CONTEXT_INV_TEX_CACHE; 8577e995a2eSmrg r600_flush_emit(rctx); 8587e995a2eSmrg rctx->b.flags = 0; 859af69d88dSmrg 8607e995a2eSmrg if (rctx->b.chip_class >= CAYMAN) { 8617e995a2eSmrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 8627e995a2eSmrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 863af69d88dSmrg /* DEALLOC_STATE prevents the GPU from hanging when a 864af69d88dSmrg * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT 865af69d88dSmrg * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. 
866af69d88dSmrg */ 8677e995a2eSmrg radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0)); 8687e995a2eSmrg radeon_emit(cs, 0); 869af69d88dSmrg } 8701463c08dSmrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI || 8711463c08dSmrg rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) 8727e995a2eSmrg evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask); 873af69d88dSmrg 874af69d88dSmrg#if 0 8757e995a2eSmrg COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw); 876af69d88dSmrg for (i = 0; i < cs->cdw; i++) { 8777e995a2eSmrg COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); 878af69d88dSmrg } 879af69d88dSmrg#endif 880af69d88dSmrg 881af69d88dSmrg} 882af69d88dSmrg 883af69d88dSmrg 884af69d88dSmrg/** 885af69d88dSmrg * Emit function for r600_cs_shader_state atom 886af69d88dSmrg */ 8877e995a2eSmrgvoid evergreen_emit_cs_shader(struct r600_context *rctx, 8887e995a2eSmrg struct r600_atom *atom) 889af69d88dSmrg{ 890af69d88dSmrg struct r600_cs_shader_state *state = 891af69d88dSmrg (struct r600_cs_shader_state*)atom; 892af69d88dSmrg struct r600_pipe_compute *shader = state->shader; 8931463c08dSmrg struct radeon_cmdbuf *cs = &rctx->b.gfx.cs; 8947e995a2eSmrg uint64_t va; 8957e995a2eSmrg struct r600_resource *code_bo; 8967e995a2eSmrg unsigned ngpr, nstack; 8977e995a2eSmrg 8981463c08dSmrg if (shader->ir_type == PIPE_SHADER_IR_TGSI || 8991463c08dSmrg shader->ir_type == PIPE_SHADER_IR_NIR) { 9007e995a2eSmrg code_bo = shader->sel->current->bo; 9017e995a2eSmrg va = shader->sel->current->bo->gpu_address; 9027e995a2eSmrg ngpr = shader->sel->current->shader.bc.ngpr; 9037e995a2eSmrg nstack = shader->sel->current->shader.bc.nstack; 9047e995a2eSmrg } else { 9057e995a2eSmrg code_bo = shader->code_bo; 9067e995a2eSmrg va = shader->code_bo->gpu_address + state->pc; 9077e995a2eSmrg ngpr = shader->bc.ngpr; 9087e995a2eSmrg nstack = shader->bc.nstack; 9097e995a2eSmrg } 910af69d88dSmrg 9117e995a2eSmrg radeon_compute_set_context_reg_seq(cs, 
R_0288D0_SQ_PGM_START_LS, 3); 9127e995a2eSmrg radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ 913af69d88dSmrg radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ 9147e995a2eSmrg S_0288D4_NUM_GPRS(ngpr) | 9157e995a2eSmrg S_0288D4_DX10_CLAMP(1) | 9167e995a2eSmrg S_0288D4_STACK_SIZE(nstack)); 917af69d88dSmrg radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ 918af69d88dSmrg 919af69d88dSmrg radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); 9207e995a2eSmrg radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 9217e995a2eSmrg code_bo, RADEON_USAGE_READ, 9227e995a2eSmrg RADEON_PRIO_SHADER_BINARY)); 923af69d88dSmrg} 924af69d88dSmrg 9257e995a2eSmrgstatic void evergreen_launch_grid(struct pipe_context *ctx, 9267e995a2eSmrg const struct pipe_grid_info *info) 927af69d88dSmrg{ 9287e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 929af69d88dSmrg#ifdef HAVE_OPENCL 9307e995a2eSmrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 9317e995a2eSmrg boolean use_kill; 932af69d88dSmrg 9331463c08dSmrg if (shader->ir_type != PIPE_SHADER_IR_TGSI && 9341463c08dSmrg shader->ir_type != PIPE_SHADER_IR_NIR) { 9357e995a2eSmrg rctx->cs_shader_state.pc = info->pc; 9367e995a2eSmrg /* Get the config information for this kernel. 
*/ 9377e995a2eSmrg r600_shader_binary_read_config(&shader->binary, &shader->bc, 9387e995a2eSmrg info->pc, &use_kill); 9397e995a2eSmrg } else { 9407e995a2eSmrg use_kill = false; 9417e995a2eSmrg rctx->cs_shader_state.pc = 0; 942af69d88dSmrg } 943af69d88dSmrg#endif 9447e995a2eSmrg 9457e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc); 9467e995a2eSmrg 9477e995a2eSmrg 9487e995a2eSmrg evergreen_compute_upload_input(ctx, info); 9497e995a2eSmrg compute_emit_cs(rctx, info); 950af69d88dSmrg} 951af69d88dSmrg 9527e995a2eSmrgstatic void evergreen_set_compute_resources(struct pipe_context *ctx, 9537e995a2eSmrg unsigned start, unsigned count, 9547e995a2eSmrg struct pipe_surface **surfaces) 955af69d88dSmrg{ 9567e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 957af69d88dSmrg struct r600_surface **resources = (struct r600_surface **)surfaces; 958af69d88dSmrg 9597e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n", 960af69d88dSmrg start, count); 961af69d88dSmrg 962af69d88dSmrg for (unsigned i = 0; i < count; i++) { 9637e995a2eSmrg /* The First four vertex buffers are reserved for parameters and 964af69d88dSmrg * global buffers. 
*/ 9657e995a2eSmrg unsigned vtx_id = 4 + i; 966af69d88dSmrg if (resources[i]) { 967af69d88dSmrg struct r600_resource_global *buffer = 968af69d88dSmrg (struct r600_resource_global*) 969af69d88dSmrg resources[i]->base.texture; 970af69d88dSmrg if (resources[i]->base.writable) { 971af69d88dSmrg assert(i+1 < 12); 972af69d88dSmrg 9737e995a2eSmrg evergreen_set_rat(rctx->cs_shader_state.shader, i+1, 974af69d88dSmrg (struct r600_resource *)resources[i]->base.texture, 975af69d88dSmrg buffer->chunk->start_in_dw*4, 976af69d88dSmrg resources[i]->base.texture->width0); 977af69d88dSmrg } 978af69d88dSmrg 9797e995a2eSmrg evergreen_cs_set_vertex_buffer(rctx, vtx_id, 980af69d88dSmrg buffer->chunk->start_in_dw * 4, 981af69d88dSmrg resources[i]->base.texture); 982af69d88dSmrg } 983af69d88dSmrg } 984af69d88dSmrg} 985af69d88dSmrg 9867e995a2eSmrgstatic void evergreen_set_global_binding(struct pipe_context *ctx, 9877e995a2eSmrg unsigned first, unsigned n, 9887e995a2eSmrg struct pipe_resource **resources, 9897e995a2eSmrg uint32_t **handles) 990af69d88dSmrg{ 9917e995a2eSmrg struct r600_context *rctx = (struct r600_context *)ctx; 9927e995a2eSmrg struct compute_memory_pool *pool = rctx->screen->global_pool; 993af69d88dSmrg struct r600_resource_global **buffers = 994af69d88dSmrg (struct r600_resource_global **)resources; 995af69d88dSmrg unsigned i; 996af69d88dSmrg 9977e995a2eSmrg COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n", 998af69d88dSmrg first, n); 999af69d88dSmrg 1000af69d88dSmrg if (!resources) { 1001af69d88dSmrg /* XXX: Unset */ 1002af69d88dSmrg return; 1003af69d88dSmrg } 1004af69d88dSmrg 1005af69d88dSmrg /* We mark these items for promotion to the pool if they 1006af69d88dSmrg * aren't already there */ 1007af69d88dSmrg for (i = first; i < first + n; i++) { 1008af69d88dSmrg struct compute_memory_item *item = buffers[i]->chunk; 1009af69d88dSmrg 1010af69d88dSmrg if (!is_item_in_pool(item)) 1011af69d88dSmrg buffers[i]->chunk->status |= ITEM_FOR_PROMOTING; 
1012af69d88dSmrg } 1013af69d88dSmrg 10147e995a2eSmrg if (compute_memory_finalize_pending(pool, ctx) == -1) { 1015af69d88dSmrg /* XXX: Unset */ 1016af69d88dSmrg return; 1017af69d88dSmrg } 1018af69d88dSmrg 1019af69d88dSmrg for (i = first; i < first + n; i++) 1020af69d88dSmrg { 1021af69d88dSmrg uint32_t buffer_offset; 1022af69d88dSmrg uint32_t handle; 1023af69d88dSmrg assert(resources[i]->target == PIPE_BUFFER); 1024af69d88dSmrg assert(resources[i]->bind & PIPE_BIND_GLOBAL); 1025af69d88dSmrg 1026af69d88dSmrg buffer_offset = util_le32_to_cpu(*(handles[i])); 1027af69d88dSmrg handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4; 1028af69d88dSmrg 1029af69d88dSmrg *(handles[i]) = util_cpu_to_le32(handle); 1030af69d88dSmrg } 1031af69d88dSmrg 10327e995a2eSmrg /* globals for writing */ 10337e995a2eSmrg evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4); 10347e995a2eSmrg /* globals for reading */ 10357e995a2eSmrg evergreen_cs_set_vertex_buffer(rctx, 1, 0, 1036af69d88dSmrg (struct pipe_resource*)pool->bo); 10377e995a2eSmrg 10387e995a2eSmrg /* constants for reading, LLVM puts them in text segment */ 10397e995a2eSmrg evergreen_cs_set_vertex_buffer(rctx, 2, 0, 10407e995a2eSmrg (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo); 1041af69d88dSmrg} 1042af69d88dSmrg 1043af69d88dSmrg/** 1044af69d88dSmrg * This function initializes all the compute specific registers that need to 1045af69d88dSmrg * be initialized for each compute command stream. Registers that are common 1046af69d88dSmrg * to both compute and 3D will be initialized at the beginning of each compute 1047af69d88dSmrg * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG 1048af69d88dSmrg * packet requires that the shader type bit be set, we must initialize all 1049af69d88dSmrg * context registers needed for compute in this function. 
The registers 10507e995a2eSmrg * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the 1051af69d88dSmrg * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending 1052af69d88dSmrg * on the GPU family. 1053af69d88dSmrg */ 10547e995a2eSmrgvoid evergreen_init_atom_start_compute_cs(struct r600_context *rctx) 1055af69d88dSmrg{ 10567e995a2eSmrg struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd; 1057af69d88dSmrg int num_threads; 1058af69d88dSmrg int num_stack_entries; 1059af69d88dSmrg 10607e995a2eSmrg /* since all required registers are initialized in the 1061af69d88dSmrg * start_compute_cs_cmd atom, we can EMIT_EARLY here. 1062af69d88dSmrg */ 1063af69d88dSmrg r600_init_command_buffer(cb, 256); 1064af69d88dSmrg cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; 1065af69d88dSmrg 1066af69d88dSmrg /* We're setting config registers here. */ 1067af69d88dSmrg r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1068af69d88dSmrg r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1069af69d88dSmrg 10707e995a2eSmrg switch (rctx->b.family) { 1071af69d88dSmrg case CHIP_CEDAR: 1072af69d88dSmrg default: 1073af69d88dSmrg num_threads = 128; 1074af69d88dSmrg num_stack_entries = 256; 1075af69d88dSmrg break; 1076af69d88dSmrg case CHIP_REDWOOD: 1077af69d88dSmrg num_threads = 128; 1078af69d88dSmrg num_stack_entries = 256; 1079af69d88dSmrg break; 1080af69d88dSmrg case CHIP_JUNIPER: 1081af69d88dSmrg num_threads = 128; 1082af69d88dSmrg num_stack_entries = 512; 1083af69d88dSmrg break; 1084af69d88dSmrg case CHIP_CYPRESS: 1085af69d88dSmrg case CHIP_HEMLOCK: 1086af69d88dSmrg num_threads = 128; 1087af69d88dSmrg num_stack_entries = 512; 1088af69d88dSmrg break; 1089af69d88dSmrg case CHIP_PALM: 1090af69d88dSmrg num_threads = 128; 1091af69d88dSmrg num_stack_entries = 256; 1092af69d88dSmrg break; 1093af69d88dSmrg case CHIP_SUMO: 1094af69d88dSmrg num_threads = 128; 1095af69d88dSmrg num_stack_entries = 256; 1096af69d88dSmrg 
break; 1097af69d88dSmrg case CHIP_SUMO2: 1098af69d88dSmrg num_threads = 128; 1099af69d88dSmrg num_stack_entries = 512; 1100af69d88dSmrg break; 1101af69d88dSmrg case CHIP_BARTS: 1102af69d88dSmrg num_threads = 128; 1103af69d88dSmrg num_stack_entries = 512; 1104af69d88dSmrg break; 1105af69d88dSmrg case CHIP_TURKS: 1106af69d88dSmrg num_threads = 128; 1107af69d88dSmrg num_stack_entries = 256; 1108af69d88dSmrg break; 1109af69d88dSmrg case CHIP_CAICOS: 1110af69d88dSmrg num_threads = 128; 1111af69d88dSmrg num_stack_entries = 256; 1112af69d88dSmrg break; 1113af69d88dSmrg } 1114af69d88dSmrg 1115af69d88dSmrg /* The primitive type always needs to be POINTLIST for compute. */ 1116af69d88dSmrg r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, 1117af69d88dSmrg V_008958_DI_PT_POINTLIST); 1118af69d88dSmrg 11197e995a2eSmrg if (rctx->b.chip_class < CAYMAN) { 1120af69d88dSmrg 1121af69d88dSmrg /* These registers control which simds can be used by each stage. 1122af69d88dSmrg * The default for these registers is 0xffffffff, which means 1123af69d88dSmrg * all simds are available for each stage. It's possible we may 1124af69d88dSmrg * want to play around with these in the future, but for now 1125af69d88dSmrg * the default value is fine. 1126af69d88dSmrg * 1127af69d88dSmrg * R_008E20_SQ_STATIC_THREAD_MGMT1 1128af69d88dSmrg * R_008E24_SQ_STATIC_THREAD_MGMT2 1129af69d88dSmrg * R_008E28_SQ_STATIC_THREAD_MGMT3 1130af69d88dSmrg */ 1131af69d88dSmrg 11327e995a2eSmrg /* XXX: We may need to adjust the thread and stack resource 1133af69d88dSmrg * values for 3D/compute interop */ 1134af69d88dSmrg 1135af69d88dSmrg r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); 1136af69d88dSmrg 1137af69d88dSmrg /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 1138af69d88dSmrg * Set the number of threads used by the PS/VS/GS/ES stage to 1139af69d88dSmrg * 0. 
1140af69d88dSmrg */ 1141af69d88dSmrg r600_store_value(cb, 0); 1142af69d88dSmrg 1143af69d88dSmrg /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 1144af69d88dSmrg * Set the number of threads used by the CS (aka LS) stage to 1145af69d88dSmrg * the maximum number of threads and set the number of threads 1146af69d88dSmrg * for the HS stage to 0. */ 1147af69d88dSmrg r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); 1148af69d88dSmrg 1149af69d88dSmrg /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 1150af69d88dSmrg * Set the Control Flow stack entries to 0 for PS/VS stages */ 1151af69d88dSmrg r600_store_value(cb, 0); 1152af69d88dSmrg 1153af69d88dSmrg /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 1154af69d88dSmrg * Set the Control Flow stack entries to 0 for GS/ES stages */ 1155af69d88dSmrg r600_store_value(cb, 0); 1156af69d88dSmrg 1157af69d88dSmrg /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 1158af69d88dSmrg * Set the Contol Flow stack entries to 0 for the HS stage, and 1159af69d88dSmrg * set it to the maximum value for the CS (aka LS) stage. */ 1160af69d88dSmrg r600_store_value(cb, 1161af69d88dSmrg S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); 1162af69d88dSmrg } 1163af69d88dSmrg /* Give the compute shader all the available LDS space. 1164af69d88dSmrg * NOTE: This only sets the maximum number of dwords that a compute 1165af69d88dSmrg * shader can allocate. When a shader is executed, we still need to 1166af69d88dSmrg * allocate the appropriate amount of LDS dwords using the 1167af69d88dSmrg * CM_R_0288E8_SQ_LDS_ALLOC register. 
1168af69d88dSmrg */ 11697e995a2eSmrg if (rctx->b.chip_class < CAYMAN) { 1170af69d88dSmrg r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, 1171af69d88dSmrg S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192)); 1172af69d88dSmrg } else { 1173af69d88dSmrg r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT, 1174af69d88dSmrg S_0286FC_NUM_PS_LDS(0) | 1175af69d88dSmrg S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */ 1176af69d88dSmrg } 1177af69d88dSmrg 1178af69d88dSmrg /* Context Registers */ 1179af69d88dSmrg 11807e995a2eSmrg if (rctx->b.chip_class < CAYMAN) { 1181af69d88dSmrg /* workaround for hw issues with dyn gpr - must set all limits 1182af69d88dSmrg * to 240 instead of 0, 0x1e == 240 / 8 1183af69d88dSmrg */ 1184af69d88dSmrg r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, 1185af69d88dSmrg S_028838_PS_GPRS(0x1e) | 1186af69d88dSmrg S_028838_VS_GPRS(0x1e) | 1187af69d88dSmrg S_028838_GS_GPRS(0x1e) | 1188af69d88dSmrg S_028838_ES_GPRS(0x1e) | 1189af69d88dSmrg S_028838_HS_GPRS(0x1e) | 1190af69d88dSmrg S_028838_LS_GPRS(0x1e)); 1191af69d88dSmrg } 1192af69d88dSmrg 1193af69d88dSmrg /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ 1194af69d88dSmrg r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, 1195af69d88dSmrg S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); 1196af69d88dSmrg 1197af69d88dSmrg r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); 1198af69d88dSmrg 1199af69d88dSmrg r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, 12007e995a2eSmrg S_0286E8_TID_IN_GROUP_ENA(1) | 12017e995a2eSmrg S_0286E8_TGID_ENA(1) | 12027e995a2eSmrg S_0286E8_DISABLE_INDEX_PACK(1)); 1203af69d88dSmrg 1204af69d88dSmrg /* The LOOP_CONST registers are an optimizations for loops that allows 1205af69d88dSmrg * you to store the initial counter, increment value, and maximum 1206af69d88dSmrg * counter value in a register so that hardware can calculate the 1207af69d88dSmrg * correct number of iterations for the loop, 
so that you don't need 1208af69d88dSmrg * to have the loop counter in your shader code. We don't currently use 1209af69d88dSmrg * this optimization, so we must keep track of the counter in the 1210af69d88dSmrg * shader and use a break instruction to exit loops. However, the 1211af69d88dSmrg * hardware will still uses this register to determine when to exit a 1212af69d88dSmrg * loop, so we need to initialize the counter to 0, set the increment 1213af69d88dSmrg * value to 1 and the maximum counter value to the 4095 (0xfff) which 1214af69d88dSmrg * is the maximum value allowed. This gives us a maximum of 4096 1215af69d88dSmrg * iterations for our loops, but hopefully our break instruction will 1216af69d88dSmrg * execute before some time before the 4096th iteration. 1217af69d88dSmrg */ 1218af69d88dSmrg eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF); 1219af69d88dSmrg} 1220af69d88dSmrg 12217e995a2eSmrgvoid evergreen_init_compute_state_functions(struct r600_context *rctx) 1222af69d88dSmrg{ 12237e995a2eSmrg rctx->b.b.create_compute_state = evergreen_create_compute_state; 12247e995a2eSmrg rctx->b.b.delete_compute_state = evergreen_delete_compute_state; 12257e995a2eSmrg rctx->b.b.bind_compute_state = evergreen_bind_compute_state; 12267e995a2eSmrg// rctx->context.create_sampler_view = evergreen_compute_create_sampler_view; 12277e995a2eSmrg rctx->b.b.set_compute_resources = evergreen_set_compute_resources; 12287e995a2eSmrg rctx->b.b.set_global_binding = evergreen_set_global_binding; 12297e995a2eSmrg rctx->b.b.launch_grid = evergreen_launch_grid; 1230af69d88dSmrg 1231af69d88dSmrg} 1232af69d88dSmrg 12331463c08dSmrgvoid *r600_compute_global_transfer_map(struct pipe_context *ctx, 12341463c08dSmrg struct pipe_resource *resource, 12351463c08dSmrg unsigned level, 12361463c08dSmrg unsigned usage, 12371463c08dSmrg const struct pipe_box *box, 12381463c08dSmrg struct pipe_transfer **ptransfer) 1239af69d88dSmrg{ 12407e995a2eSmrg struct r600_context *rctx = 
(struct r600_context*)ctx; 1241af69d88dSmrg struct compute_memory_pool *pool = rctx->screen->global_pool; 1242af69d88dSmrg struct r600_resource_global* buffer = 1243af69d88dSmrg (struct r600_resource_global*)resource; 1244af69d88dSmrg 1245af69d88dSmrg struct compute_memory_item *item = buffer->chunk; 1246af69d88dSmrg struct pipe_resource *dst = NULL; 1247af69d88dSmrg unsigned offset = box->x; 1248af69d88dSmrg 1249af69d88dSmrg if (is_item_in_pool(item)) { 12507e995a2eSmrg compute_memory_demote_item(pool, item, ctx); 1251af69d88dSmrg } 1252af69d88dSmrg else { 1253af69d88dSmrg if (item->real_buffer == NULL) { 12547e995a2eSmrg item->real_buffer = 1255af69d88dSmrg r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4); 1256af69d88dSmrg } 1257af69d88dSmrg } 1258af69d88dSmrg 1259af69d88dSmrg dst = (struct pipe_resource*)item->real_buffer; 1260af69d88dSmrg 12611463c08dSmrg if (usage & PIPE_MAP_READ) 1262af69d88dSmrg buffer->chunk->status |= ITEM_MAPPED_FOR_READING; 1263af69d88dSmrg 1264af69d88dSmrg COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n" 1265af69d88dSmrg "level = %u, usage = %u, box(x = %u, y = %u, z = %u " 1266af69d88dSmrg "width = %u, height = %u, depth = %u)\n", level, usage, 1267af69d88dSmrg box->x, box->y, box->z, box->width, box->height, 1268af69d88dSmrg box->depth); 1269af69d88dSmrg COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = " 1270af69d88dSmrg "%u (box.x)\n", item->id, box->x); 1271af69d88dSmrg 1272af69d88dSmrg 1273af69d88dSmrg assert(resource->target == PIPE_BUFFER); 1274af69d88dSmrg assert(resource->bind & PIPE_BIND_GLOBAL); 1275af69d88dSmrg assert(box->x >= 0); 1276af69d88dSmrg assert(box->y == 0); 1277af69d88dSmrg assert(box->z == 0); 1278af69d88dSmrg 1279af69d88dSmrg ///TODO: do it better, mapping is not possible if the pool is too big 12807e995a2eSmrg return pipe_buffer_map_range(ctx, dst, 1281af69d88dSmrg offset, box->width, usage, ptransfer); 1282af69d88dSmrg} 1283af69d88dSmrg 12841463c08dSmrgvoid 
r600_compute_global_transfer_unmap(struct pipe_context *ctx, 12851463c08dSmrg struct pipe_transfer *transfer) 1286af69d88dSmrg{ 1287af69d88dSmrg /* struct r600_resource_global are not real resources, they just map 1288af69d88dSmrg * to an offset within the compute memory pool. The function 1289af69d88dSmrg * r600_compute_global_transfer_map() maps the memory pool 1290af69d88dSmrg * resource rather than the struct r600_resource_global passed to 12911463c08dSmrg * it as an argument and then initializes ptransfer->resource with 1292af69d88dSmrg * the memory pool resource (via pipe_buffer_map_range). 1293af69d88dSmrg * When transfer_unmap is called it uses the memory pool's 1294af69d88dSmrg * vtable which calls r600_buffer_transfer_map() rather than 1295af69d88dSmrg * this function. 1296af69d88dSmrg */ 1297af69d88dSmrg assert (!"This function should not be called"); 1298af69d88dSmrg} 1299af69d88dSmrg 13001463c08dSmrgvoid r600_compute_global_buffer_destroy(struct pipe_screen *screen, 13011463c08dSmrg struct pipe_resource *res) 1302af69d88dSmrg{ 13037e995a2eSmrg struct r600_resource_global* buffer = NULL; 13047e995a2eSmrg struct r600_screen* rscreen = NULL; 13057e995a2eSmrg 13067e995a2eSmrg assert(res->target == PIPE_BUFFER); 13077e995a2eSmrg assert(res->bind & PIPE_BIND_GLOBAL); 13087e995a2eSmrg 13097e995a2eSmrg buffer = (struct r600_resource_global*)res; 13107e995a2eSmrg rscreen = (struct r600_screen*)screen; 13117e995a2eSmrg 13127e995a2eSmrg compute_memory_free(rscreen->global_pool, buffer->chunk->id); 13137e995a2eSmrg 13147e995a2eSmrg buffer->chunk = NULL; 13157e995a2eSmrg free(res); 13167e995a2eSmrg} 13177e995a2eSmrg 13187e995a2eSmrgstruct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, 13197e995a2eSmrg const struct pipe_resource *templ) 13207e995a2eSmrg{ 13217e995a2eSmrg struct r600_resource_global* result = NULL; 13227e995a2eSmrg struct r600_screen* rscreen = NULL; 13237e995a2eSmrg int size_in_dw = 0; 13247e995a2eSmrg 13257e995a2eSmrg 
assert(templ->target == PIPE_BUFFER); 13267e995a2eSmrg assert(templ->bind & PIPE_BIND_GLOBAL); 13277e995a2eSmrg assert(templ->array_size == 1 || templ->array_size == 0); 13287e995a2eSmrg assert(templ->depth0 == 1 || templ->depth0 == 0); 13297e995a2eSmrg assert(templ->height0 == 1 || templ->height0 == 0); 13307e995a2eSmrg 13317e995a2eSmrg result = (struct r600_resource_global*) 13327e995a2eSmrg CALLOC(sizeof(struct r600_resource_global), 1); 13337e995a2eSmrg rscreen = (struct r600_screen*)screen; 13347e995a2eSmrg 13357e995a2eSmrg COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n"); 13367e995a2eSmrg COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0, 13377e995a2eSmrg templ->array_size); 13387e995a2eSmrg 13397e995a2eSmrg result->base.b.b = *templ; 13407e995a2eSmrg result->base.b.b.screen = screen; 13411463c08dSmrg result->base.compute_global_bo = true; 13427e995a2eSmrg pipe_reference_init(&result->base.b.b.reference, 1); 13437e995a2eSmrg 13447e995a2eSmrg size_in_dw = (templ->width0+3) / 4; 13457e995a2eSmrg 13467e995a2eSmrg result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw); 13477e995a2eSmrg 13487e995a2eSmrg if (result->chunk == NULL) 13497e995a2eSmrg { 13507e995a2eSmrg free(result); 13517e995a2eSmrg return NULL; 13527e995a2eSmrg } 13537e995a2eSmrg 13547e995a2eSmrg return &result->base.b.b; 1355af69d88dSmrg} 1356