/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources, that VTX2...
would consume 65848b8605Smrg 66848b8605SmrgCONST0 and VTX0 is for parameters 67848b8605Smrg CONST0 is binding smaller input parameter buffer, and for constant indexing, 68848b8605Smrg also constant cached 69848b8605Smrg VTX0 is for indirect/non-constant indexing, or if the input is bigger than 70848b8605Smrg the constant cache can handle 71848b8605Smrg 72848b8605SmrgRAT-s are limited to 12, so we can only bind at most 11 texture for writing 73848b8605Smrgbecause we reserve RAT0 for global bindings. With byteaddressing enabled, 74848b8605Smrgwe should reserve another one too.=> 10 image binding for writing max. 75848b8605Smrg 76848b8605Smrgfrom Nvidia OpenCL: 77848b8605Smrg CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 78848b8605Smrg CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 79848b8605Smrg 80848b8605Smrgso 10 for writing is enough. 176 is the max for reading according to the docs 81848b8605Smrg 82848b8605Smrgwritable images should be listed first < 10, so their id corresponds to RAT(id+1) 83848b8605Smrgwritable images will consume TEX slots, VTX slots too because of linear indexing 84848b8605Smrg 85848b8605Smrg*/ 86848b8605Smrg 87b8e80941Smrgstruct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen, 88b8e80941Smrg unsigned size) 89848b8605Smrg{ 90b8e80941Smrg struct pipe_resource *buffer = NULL; 91848b8605Smrg assert(size); 92848b8605Smrg 93b8e80941Smrg buffer = pipe_buffer_create((struct pipe_screen*) screen, 94b8e80941Smrg 0, PIPE_USAGE_IMMUTABLE, size); 95848b8605Smrg 96848b8605Smrg return (struct r600_resource *)buffer; 97848b8605Smrg} 98848b8605Smrg 99848b8605Smrg 100b8e80941Smrgstatic void evergreen_set_rat(struct r600_pipe_compute *pipe, 101b8e80941Smrg unsigned id, 102b8e80941Smrg struct r600_resource *bo, 103b8e80941Smrg int start, 104b8e80941Smrg int size) 105848b8605Smrg{ 106848b8605Smrg struct pipe_surface rat_templ; 107848b8605Smrg struct r600_surface *surf = NULL; 108848b8605Smrg struct r600_context *rctx = NULL; 109848b8605Smrg 110848b8605Smrg 
assert(id < 12); 111848b8605Smrg assert((size & 3) == 0); 112848b8605Smrg assert((start & 0xFF) == 0); 113848b8605Smrg 114848b8605Smrg rctx = pipe->ctx; 115848b8605Smrg 116848b8605Smrg COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id); 117848b8605Smrg 118848b8605Smrg /* Create the RAT surface */ 119848b8605Smrg memset(&rat_templ, 0, sizeof(rat_templ)); 120848b8605Smrg rat_templ.format = PIPE_FORMAT_R32_UINT; 121848b8605Smrg rat_templ.u.tex.level = 0; 122848b8605Smrg rat_templ.u.tex.first_layer = 0; 123848b8605Smrg rat_templ.u.tex.last_layer = 0; 124848b8605Smrg 125b8e80941Smrg /* Add the RAT the list of color buffers. Drop the old buffer first. */ 126b8e80941Smrg pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL); 127848b8605Smrg pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface( 128848b8605Smrg (struct pipe_context *)pipe->ctx, 129848b8605Smrg (struct pipe_resource *)bo, &rat_templ); 130848b8605Smrg 131848b8605Smrg /* Update the number of color buffers */ 132848b8605Smrg pipe->ctx->framebuffer.state.nr_cbufs = 133848b8605Smrg MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs); 134848b8605Smrg 135848b8605Smrg /* Update the cb_target_mask 136848b8605Smrg * XXX: I think this is a potential spot for bugs once we start doing 137848b8605Smrg * GL interop. cb_target_mask may be modified in the 3D sections 138848b8605Smrg * of this driver. 
*/ 139848b8605Smrg pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4)); 140848b8605Smrg 141848b8605Smrg surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id]; 142848b8605Smrg evergreen_init_color_surface_rat(rctx, surf); 143848b8605Smrg} 144848b8605Smrg 145b8e80941Smrgstatic void evergreen_cs_set_vertex_buffer(struct r600_context *rctx, 146b8e80941Smrg unsigned vb_index, 147b8e80941Smrg unsigned offset, 148b8e80941Smrg struct pipe_resource *buffer) 149848b8605Smrg{ 150848b8605Smrg struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state; 151848b8605Smrg struct pipe_vertex_buffer *vb = &state->vb[vb_index]; 152848b8605Smrg vb->stride = 1; 153848b8605Smrg vb->buffer_offset = offset; 154b8e80941Smrg vb->buffer.resource = buffer; 155b8e80941Smrg vb->is_user_buffer = false; 156848b8605Smrg 157848b8605Smrg /* The vertex instructions in the compute shaders use the texture cache, 158848b8605Smrg * so we need to invalidate it. */ 159848b8605Smrg rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE; 160848b8605Smrg state->enabled_mask |= 1 << vb_index; 161848b8605Smrg state->dirty_mask |= 1 << vb_index; 162b8e80941Smrg r600_mark_atom_dirty(rctx, &state->atom); 163848b8605Smrg} 164848b8605Smrg 165b8e80941Smrgstatic void evergreen_cs_set_constant_buffer(struct r600_context *rctx, 166b8e80941Smrg unsigned cb_index, 167b8e80941Smrg unsigned offset, 168b8e80941Smrg unsigned size, 169b8e80941Smrg struct pipe_resource *buffer) 170848b8605Smrg{ 171848b8605Smrg struct pipe_constant_buffer cb; 172848b8605Smrg cb.buffer_size = size; 173848b8605Smrg cb.buffer_offset = offset; 174848b8605Smrg cb.buffer = buffer; 175848b8605Smrg cb.user_buffer = NULL; 176848b8605Smrg 177848b8605Smrg rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb); 178848b8605Smrg} 179848b8605Smrg 180b8e80941Smrg/* We need to define these R600 registers here, because we can't include 181b8e80941Smrg * evergreend.h and r600d.h. 
182b8e80941Smrg */ 183b8e80941Smrg#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 184b8e80941Smrg#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 185b8e80941Smrg 186b8e80941Smrg#ifdef HAVE_OPENCL 187b8e80941Smrgstatic void parse_symbol_table(Elf_Data *symbol_table_data, 188b8e80941Smrg const GElf_Shdr *symbol_table_header, 189b8e80941Smrg struct ac_shader_binary *binary) 190848b8605Smrg{ 191b8e80941Smrg GElf_Sym symbol; 192b8e80941Smrg unsigned i = 0; 193b8e80941Smrg unsigned symbol_count = 194b8e80941Smrg symbol_table_header->sh_size / symbol_table_header->sh_entsize; 195b8e80941Smrg 196b8e80941Smrg /* We are over allocating this list, because symbol_count gives the 197b8e80941Smrg * total number of symbols, and we will only be filling the list 198b8e80941Smrg * with offsets of global symbols. The memory savings from 199b8e80941Smrg * allocating the correct size of this list will be small, and 200b8e80941Smrg * I don't think it is worth the cost of pre-computing the number 201b8e80941Smrg * of global symbols. 202b8e80941Smrg */ 203b8e80941Smrg binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t)); 204b8e80941Smrg 205b8e80941Smrg while (gelf_getsym(symbol_table_data, i++, &symbol)) { 206b8e80941Smrg unsigned i; 207b8e80941Smrg if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL || 208b8e80941Smrg symbol.st_shndx == 0 /* Undefined symbol */) { 209b8e80941Smrg continue; 210b8e80941Smrg } 211848b8605Smrg 212b8e80941Smrg binary->global_symbol_offsets[binary->global_symbol_count] = 213b8e80941Smrg symbol.st_value; 214848b8605Smrg 215b8e80941Smrg /* Sort the list using bubble sort. This list will usually 216b8e80941Smrg * be small. 
*/ 217b8e80941Smrg for (i = binary->global_symbol_count; i > 0; --i) { 218b8e80941Smrg uint64_t lhs = binary->global_symbol_offsets[i - 1]; 219b8e80941Smrg uint64_t rhs = binary->global_symbol_offsets[i]; 220b8e80941Smrg if (lhs < rhs) { 221b8e80941Smrg break; 222b8e80941Smrg } 223b8e80941Smrg binary->global_symbol_offsets[i] = lhs; 224b8e80941Smrg binary->global_symbol_offsets[i - 1] = rhs; 225b8e80941Smrg } 226b8e80941Smrg ++binary->global_symbol_count; 227b8e80941Smrg } 228b8e80941Smrg} 229b8e80941Smrg 230b8e80941Smrg 231b8e80941Smrgstatic void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, 232b8e80941Smrg unsigned symbol_sh_link, 233b8e80941Smrg struct ac_shader_binary *binary) 234848b8605Smrg{ 235b8e80941Smrg unsigned i; 236848b8605Smrg 237b8e80941Smrg if (!relocs || !symbols || !binary->reloc_count) { 238b8e80941Smrg return; 239b8e80941Smrg } 240b8e80941Smrg binary->relocs = CALLOC(binary->reloc_count, 241b8e80941Smrg sizeof(struct ac_shader_reloc)); 242b8e80941Smrg for (i = 0; i < binary->reloc_count; i++) { 243b8e80941Smrg GElf_Sym symbol; 244b8e80941Smrg GElf_Rel rel; 245b8e80941Smrg char *symbol_name; 246b8e80941Smrg struct ac_shader_reloc *reloc = &binary->relocs[i]; 247b8e80941Smrg 248b8e80941Smrg gelf_getrel(relocs, i, &rel); 249b8e80941Smrg gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol); 250b8e80941Smrg symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name); 251b8e80941Smrg 252b8e80941Smrg reloc->offset = rel.r_offset; 253b8e80941Smrg strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1); 254b8e80941Smrg reloc->name[sizeof(reloc->name)-1] = 0; 255b8e80941Smrg } 256b8e80941Smrg} 257b8e80941Smrg 258b8e80941Smrgstatic void r600_elf_read(const char *elf_data, unsigned elf_size, 259b8e80941Smrg struct ac_shader_binary *binary) 260b8e80941Smrg{ 261b8e80941Smrg char *elf_buffer; 262b8e80941Smrg Elf *elf; 263b8e80941Smrg Elf_Scn *section = NULL; 264b8e80941Smrg Elf_Data *symbols = NULL, *relocs = NULL; 265b8e80941Smrg size_t 
section_str_index; 266b8e80941Smrg unsigned symbol_sh_link = 0; 267b8e80941Smrg 268b8e80941Smrg /* One of the libelf implementations 269b8e80941Smrg * (http://www.mr511.de/software/english.htm) requires calling 270b8e80941Smrg * elf_version() before elf_memory(). 271b8e80941Smrg */ 272b8e80941Smrg elf_version(EV_CURRENT); 273b8e80941Smrg elf_buffer = MALLOC(elf_size); 274b8e80941Smrg memcpy(elf_buffer, elf_data, elf_size); 275b8e80941Smrg 276b8e80941Smrg elf = elf_memory(elf_buffer, elf_size); 277b8e80941Smrg 278b8e80941Smrg elf_getshdrstrndx(elf, §ion_str_index); 279b8e80941Smrg 280b8e80941Smrg while ((section = elf_nextscn(elf, section))) { 281b8e80941Smrg const char *name; 282b8e80941Smrg Elf_Data *section_data = NULL; 283b8e80941Smrg GElf_Shdr section_header; 284b8e80941Smrg if (gelf_getshdr(section, §ion_header) != §ion_header) { 285b8e80941Smrg fprintf(stderr, "Failed to read ELF section header\n"); 286b8e80941Smrg return; 287b8e80941Smrg } 288b8e80941Smrg name = elf_strptr(elf, section_str_index, section_header.sh_name); 289b8e80941Smrg if (!strcmp(name, ".text")) { 290b8e80941Smrg section_data = elf_getdata(section, section_data); 291b8e80941Smrg binary->code_size = section_data->d_size; 292b8e80941Smrg binary->code = MALLOC(binary->code_size * sizeof(unsigned char)); 293b8e80941Smrg memcpy(binary->code, section_data->d_buf, binary->code_size); 294b8e80941Smrg } else if (!strcmp(name, ".AMDGPU.config")) { 295b8e80941Smrg section_data = elf_getdata(section, section_data); 296b8e80941Smrg binary->config_size = section_data->d_size; 297b8e80941Smrg binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); 298b8e80941Smrg memcpy(binary->config, section_data->d_buf, binary->config_size); 299b8e80941Smrg } else if (!strcmp(name, ".AMDGPU.disasm")) { 300b8e80941Smrg /* Always read disassembly if it's available. 
*/ 301b8e80941Smrg section_data = elf_getdata(section, section_data); 302b8e80941Smrg binary->disasm_string = strndup(section_data->d_buf, 303b8e80941Smrg section_data->d_size); 304b8e80941Smrg } else if (!strncmp(name, ".rodata", 7)) { 305b8e80941Smrg section_data = elf_getdata(section, section_data); 306b8e80941Smrg binary->rodata_size = section_data->d_size; 307b8e80941Smrg binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char)); 308b8e80941Smrg memcpy(binary->rodata, section_data->d_buf, binary->rodata_size); 309b8e80941Smrg } else if (!strncmp(name, ".symtab", 7)) { 310b8e80941Smrg symbols = elf_getdata(section, section_data); 311b8e80941Smrg symbol_sh_link = section_header.sh_link; 312b8e80941Smrg parse_symbol_table(symbols, §ion_header, binary); 313b8e80941Smrg } else if (!strcmp(name, ".rel.text")) { 314b8e80941Smrg relocs = elf_getdata(section, section_data); 315b8e80941Smrg binary->reloc_count = section_header.sh_size / 316b8e80941Smrg section_header.sh_entsize; 317b8e80941Smrg } 318b8e80941Smrg } 319b8e80941Smrg 320b8e80941Smrg parse_relocs(elf, relocs, symbols, symbol_sh_link, binary); 321b8e80941Smrg 322b8e80941Smrg if (elf){ 323b8e80941Smrg elf_end(elf); 324b8e80941Smrg } 325b8e80941Smrg FREE(elf_buffer); 326b8e80941Smrg 327b8e80941Smrg /* Cache the config size per symbol */ 328b8e80941Smrg if (binary->global_symbol_count) { 329b8e80941Smrg binary->config_size_per_symbol = 330b8e80941Smrg binary->config_size / binary->global_symbol_count; 331b8e80941Smrg } else { 332b8e80941Smrg binary->global_symbol_count = 1; 333b8e80941Smrg binary->config_size_per_symbol = binary->config_size; 334b8e80941Smrg } 335b8e80941Smrg} 336b8e80941Smrg 337b8e80941Smrgstatic const unsigned char *r600_shader_binary_config_start( 338b8e80941Smrg const struct ac_shader_binary *binary, 339b8e80941Smrg uint64_t symbol_offset) 340b8e80941Smrg{ 341848b8605Smrg unsigned i; 342b8e80941Smrg for (i = 0; i < binary->global_symbol_count; ++i) { 343b8e80941Smrg if 
(binary->global_symbol_offsets[i] == symbol_offset) { 344b8e80941Smrg unsigned offset = i * binary->config_size_per_symbol; 345b8e80941Smrg return binary->config + offset; 346b8e80941Smrg } 347b8e80941Smrg } 348b8e80941Smrg return binary->config; 349b8e80941Smrg} 350848b8605Smrg 351b8e80941Smrgstatic void r600_shader_binary_read_config(const struct ac_shader_binary *binary, 352b8e80941Smrg struct r600_bytecode *bc, 353b8e80941Smrg uint64_t symbol_offset, 354b8e80941Smrg boolean *use_kill) 355b8e80941Smrg{ 356b8e80941Smrg unsigned i; 357b8e80941Smrg const unsigned char *config = 358b8e80941Smrg r600_shader_binary_config_start(binary, symbol_offset); 359b8e80941Smrg 360b8e80941Smrg for (i = 0; i < binary->config_size_per_symbol; i+= 8) { 361b8e80941Smrg unsigned reg = 362b8e80941Smrg util_le32_to_cpu(*(uint32_t*)(config + i)); 363b8e80941Smrg unsigned value = 364b8e80941Smrg util_le32_to_cpu(*(uint32_t*)(config + i + 4)); 365b8e80941Smrg switch (reg) { 366b8e80941Smrg /* R600 / R700 */ 367b8e80941Smrg case R_028850_SQ_PGM_RESOURCES_PS: 368b8e80941Smrg case R_028868_SQ_PGM_RESOURCES_VS: 369b8e80941Smrg /* Evergreen / Northern Islands */ 370b8e80941Smrg case R_028844_SQ_PGM_RESOURCES_PS: 371b8e80941Smrg case R_028860_SQ_PGM_RESOURCES_VS: 372b8e80941Smrg case R_0288D4_SQ_PGM_RESOURCES_LS: 373b8e80941Smrg bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value)); 374b8e80941Smrg bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value)); 375b8e80941Smrg break; 376b8e80941Smrg case R_02880C_DB_SHADER_CONTROL: 377b8e80941Smrg *use_kill = G_02880C_KILL_ENABLE(value); 378b8e80941Smrg break; 379b8e80941Smrg case R_0288E8_SQ_LDS_ALLOC: 380b8e80941Smrg bc->nlds_dw = value; 381b8e80941Smrg break; 382b8e80941Smrg } 383b8e80941Smrg } 384b8e80941Smrg} 385848b8605Smrg 386b8e80941Smrgstatic unsigned r600_create_shader(struct r600_bytecode *bc, 387b8e80941Smrg const struct ac_shader_binary *binary, 388b8e80941Smrg boolean *use_kill) 389b8e80941Smrg 390b8e80941Smrg{ 391b8e80941Smrg 
assert(binary->code_size % 4 == 0); 392b8e80941Smrg bc->bytecode = CALLOC(1, binary->code_size); 393b8e80941Smrg memcpy(bc->bytecode, binary->code, binary->code_size); 394b8e80941Smrg bc->ndw = binary->code_size / 4; 395b8e80941Smrg 396b8e80941Smrg r600_shader_binary_read_config(binary, bc, 0, use_kill); 397b8e80941Smrg return 0; 398b8e80941Smrg} 399848b8605Smrg 400848b8605Smrg#endif 401848b8605Smrg 402b8e80941Smrgstatic void r600_destroy_shader(struct r600_bytecode *bc) 403b8e80941Smrg{ 404b8e80941Smrg FREE(bc->bytecode); 405b8e80941Smrg} 406b8e80941Smrg 407b8e80941Smrgstatic void *evergreen_create_compute_state(struct pipe_context *ctx, 408b8e80941Smrg const struct pipe_compute_state *cso) 409b8e80941Smrg{ 410b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 411b8e80941Smrg struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute); 412b8e80941Smrg#ifdef HAVE_OPENCL 413b8e80941Smrg const struct pipe_llvm_program_header *header; 414b8e80941Smrg const char *code; 415b8e80941Smrg void *p; 416b8e80941Smrg boolean use_kill; 417b8e80941Smrg#endif 418b8e80941Smrg 419b8e80941Smrg shader->ctx = rctx; 420848b8605Smrg shader->local_size = cso->req_local_mem; 421848b8605Smrg shader->private_size = cso->req_private_mem; 422848b8605Smrg shader->input_size = cso->req_input_mem; 423848b8605Smrg 424b8e80941Smrg shader->ir_type = cso->ir_type; 425848b8605Smrg 426b8e80941Smrg if (shader->ir_type == PIPE_SHADER_IR_TGSI) { 427b8e80941Smrg shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, PIPE_SHADER_COMPUTE); 428b8e80941Smrg return shader; 429848b8605Smrg } 430b8e80941Smrg#ifdef HAVE_OPENCL 431b8e80941Smrg COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n"); 432b8e80941Smrg header = cso->prog; 433b8e80941Smrg code = cso->prog + sizeof(struct pipe_llvm_program_header); 434b8e80941Smrg radeon_shader_binary_init(&shader->binary); 435b8e80941Smrg r600_elf_read(code, header->num_bytes, &shader->binary); 436b8e80941Smrg 
r600_create_shader(&shader->bc, &shader->binary, &use_kill); 437b8e80941Smrg 438b8e80941Smrg /* Upload code + ROdata */ 439b8e80941Smrg shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen, 440b8e80941Smrg shader->bc.ndw * 4); 441b8e80941Smrg p = r600_buffer_map_sync_with_rings( 442b8e80941Smrg &rctx->b, shader->code_bo, 443b8e80941Smrg PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); 444b8e80941Smrg //TODO: use util_memcpy_cpu_to_le32 ? 445b8e80941Smrg memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4); 446b8e80941Smrg rctx->b.ws->buffer_unmap(shader->code_bo->buf); 447848b8605Smrg#endif 448b8e80941Smrg 449848b8605Smrg return shader; 450848b8605Smrg} 451848b8605Smrg 452b8e80941Smrgstatic void evergreen_delete_compute_state(struct pipe_context *ctx, void *state) 453848b8605Smrg{ 454b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 455b8e80941Smrg struct r600_pipe_compute *shader = state; 456b8e80941Smrg 457b8e80941Smrg COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n"); 458848b8605Smrg 459848b8605Smrg if (!shader) 460848b8605Smrg return; 461848b8605Smrg 462b8e80941Smrg if (shader->ir_type == PIPE_SHADER_IR_TGSI) { 463b8e80941Smrg r600_delete_shader_selector(ctx, shader->sel); 464b8e80941Smrg } else { 465848b8605Smrg#ifdef HAVE_OPENCL 466b8e80941Smrg radeon_shader_binary_clean(&shader->binary); 467b8e80941Smrg pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL); 468b8e80941Smrg pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL); 469848b8605Smrg#endif 470b8e80941Smrg r600_destroy_shader(&shader->bc); 471b8e80941Smrg } 472848b8605Smrg FREE(shader); 473848b8605Smrg} 474848b8605Smrg 475b8e80941Smrgstatic void evergreen_bind_compute_state(struct pipe_context *ctx, void *state) 476848b8605Smrg{ 477b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 478b8e80941Smrg struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state; 479b8e80941Smrg 
COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n"); 480b8e80941Smrg 481b8e80941Smrg if (!state) { 482b8e80941Smrg rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 483b8e80941Smrg return; 484b8e80941Smrg } 485b8e80941Smrg 486b8e80941Smrg if (cstate->ir_type == PIPE_SHADER_IR_TGSI) { 487b8e80941Smrg bool compute_dirty; 488848b8605Smrg 489b8e80941Smrg r600_shader_select(ctx, cstate->sel, &compute_dirty); 490b8e80941Smrg } 491848b8605Smrg 492b8e80941Smrg rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state; 493848b8605Smrg} 494848b8605Smrg 495848b8605Smrg/* The kernel parameters are stored a vtx buffer (ID=0), besides the explicit 496848b8605Smrg * kernel parameters there are implicit parameters that need to be stored 497848b8605Smrg * in the vertex buffer as well. Here is how these parameters are organized in 498848b8605Smrg * the buffer: 499848b8605Smrg * 500848b8605Smrg * DWORDS 0-2: Number of work groups in each dimension (x,y,z) 501848b8605Smrg * DWORDS 3-5: Number of global work items in each dimension (x,y,z) 502848b8605Smrg * DWORDS 6-8: Number of work items within each work group in each dimension 503848b8605Smrg * (x,y,z) 504848b8605Smrg * DWORDS 9+ : Kernel parameters 505848b8605Smrg */ 506b8e80941Smrgstatic void evergreen_compute_upload_input(struct pipe_context *ctx, 507b8e80941Smrg const struct pipe_grid_info *info) 508848b8605Smrg{ 509b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 510b8e80941Smrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 511848b8605Smrg unsigned i; 512848b8605Smrg /* We need to reserve 9 dwords (36 bytes) for implicit kernel 513848b8605Smrg * parameters. 
514848b8605Smrg */ 515b8e80941Smrg unsigned input_size; 516b8e80941Smrg uint32_t *num_work_groups_start; 517b8e80941Smrg uint32_t *global_size_start; 518b8e80941Smrg uint32_t *local_size_start; 519b8e80941Smrg uint32_t *kernel_parameters_start; 520848b8605Smrg struct pipe_box box; 521848b8605Smrg struct pipe_transfer *transfer = NULL; 522848b8605Smrg 523b8e80941Smrg if (!shader) 524b8e80941Smrg return; 525848b8605Smrg if (shader->input_size == 0) { 526848b8605Smrg return; 527848b8605Smrg } 528b8e80941Smrg input_size = shader->input_size + 36; 529848b8605Smrg if (!shader->kernel_param) { 530848b8605Smrg /* Add space for the grid dimensions */ 531848b8605Smrg shader->kernel_param = (struct r600_resource *) 532b8e80941Smrg pipe_buffer_create(ctx->screen, 0, 533848b8605Smrg PIPE_USAGE_IMMUTABLE, input_size); 534848b8605Smrg } 535848b8605Smrg 536848b8605Smrg u_box_1d(0, input_size, &box); 537b8e80941Smrg num_work_groups_start = ctx->transfer_map(ctx, 538848b8605Smrg (struct pipe_resource*)shader->kernel_param, 539848b8605Smrg 0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE, 540848b8605Smrg &box, &transfer); 541848b8605Smrg global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4)); 542848b8605Smrg local_size_start = global_size_start + (3 * (sizeof(uint)) / 4); 543848b8605Smrg kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4); 544848b8605Smrg 545848b8605Smrg /* Copy the work group size */ 546b8e80941Smrg memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint)); 547848b8605Smrg 548848b8605Smrg /* Copy the global size */ 549848b8605Smrg for (i = 0; i < 3; i++) { 550b8e80941Smrg global_size_start[i] = info->grid[i] * info->block[i]; 551848b8605Smrg } 552848b8605Smrg 553848b8605Smrg /* Copy the local dimensions */ 554b8e80941Smrg memcpy(local_size_start, info->block, 3 * sizeof(uint)); 555848b8605Smrg 556848b8605Smrg /* Copy the kernel inputs */ 557b8e80941Smrg memcpy(kernel_parameters_start, info->input, shader->input_size); 
558848b8605Smrg 559848b8605Smrg for (i = 0; i < (input_size / 4); i++) { 560b8e80941Smrg COMPUTE_DBG(rctx->screen, "input %i : %u\n", i, 561848b8605Smrg ((unsigned*)num_work_groups_start)[i]); 562848b8605Smrg } 563848b8605Smrg 564b8e80941Smrg ctx->transfer_unmap(ctx, transfer); 565848b8605Smrg 566b8e80941Smrg /* ID=0 and ID=3 are reserved for the parameters. 567b8e80941Smrg * LLVM will preferably use ID=0, but it does not work for dynamic 568b8e80941Smrg * indices. */ 569b8e80941Smrg evergreen_cs_set_vertex_buffer(rctx, 3, 0, 570b8e80941Smrg (struct pipe_resource*)shader->kernel_param); 571b8e80941Smrg evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size, 572848b8605Smrg (struct pipe_resource*)shader->kernel_param); 573848b8605Smrg} 574848b8605Smrg 575b8e80941Smrgstatic void evergreen_emit_dispatch(struct r600_context *rctx, 576b8e80941Smrg const struct pipe_grid_info *info, 577b8e80941Smrg uint32_t indirect_grid[3]) 578848b8605Smrg{ 579848b8605Smrg int i; 580b8e80941Smrg struct radeon_cmdbuf *cs = rctx->b.gfx.cs; 581848b8605Smrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 582b8e80941Smrg bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; 583848b8605Smrg unsigned num_waves; 584b8e80941Smrg unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes; 585848b8605Smrg unsigned wave_divisor = (16 * num_pipes); 586848b8605Smrg int group_size = 1; 587848b8605Smrg int grid_size = 1; 588b8e80941Smrg unsigned lds_size = shader->local_size / 4; 589b8e80941Smrg 590b8e80941Smrg if (shader->ir_type != PIPE_SHADER_IR_TGSI) 591b8e80941Smrg lds_size += shader->bc.nlds_dw; 592848b8605Smrg 593848b8605Smrg /* Calculate group_size/grid_size */ 594848b8605Smrg for (i = 0; i < 3; i++) { 595b8e80941Smrg group_size *= info->block[i]; 596848b8605Smrg } 597848b8605Smrg 598848b8605Smrg for (i = 0; i < 3; i++) { 599b8e80941Smrg grid_size *= info->grid[i]; 600848b8605Smrg } 601848b8605Smrg 602848b8605Smrg /* num_waves = ceil((tg_size.x * 
tg_size.y, tg_size.z) / (16 * num_pipes)) */ 603b8e80941Smrg num_waves = (info->block[0] * info->block[1] * info->block[2] + 604848b8605Smrg wave_divisor - 1) / wave_divisor; 605848b8605Smrg 606848b8605Smrg COMPUTE_DBG(rctx->screen, "Using %u pipes, " 607848b8605Smrg "%u wavefronts per thread block, " 608848b8605Smrg "allocating %u dwords lds.\n", 609848b8605Smrg num_pipes, num_waves, lds_size); 610848b8605Smrg 611b8e80941Smrg radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); 612848b8605Smrg 613b8e80941Smrg radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3); 614848b8605Smrg radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */ 615848b8605Smrg radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */ 616848b8605Smrg radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */ 617848b8605Smrg 618b8e80941Smrg radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE, 619848b8605Smrg group_size); 620848b8605Smrg 621b8e80941Smrg radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3); 622b8e80941Smrg radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */ 623b8e80941Smrg radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ 624b8e80941Smrg radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ 625848b8605Smrg 626848b8605Smrg if (rctx->b.chip_class < CAYMAN) { 627848b8605Smrg assert(lds_size <= 8192); 628848b8605Smrg } else { 629848b8605Smrg /* Cayman appears to have a slightly smaller limit, see the 630848b8605Smrg * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ 631848b8605Smrg assert(lds_size <= 8160); 632848b8605Smrg } 633848b8605Smrg 634b8e80941Smrg radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, 635848b8605Smrg lds_size | (num_waves << 14)); 636848b8605Smrg 637b8e80941Smrg if (info->indirect) { 638b8e80941Smrg radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit)); 639b8e80941Smrg radeon_emit(cs, indirect_grid[0]); 640b8e80941Smrg 
radeon_emit(cs, indirect_grid[1]); 641b8e80941Smrg radeon_emit(cs, indirect_grid[2]); 642b8e80941Smrg radeon_emit(cs, 1); 643b8e80941Smrg } else { 644b8e80941Smrg /* Dispatch packet */ 645b8e80941Smrg radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit)); 646b8e80941Smrg radeon_emit(cs, info->grid[0]); 647b8e80941Smrg radeon_emit(cs, info->grid[1]); 648b8e80941Smrg radeon_emit(cs, info->grid[2]); 649b8e80941Smrg /* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */ 650b8e80941Smrg radeon_emit(cs, 1); 651b8e80941Smrg } 652b8e80941Smrg 653b8e80941Smrg if (rctx->is_debug) 654b8e80941Smrg eg_trace_emit(rctx); 655848b8605Smrg} 656848b8605Smrg 657b8e80941Smrgstatic void compute_setup_cbs(struct r600_context *rctx) 658848b8605Smrg{ 659b8e80941Smrg struct radeon_cmdbuf *cs = rctx->b.gfx.cs; 660848b8605Smrg unsigned i; 661848b8605Smrg 662848b8605Smrg /* Emit colorbuffers. */ 663848b8605Smrg /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ 664b8e80941Smrg for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) { 665b8e80941Smrg struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i]; 666b8e80941Smrg unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 667848b8605Smrg (struct r600_resource*)cb->base.texture, 668848b8605Smrg RADEON_USAGE_READWRITE, 669b8e80941Smrg RADEON_PRIO_SHADER_RW_BUFFER); 670848b8605Smrg 671b8e80941Smrg radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7); 672848b8605Smrg radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ 673848b8605Smrg radeon_emit(cs, cb->cb_color_pitch); /* R_028C64_CB_COLOR0_PITCH */ 674848b8605Smrg radeon_emit(cs, cb->cb_color_slice); /* R_028C68_CB_COLOR0_SLICE */ 675848b8605Smrg radeon_emit(cs, cb->cb_color_view); /* R_028C6C_CB_COLOR0_VIEW */ 676848b8605Smrg radeon_emit(cs, cb->cb_color_info); /* R_028C70_CB_COLOR0_INFO */ 677848b8605Smrg radeon_emit(cs, cb->cb_color_attrib); /* 
R_028C74_CB_COLOR0_ATTRIB */ 678848b8605Smrg radeon_emit(cs, cb->cb_color_dim); /* R_028C78_CB_COLOR0_DIM */ 679848b8605Smrg 680848b8605Smrg radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */ 681848b8605Smrg radeon_emit(cs, reloc); 682848b8605Smrg 683848b8605Smrg radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */ 684848b8605Smrg radeon_emit(cs, reloc); 685848b8605Smrg } 686b8e80941Smrg for (; i < 8 ; i++) 687b8e80941Smrg radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 688b8e80941Smrg S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 689b8e80941Smrg for (; i < 12; i++) 690b8e80941Smrg radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C, 691b8e80941Smrg S_028C70_FORMAT(V_028C70_COLOR_INVALID)); 692b8e80941Smrg 693b8e80941Smrg /* Set CB_TARGET_MASK XXX: Use cb_misc_state */ 694b8e80941Smrg radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, 695b8e80941Smrg rctx->compute_cb_target_mask); 696b8e80941Smrg} 697b8e80941Smrg 698b8e80941Smrgstatic void compute_emit_cs(struct r600_context *rctx, 699b8e80941Smrg const struct pipe_grid_info *info) 700b8e80941Smrg{ 701b8e80941Smrg struct radeon_cmdbuf *cs = rctx->b.gfx.cs; 702b8e80941Smrg bool compute_dirty = false; 703b8e80941Smrg struct r600_pipe_shader *current; 704b8e80941Smrg struct r600_shader_atomic combined_atomics[8]; 705b8e80941Smrg uint8_t atomic_used_mask; 706b8e80941Smrg uint32_t indirect_grid[3] = { 0, 0, 0 }; 707b8e80941Smrg 708b8e80941Smrg /* make sure that the gfx ring is only one active */ 709b8e80941Smrg if (radeon_emitted(rctx->b.dma.cs, 0)) { 710b8e80941Smrg rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL); 711b8e80941Smrg } 712b8e80941Smrg 713b8e80941Smrg r600_update_compressed_resource_state(rctx, true); 714b8e80941Smrg 715b8e80941Smrg if (!rctx->cmd_buf_is_compute) { 716b8e80941Smrg rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL); 717b8e80941Smrg rctx->cmd_buf_is_compute = true; 718b8e80941Smrg } 719b8e80941Smrg 
720b8e80941Smrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) { 721b8e80941Smrg r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty); 722b8e80941Smrg current = rctx->cs_shader_state.shader->sel->current; 723b8e80941Smrg if (compute_dirty) { 724b8e80941Smrg rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw; 725b8e80941Smrg r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo); 726b8e80941Smrg r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true); 727b8e80941Smrg } 728b8e80941Smrg 729b8e80941Smrg bool need_buf_const = current->shader.uses_tex_buffers || 730b8e80941Smrg current->shader.has_txq_cube_array_z_comp; 731b8e80941Smrg 732b8e80941Smrg if (info->indirect) { 733b8e80941Smrg struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect; 734b8e80941Smrg unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_TRANSFER_READ); 735b8e80941Smrg unsigned offset = info->indirect_offset / 4; 736b8e80941Smrg indirect_grid[0] = data[offset]; 737b8e80941Smrg indirect_grid[1] = data[offset + 1]; 738b8e80941Smrg indirect_grid[2] = data[offset + 2]; 739b8e80941Smrg } 740b8e80941Smrg for (int i = 0; i < 3; i++) { 741b8e80941Smrg rctx->cs_block_grid_sizes[i] = info->block[i]; 742b8e80941Smrg rctx->cs_block_grid_sizes[i + 4] = info->indirect ? 
indirect_grid[i] : info->grid[i]; 743b8e80941Smrg } 744b8e80941Smrg rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0; 745b8e80941Smrg rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true; 746b8e80941Smrg 747b8e80941Smrg evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask); 748b8e80941Smrg r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask)); 749b8e80941Smrg 750b8e80941Smrg if (need_buf_const) { 751b8e80941Smrg eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE); 752848b8605Smrg } 753b8e80941Smrg r600_update_driver_const_buffers(rctx, true); 754b8e80941Smrg 755b8e80941Smrg evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask); 756b8e80941Smrg if (atomic_used_mask) { 757b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 758b8e80941Smrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 759848b8605Smrg } 760b8e80941Smrg } else 761b8e80941Smrg r600_need_cs_space(rctx, 0, true, 0); 762b8e80941Smrg 763b8e80941Smrg /* Initialize all the compute-related registers. 764b8e80941Smrg * 765b8e80941Smrg * See evergreen_init_atom_start_compute_cs() in this file for the list 766b8e80941Smrg * of registers initialized by the start_compute_cs_cmd atom. 
767b8e80941Smrg */ 768b8e80941Smrg r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd); 769b8e80941Smrg 770b8e80941Smrg /* emit config state */ 771b8e80941Smrg if (rctx->b.chip_class == EVERGREEN) { 772b8e80941Smrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) { 773b8e80941Smrg radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3); 774b8e80941Smrg radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs)); 775b8e80941Smrg radeon_emit(cs, 0); 776b8e80941Smrg radeon_emit(cs, 0); 777b8e80941Smrg radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8)); 778b8e80941Smrg } else 779b8e80941Smrg r600_emit_atom(rctx, &rctx->config_state.atom); 780848b8605Smrg } 781848b8605Smrg 782b8e80941Smrg rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; 783b8e80941Smrg r600_flush_emit(rctx); 784848b8605Smrg 785b8e80941Smrg if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI) { 786848b8605Smrg 787b8e80941Smrg compute_setup_cbs(rctx); 788b8e80941Smrg 789b8e80941Smrg /* Emit vertex buffer state */ 790b8e80941Smrg rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask); 791b8e80941Smrg r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom); 792b8e80941Smrg } else { 793b8e80941Smrg uint32_t rat_mask; 794b8e80941Smrg 795b8e80941Smrg rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0); 796b8e80941Smrg radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK, 797b8e80941Smrg rat_mask); 798b8e80941Smrg } 799b8e80941Smrg 800b8e80941Smrg r600_emit_atom(rctx, &rctx->b.render_cond_atom); 801848b8605Smrg 802848b8605Smrg /* Emit constant buffer state */ 803b8e80941Smrg r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom); 804b8e80941Smrg 805b8e80941Smrg /* Emit sampler state */ 806b8e80941Smrg r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom); 807b8e80941Smrg 808b8e80941Smrg /* Emit 
sampler view (texture resource) state */ 809b8e80941Smrg r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom); 810b8e80941Smrg 811b8e80941Smrg /* Emit images state */ 812b8e80941Smrg r600_emit_atom(rctx, &rctx->compute_images.atom); 813b8e80941Smrg 814b8e80941Smrg /* Emit buffers state */ 815b8e80941Smrg r600_emit_atom(rctx, &rctx->compute_buffers.atom); 816848b8605Smrg 817b8e80941Smrg /* Emit shader state */ 818b8e80941Smrg r600_emit_atom(rctx, &rctx->cs_shader_state.atom); 819848b8605Smrg 820848b8605Smrg /* Emit dispatch state and dispatch packet */ 821b8e80941Smrg evergreen_emit_dispatch(rctx, info, indirect_grid); 822848b8605Smrg 823848b8605Smrg /* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff 824848b8605Smrg */ 825b8e80941Smrg rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | 826848b8605Smrg R600_CONTEXT_INV_VERTEX_CACHE | 827848b8605Smrg R600_CONTEXT_INV_TEX_CACHE; 828b8e80941Smrg r600_flush_emit(rctx); 829b8e80941Smrg rctx->b.flags = 0; 830848b8605Smrg 831b8e80941Smrg if (rctx->b.chip_class >= CAYMAN) { 832b8e80941Smrg radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 833b8e80941Smrg radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 834848b8605Smrg /* DEALLOC_STATE prevents the GPU from hanging when a 835848b8605Smrg * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT 836848b8605Smrg * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set. 
837848b8605Smrg */ 838b8e80941Smrg radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0)); 839b8e80941Smrg radeon_emit(cs, 0); 840848b8605Smrg } 841b8e80941Smrg if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) 842b8e80941Smrg evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask); 843848b8605Smrg 844848b8605Smrg#if 0 845b8e80941Smrg COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw); 846848b8605Smrg for (i = 0; i < cs->cdw; i++) { 847b8e80941Smrg COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]); 848848b8605Smrg } 849848b8605Smrg#endif 850848b8605Smrg 851848b8605Smrg} 852848b8605Smrg 853848b8605Smrg 854848b8605Smrg/** 855848b8605Smrg * Emit function for r600_cs_shader_state atom 856848b8605Smrg */ 857b8e80941Smrgvoid evergreen_emit_cs_shader(struct r600_context *rctx, 858b8e80941Smrg struct r600_atom *atom) 859848b8605Smrg{ 860848b8605Smrg struct r600_cs_shader_state *state = 861848b8605Smrg (struct r600_cs_shader_state*)atom; 862848b8605Smrg struct r600_pipe_compute *shader = state->shader; 863b8e80941Smrg struct radeon_cmdbuf *cs = rctx->b.gfx.cs; 864b8e80941Smrg uint64_t va; 865b8e80941Smrg struct r600_resource *code_bo; 866b8e80941Smrg unsigned ngpr, nstack; 867b8e80941Smrg 868b8e80941Smrg if (shader->ir_type == PIPE_SHADER_IR_TGSI) { 869b8e80941Smrg code_bo = shader->sel->current->bo; 870b8e80941Smrg va = shader->sel->current->bo->gpu_address; 871b8e80941Smrg ngpr = shader->sel->current->shader.bc.ngpr; 872b8e80941Smrg nstack = shader->sel->current->shader.bc.nstack; 873b8e80941Smrg } else { 874b8e80941Smrg code_bo = shader->code_bo; 875b8e80941Smrg va = shader->code_bo->gpu_address + state->pc; 876b8e80941Smrg ngpr = shader->bc.ngpr; 877b8e80941Smrg nstack = shader->bc.nstack; 878b8e80941Smrg } 879848b8605Smrg 880b8e80941Smrg radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); 881b8e80941Smrg radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ 882848b8605Smrg radeon_emit(cs, /* 
R_0288D4_SQ_PGM_RESOURCES_LS */ 883b8e80941Smrg S_0288D4_NUM_GPRS(ngpr) | 884b8e80941Smrg S_0288D4_DX10_CLAMP(1) | 885b8e80941Smrg S_0288D4_STACK_SIZE(nstack)); 886848b8605Smrg radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ 887848b8605Smrg 888848b8605Smrg radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); 889b8e80941Smrg radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, 890b8e80941Smrg code_bo, RADEON_USAGE_READ, 891b8e80941Smrg RADEON_PRIO_SHADER_BINARY)); 892848b8605Smrg} 893848b8605Smrg 894b8e80941Smrgstatic void evergreen_launch_grid(struct pipe_context *ctx, 895b8e80941Smrg const struct pipe_grid_info *info) 896848b8605Smrg{ 897b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 898848b8605Smrg#ifdef HAVE_OPENCL 899b8e80941Smrg struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; 900b8e80941Smrg boolean use_kill; 901848b8605Smrg 902b8e80941Smrg if (shader->ir_type != PIPE_SHADER_IR_TGSI) { 903b8e80941Smrg rctx->cs_shader_state.pc = info->pc; 904b8e80941Smrg /* Get the config information for this kernel. 
*/ 905b8e80941Smrg r600_shader_binary_read_config(&shader->binary, &shader->bc, 906b8e80941Smrg info->pc, &use_kill); 907b8e80941Smrg } else { 908b8e80941Smrg use_kill = false; 909b8e80941Smrg rctx->cs_shader_state.pc = 0; 910848b8605Smrg } 911848b8605Smrg#endif 912b8e80941Smrg 913b8e80941Smrg COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc); 914b8e80941Smrg 915b8e80941Smrg 916b8e80941Smrg evergreen_compute_upload_input(ctx, info); 917b8e80941Smrg compute_emit_cs(rctx, info); 918848b8605Smrg} 919848b8605Smrg 920b8e80941Smrgstatic void evergreen_set_compute_resources(struct pipe_context *ctx, 921b8e80941Smrg unsigned start, unsigned count, 922b8e80941Smrg struct pipe_surface **surfaces) 923848b8605Smrg{ 924b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 925848b8605Smrg struct r600_surface **resources = (struct r600_surface **)surfaces; 926848b8605Smrg 927b8e80941Smrg COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n", 928848b8605Smrg start, count); 929848b8605Smrg 930848b8605Smrg for (unsigned i = 0; i < count; i++) { 931b8e80941Smrg /* The First four vertex buffers are reserved for parameters and 932848b8605Smrg * global buffers. 
*/ 933b8e80941Smrg unsigned vtx_id = 4 + i; 934848b8605Smrg if (resources[i]) { 935848b8605Smrg struct r600_resource_global *buffer = 936848b8605Smrg (struct r600_resource_global*) 937848b8605Smrg resources[i]->base.texture; 938848b8605Smrg if (resources[i]->base.writable) { 939848b8605Smrg assert(i+1 < 12); 940848b8605Smrg 941b8e80941Smrg evergreen_set_rat(rctx->cs_shader_state.shader, i+1, 942848b8605Smrg (struct r600_resource *)resources[i]->base.texture, 943848b8605Smrg buffer->chunk->start_in_dw*4, 944848b8605Smrg resources[i]->base.texture->width0); 945848b8605Smrg } 946848b8605Smrg 947b8e80941Smrg evergreen_cs_set_vertex_buffer(rctx, vtx_id, 948848b8605Smrg buffer->chunk->start_in_dw * 4, 949848b8605Smrg resources[i]->base.texture); 950848b8605Smrg } 951848b8605Smrg } 952848b8605Smrg} 953848b8605Smrg 954b8e80941Smrgstatic void evergreen_set_global_binding(struct pipe_context *ctx, 955b8e80941Smrg unsigned first, unsigned n, 956b8e80941Smrg struct pipe_resource **resources, 957b8e80941Smrg uint32_t **handles) 958848b8605Smrg{ 959b8e80941Smrg struct r600_context *rctx = (struct r600_context *)ctx; 960b8e80941Smrg struct compute_memory_pool *pool = rctx->screen->global_pool; 961848b8605Smrg struct r600_resource_global **buffers = 962848b8605Smrg (struct r600_resource_global **)resources; 963848b8605Smrg unsigned i; 964848b8605Smrg 965b8e80941Smrg COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n", 966848b8605Smrg first, n); 967848b8605Smrg 968848b8605Smrg if (!resources) { 969848b8605Smrg /* XXX: Unset */ 970848b8605Smrg return; 971848b8605Smrg } 972848b8605Smrg 973848b8605Smrg /* We mark these items for promotion to the pool if they 974848b8605Smrg * aren't already there */ 975848b8605Smrg for (i = first; i < first + n; i++) { 976848b8605Smrg struct compute_memory_item *item = buffers[i]->chunk; 977848b8605Smrg 978848b8605Smrg if (!is_item_in_pool(item)) 979848b8605Smrg buffers[i]->chunk->status |= ITEM_FOR_PROMOTING; 
980848b8605Smrg } 981848b8605Smrg 982b8e80941Smrg if (compute_memory_finalize_pending(pool, ctx) == -1) { 983848b8605Smrg /* XXX: Unset */ 984848b8605Smrg return; 985848b8605Smrg } 986848b8605Smrg 987848b8605Smrg for (i = first; i < first + n; i++) 988848b8605Smrg { 989848b8605Smrg uint32_t buffer_offset; 990848b8605Smrg uint32_t handle; 991848b8605Smrg assert(resources[i]->target == PIPE_BUFFER); 992848b8605Smrg assert(resources[i]->bind & PIPE_BIND_GLOBAL); 993848b8605Smrg 994848b8605Smrg buffer_offset = util_le32_to_cpu(*(handles[i])); 995848b8605Smrg handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4; 996848b8605Smrg 997848b8605Smrg *(handles[i]) = util_cpu_to_le32(handle); 998848b8605Smrg } 999848b8605Smrg 1000b8e80941Smrg /* globals for writing */ 1001b8e80941Smrg evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4); 1002b8e80941Smrg /* globals for reading */ 1003b8e80941Smrg evergreen_cs_set_vertex_buffer(rctx, 1, 0, 1004848b8605Smrg (struct pipe_resource*)pool->bo); 1005b8e80941Smrg 1006b8e80941Smrg /* constants for reading, LLVM puts them in text segment */ 1007b8e80941Smrg evergreen_cs_set_vertex_buffer(rctx, 2, 0, 1008b8e80941Smrg (struct pipe_resource*)rctx->cs_shader_state.shader->code_bo); 1009848b8605Smrg} 1010848b8605Smrg 1011848b8605Smrg/** 1012848b8605Smrg * This function initializes all the compute specific registers that need to 1013848b8605Smrg * be initialized for each compute command stream. Registers that are common 1014848b8605Smrg * to both compute and 3D will be initialized at the beginning of each compute 1015848b8605Smrg * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG 1016848b8605Smrg * packet requires that the shader type bit be set, we must initialize all 1017848b8605Smrg * context registers needed for compute in this function. 
The registers 1018b8e80941Smrg * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the 1019848b8605Smrg * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending 1020848b8605Smrg * on the GPU family. 1021848b8605Smrg */ 1022b8e80941Smrgvoid evergreen_init_atom_start_compute_cs(struct r600_context *rctx) 1023848b8605Smrg{ 1024b8e80941Smrg struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd; 1025848b8605Smrg int num_threads; 1026848b8605Smrg int num_stack_entries; 1027848b8605Smrg 1028b8e80941Smrg /* since all required registers are initialized in the 1029848b8605Smrg * start_compute_cs_cmd atom, we can EMIT_EARLY here. 1030848b8605Smrg */ 1031848b8605Smrg r600_init_command_buffer(cb, 256); 1032848b8605Smrg cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; 1033848b8605Smrg 1034848b8605Smrg /* We're setting config registers here. */ 1035848b8605Smrg r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); 1036848b8605Smrg r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); 1037848b8605Smrg 1038b8e80941Smrg switch (rctx->b.family) { 1039848b8605Smrg case CHIP_CEDAR: 1040848b8605Smrg default: 1041848b8605Smrg num_threads = 128; 1042848b8605Smrg num_stack_entries = 256; 1043848b8605Smrg break; 1044848b8605Smrg case CHIP_REDWOOD: 1045848b8605Smrg num_threads = 128; 1046848b8605Smrg num_stack_entries = 256; 1047848b8605Smrg break; 1048848b8605Smrg case CHIP_JUNIPER: 1049848b8605Smrg num_threads = 128; 1050848b8605Smrg num_stack_entries = 512; 1051848b8605Smrg break; 1052848b8605Smrg case CHIP_CYPRESS: 1053848b8605Smrg case CHIP_HEMLOCK: 1054848b8605Smrg num_threads = 128; 1055848b8605Smrg num_stack_entries = 512; 1056848b8605Smrg break; 1057848b8605Smrg case CHIP_PALM: 1058848b8605Smrg num_threads = 128; 1059848b8605Smrg num_stack_entries = 256; 1060848b8605Smrg break; 1061848b8605Smrg case CHIP_SUMO: 1062848b8605Smrg num_threads = 128; 1063848b8605Smrg num_stack_entries = 256; 1064848b8605Smrg 
break; 1065848b8605Smrg case CHIP_SUMO2: 1066848b8605Smrg num_threads = 128; 1067848b8605Smrg num_stack_entries = 512; 1068848b8605Smrg break; 1069848b8605Smrg case CHIP_BARTS: 1070848b8605Smrg num_threads = 128; 1071848b8605Smrg num_stack_entries = 512; 1072848b8605Smrg break; 1073848b8605Smrg case CHIP_TURKS: 1074848b8605Smrg num_threads = 128; 1075848b8605Smrg num_stack_entries = 256; 1076848b8605Smrg break; 1077848b8605Smrg case CHIP_CAICOS: 1078848b8605Smrg num_threads = 128; 1079848b8605Smrg num_stack_entries = 256; 1080848b8605Smrg break; 1081848b8605Smrg } 1082848b8605Smrg 1083848b8605Smrg /* The primitive type always needs to be POINTLIST for compute. */ 1084848b8605Smrg r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, 1085848b8605Smrg V_008958_DI_PT_POINTLIST); 1086848b8605Smrg 1087b8e80941Smrg if (rctx->b.chip_class < CAYMAN) { 1088848b8605Smrg 1089848b8605Smrg /* These registers control which simds can be used by each stage. 1090848b8605Smrg * The default for these registers is 0xffffffff, which means 1091848b8605Smrg * all simds are available for each stage. It's possible we may 1092848b8605Smrg * want to play around with these in the future, but for now 1093848b8605Smrg * the default value is fine. 1094848b8605Smrg * 1095848b8605Smrg * R_008E20_SQ_STATIC_THREAD_MGMT1 1096848b8605Smrg * R_008E24_SQ_STATIC_THREAD_MGMT2 1097848b8605Smrg * R_008E28_SQ_STATIC_THREAD_MGMT3 1098848b8605Smrg */ 1099848b8605Smrg 1100b8e80941Smrg /* XXX: We may need to adjust the thread and stack resource 1101848b8605Smrg * values for 3D/compute interop */ 1102848b8605Smrg 1103848b8605Smrg r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); 1104848b8605Smrg 1105848b8605Smrg /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 1106848b8605Smrg * Set the number of threads used by the PS/VS/GS/ES stage to 1107848b8605Smrg * 0. 
1108848b8605Smrg */ 1109848b8605Smrg r600_store_value(cb, 0); 1110848b8605Smrg 1111848b8605Smrg /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 1112848b8605Smrg * Set the number of threads used by the CS (aka LS) stage to 1113848b8605Smrg * the maximum number of threads and set the number of threads 1114848b8605Smrg * for the HS stage to 0. */ 1115848b8605Smrg r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); 1116848b8605Smrg 1117848b8605Smrg /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 1118848b8605Smrg * Set the Control Flow stack entries to 0 for PS/VS stages */ 1119848b8605Smrg r600_store_value(cb, 0); 1120848b8605Smrg 1121848b8605Smrg /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 1122848b8605Smrg * Set the Control Flow stack entries to 0 for GS/ES stages */ 1123848b8605Smrg r600_store_value(cb, 0); 1124848b8605Smrg 1125848b8605Smrg /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 1126848b8605Smrg * Set the Contol Flow stack entries to 0 for the HS stage, and 1127848b8605Smrg * set it to the maximum value for the CS (aka LS) stage. */ 1128848b8605Smrg r600_store_value(cb, 1129848b8605Smrg S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); 1130848b8605Smrg } 1131848b8605Smrg /* Give the compute shader all the available LDS space. 1132848b8605Smrg * NOTE: This only sets the maximum number of dwords that a compute 1133848b8605Smrg * shader can allocate. When a shader is executed, we still need to 1134848b8605Smrg * allocate the appropriate amount of LDS dwords using the 1135848b8605Smrg * CM_R_0288E8_SQ_LDS_ALLOC register. 
1136848b8605Smrg */ 1137b8e80941Smrg if (rctx->b.chip_class < CAYMAN) { 1138848b8605Smrg r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, 1139848b8605Smrg S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192)); 1140848b8605Smrg } else { 1141848b8605Smrg r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT, 1142848b8605Smrg S_0286FC_NUM_PS_LDS(0) | 1143848b8605Smrg S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */ 1144848b8605Smrg } 1145848b8605Smrg 1146848b8605Smrg /* Context Registers */ 1147848b8605Smrg 1148b8e80941Smrg if (rctx->b.chip_class < CAYMAN) { 1149848b8605Smrg /* workaround for hw issues with dyn gpr - must set all limits 1150848b8605Smrg * to 240 instead of 0, 0x1e == 240 / 8 1151848b8605Smrg */ 1152848b8605Smrg r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, 1153848b8605Smrg S_028838_PS_GPRS(0x1e) | 1154848b8605Smrg S_028838_VS_GPRS(0x1e) | 1155848b8605Smrg S_028838_GS_GPRS(0x1e) | 1156848b8605Smrg S_028838_ES_GPRS(0x1e) | 1157848b8605Smrg S_028838_HS_GPRS(0x1e) | 1158848b8605Smrg S_028838_LS_GPRS(0x1e)); 1159848b8605Smrg } 1160848b8605Smrg 1161848b8605Smrg /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ 1162848b8605Smrg r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, 1163848b8605Smrg S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); 1164848b8605Smrg 1165848b8605Smrg r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); 1166848b8605Smrg 1167848b8605Smrg r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, 1168b8e80941Smrg S_0286E8_TID_IN_GROUP_ENA(1) | 1169b8e80941Smrg S_0286E8_TGID_ENA(1) | 1170b8e80941Smrg S_0286E8_DISABLE_INDEX_PACK(1)); 1171848b8605Smrg 1172848b8605Smrg /* The LOOP_CONST registers are an optimizations for loops that allows 1173848b8605Smrg * you to store the initial counter, increment value, and maximum 1174848b8605Smrg * counter value in a register so that hardware can calculate the 1175848b8605Smrg * correct number of iterations for the loop, 
so that you don't need 1176848b8605Smrg * to have the loop counter in your shader code. We don't currently use 1177848b8605Smrg * this optimization, so we must keep track of the counter in the 1178848b8605Smrg * shader and use a break instruction to exit loops. However, the 1179848b8605Smrg * hardware will still uses this register to determine when to exit a 1180848b8605Smrg * loop, so we need to initialize the counter to 0, set the increment 1181848b8605Smrg * value to 1 and the maximum counter value to the 4095 (0xfff) which 1182848b8605Smrg * is the maximum value allowed. This gives us a maximum of 4096 1183848b8605Smrg * iterations for our loops, but hopefully our break instruction will 1184848b8605Smrg * execute before some time before the 4096th iteration. 1185848b8605Smrg */ 1186848b8605Smrg eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF); 1187848b8605Smrg} 1188848b8605Smrg 1189b8e80941Smrgvoid evergreen_init_compute_state_functions(struct r600_context *rctx) 1190848b8605Smrg{ 1191b8e80941Smrg rctx->b.b.create_compute_state = evergreen_create_compute_state; 1192b8e80941Smrg rctx->b.b.delete_compute_state = evergreen_delete_compute_state; 1193b8e80941Smrg rctx->b.b.bind_compute_state = evergreen_bind_compute_state; 1194b8e80941Smrg// rctx->context.create_sampler_view = evergreen_compute_create_sampler_view; 1195b8e80941Smrg rctx->b.b.set_compute_resources = evergreen_set_compute_resources; 1196b8e80941Smrg rctx->b.b.set_global_binding = evergreen_set_global_binding; 1197b8e80941Smrg rctx->b.b.launch_grid = evergreen_launch_grid; 1198848b8605Smrg 1199848b8605Smrg} 1200848b8605Smrg 1201b8e80941Smrgstatic void *r600_compute_global_transfer_map(struct pipe_context *ctx, 1202b8e80941Smrg struct pipe_resource *resource, 1203b8e80941Smrg unsigned level, 1204b8e80941Smrg unsigned usage, 1205b8e80941Smrg const struct pipe_box *box, 1206b8e80941Smrg struct pipe_transfer **ptransfer) 1207848b8605Smrg{ 1208b8e80941Smrg struct r600_context *rctx = 
(struct r600_context*)ctx; 1209848b8605Smrg struct compute_memory_pool *pool = rctx->screen->global_pool; 1210848b8605Smrg struct r600_resource_global* buffer = 1211848b8605Smrg (struct r600_resource_global*)resource; 1212848b8605Smrg 1213848b8605Smrg struct compute_memory_item *item = buffer->chunk; 1214848b8605Smrg struct pipe_resource *dst = NULL; 1215848b8605Smrg unsigned offset = box->x; 1216848b8605Smrg 1217848b8605Smrg if (is_item_in_pool(item)) { 1218b8e80941Smrg compute_memory_demote_item(pool, item, ctx); 1219848b8605Smrg } 1220848b8605Smrg else { 1221848b8605Smrg if (item->real_buffer == NULL) { 1222b8e80941Smrg item->real_buffer = 1223848b8605Smrg r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4); 1224848b8605Smrg } 1225848b8605Smrg } 1226848b8605Smrg 1227848b8605Smrg dst = (struct pipe_resource*)item->real_buffer; 1228848b8605Smrg 1229848b8605Smrg if (usage & PIPE_TRANSFER_READ) 1230848b8605Smrg buffer->chunk->status |= ITEM_MAPPED_FOR_READING; 1231848b8605Smrg 1232848b8605Smrg COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n" 1233848b8605Smrg "level = %u, usage = %u, box(x = %u, y = %u, z = %u " 1234848b8605Smrg "width = %u, height = %u, depth = %u)\n", level, usage, 1235848b8605Smrg box->x, box->y, box->z, box->width, box->height, 1236848b8605Smrg box->depth); 1237848b8605Smrg COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = " 1238848b8605Smrg "%u (box.x)\n", item->id, box->x); 1239848b8605Smrg 1240848b8605Smrg 1241848b8605Smrg assert(resource->target == PIPE_BUFFER); 1242848b8605Smrg assert(resource->bind & PIPE_BIND_GLOBAL); 1243848b8605Smrg assert(box->x >= 0); 1244848b8605Smrg assert(box->y == 0); 1245848b8605Smrg assert(box->z == 0); 1246848b8605Smrg 1247848b8605Smrg ///TODO: do it better, mapping is not possible if the pool is too big 1248b8e80941Smrg return pipe_buffer_map_range(ctx, dst, 1249848b8605Smrg offset, box->width, usage, ptransfer); 1250848b8605Smrg} 1251848b8605Smrg 1252b8e80941Smrgstatic 
void r600_compute_global_transfer_unmap(struct pipe_context *ctx, 1253b8e80941Smrg struct pipe_transfer *transfer) 1254848b8605Smrg{ 1255848b8605Smrg /* struct r600_resource_global are not real resources, they just map 1256848b8605Smrg * to an offset within the compute memory pool. The function 1257848b8605Smrg * r600_compute_global_transfer_map() maps the memory pool 1258848b8605Smrg * resource rather than the struct r600_resource_global passed to 1259848b8605Smrg * it as an argument and then initalizes ptransfer->resource with 1260848b8605Smrg * the memory pool resource (via pipe_buffer_map_range). 1261848b8605Smrg * When transfer_unmap is called it uses the memory pool's 1262848b8605Smrg * vtable which calls r600_buffer_transfer_map() rather than 1263848b8605Smrg * this function. 1264848b8605Smrg */ 1265848b8605Smrg assert (!"This function should not be called"); 1266848b8605Smrg} 1267848b8605Smrg 1268b8e80941Smrgstatic void r600_compute_global_transfer_flush_region(struct pipe_context *ctx, 1269b8e80941Smrg struct pipe_transfer *transfer, 1270b8e80941Smrg const struct pipe_box *box) 1271848b8605Smrg{ 1272848b8605Smrg assert(0 && "TODO"); 1273848b8605Smrg} 1274848b8605Smrg 1275b8e80941Smrgstatic void r600_compute_global_buffer_destroy(struct pipe_screen *screen, 1276b8e80941Smrg struct pipe_resource *res) 1277848b8605Smrg{ 1278b8e80941Smrg struct r600_resource_global* buffer = NULL; 1279b8e80941Smrg struct r600_screen* rscreen = NULL; 1280b8e80941Smrg 1281b8e80941Smrg assert(res->target == PIPE_BUFFER); 1282b8e80941Smrg assert(res->bind & PIPE_BIND_GLOBAL); 1283b8e80941Smrg 1284b8e80941Smrg buffer = (struct r600_resource_global*)res; 1285b8e80941Smrg rscreen = (struct r600_screen*)screen; 1286b8e80941Smrg 1287b8e80941Smrg compute_memory_free(rscreen->global_pool, buffer->chunk->id); 1288b8e80941Smrg 1289b8e80941Smrg buffer->chunk = NULL; 1290b8e80941Smrg free(res); 1291b8e80941Smrg} 1292b8e80941Smrg 1293b8e80941Smrgstatic const struct u_resource_vtbl 
r600_global_buffer_vtbl = 1294b8e80941Smrg{ 1295b8e80941Smrg u_default_resource_get_handle, /* get_handle */ 1296b8e80941Smrg r600_compute_global_buffer_destroy, /* resource_destroy */ 1297b8e80941Smrg r600_compute_global_transfer_map, /* transfer_map */ 1298b8e80941Smrg r600_compute_global_transfer_flush_region,/* transfer_flush_region */ 1299b8e80941Smrg r600_compute_global_transfer_unmap, /* transfer_unmap */ 1300b8e80941Smrg}; 1301b8e80941Smrg 1302b8e80941Smrgstruct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen, 1303b8e80941Smrg const struct pipe_resource *templ) 1304b8e80941Smrg{ 1305b8e80941Smrg struct r600_resource_global* result = NULL; 1306b8e80941Smrg struct r600_screen* rscreen = NULL; 1307b8e80941Smrg int size_in_dw = 0; 1308b8e80941Smrg 1309b8e80941Smrg assert(templ->target == PIPE_BUFFER); 1310b8e80941Smrg assert(templ->bind & PIPE_BIND_GLOBAL); 1311b8e80941Smrg assert(templ->array_size == 1 || templ->array_size == 0); 1312b8e80941Smrg assert(templ->depth0 == 1 || templ->depth0 == 0); 1313b8e80941Smrg assert(templ->height0 == 1 || templ->height0 == 0); 1314b8e80941Smrg 1315b8e80941Smrg result = (struct r600_resource_global*) 1316b8e80941Smrg CALLOC(sizeof(struct r600_resource_global), 1); 1317b8e80941Smrg rscreen = (struct r600_screen*)screen; 1318b8e80941Smrg 1319b8e80941Smrg COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n"); 1320b8e80941Smrg COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0, 1321b8e80941Smrg templ->array_size); 1322b8e80941Smrg 1323b8e80941Smrg result->base.b.vtbl = &r600_global_buffer_vtbl; 1324b8e80941Smrg result->base.b.b = *templ; 1325b8e80941Smrg result->base.b.b.screen = screen; 1326b8e80941Smrg pipe_reference_init(&result->base.b.b.reference, 1); 1327b8e80941Smrg 1328b8e80941Smrg size_in_dw = (templ->width0+3) / 4; 1329b8e80941Smrg 1330b8e80941Smrg result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw); 1331b8e80941Smrg 1332b8e80941Smrg if 
(result->chunk == NULL) 1333b8e80941Smrg { 1334b8e80941Smrg free(result); 1335b8e80941Smrg return NULL; 1336b8e80941Smrg } 1337b8e80941Smrg 1338b8e80941Smrg return &result->base.b.b; 1339848b8605Smrg} 1340