/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_compute.h"

#include "ac_rtld.h"
#include "amd_kernel_code_t.h"
#include "nir/tgsi_to_nir.h"
#include "si_build_pm4.h"
#include "util/u_async_debug.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#define COMPUTE_DBG(sscreen, fmt, args...)                                                         \
   do {                                                                                            \
      if ((sscreen->debug_flags & DBG(COMPUTE)))                                                   \
         fprintf(stderr, fmt, ##args);                                                             \
   } while (0);

struct dispatch_packet {
   uint16_t header;
   uint16_t setup;
   uint16_t workgroup_size_x;
   uint16_t workgroup_size_y;
   uint16_t workgroup_size_z;
   uint16_t reserved0;
   uint32_t grid_size_x;
   uint32_t grid_size_y;
   uint32_t grid_size_z;
   uint32_t private_segment_size;
   uint32_t group_segment_size;
   uint64_t kernel_object;
   uint64_t kernarg_address;
   uint64_t reserved2;
};

static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program,
                                                           uint64_t symbol_offset)
{
   const struct si_shader_selector *sel = &program->sel;

   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
      return NULL;

   struct ac_rtld_binary rtld;
   if (!ac_rtld_open(&rtld,
                     (struct ac_rtld_open_info){.info = &sel->screen->info,
                                                .shader_type = MESA_SHADER_COMPUTE,
                                                .wave_size = sel->screen->compute_wave_size,
                                                .num_parts = 1,
                                                .elf_ptrs = &program->shader.binary.elf_buffer,
                                                .elf_sizes = &program->shader.binary.elf_size}))
      return NULL;

   const amd_kernel_code_t *result = NULL;
   const char *text;
   size_t size;
   if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
      goto out;

   if (symbol_offset + sizeof(amd_kernel_code_t) > size)
      goto out;

   result = (const amd_kernel_code_t *)(text + symbol_offset);

out:
   ac_rtld_close(&rtld);
   return result;
}

static void code_object_to_config(const amd_kernel_code_t *code_object,
                                  struct ac_shader_config *out_config)
{
   uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
   uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
   out_config->num_sgprs = code_object->wavefront_sgpr_count;
   out_config->num_vgprs = code_object->workitem_vgpr_count;
   out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
   out_config->rsrc1 = rsrc1;
   out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
   out_config->rsrc2 = rsrc2;
   out_config->scratch_bytes_per_wave =
      align(code_object->workitem_private_segment_byte_size * 64, 1024);
}
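/* Illustrative note on code_object_to_config() above: the 64-bit
 * compute_pgm_resource_registers field of the kernel descriptor is split into
 * the RSRC1 (low dword) and RSRC2 (high dword) register values. As a hedged,
 * worked example, a kernel reporting workitem_private_segment_byte_size = 16
 * would end up with scratch_bytes_per_wave = align(16 * 64, 1024) = 1024,
 * i.e. 1 KiB of scratch per 64-lane wave.
 */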
/* Asynchronous compute shader compilation. */
static void si_create_compute_state_async(void *job, void *gdata, int thread_index)
{
   struct si_compute *program = (struct si_compute *)job;
   struct si_shader_selector *sel = &program->sel;
   struct si_shader *shader = &program->shader;
   struct ac_llvm_compiler *compiler;
   struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
   struct si_screen *sscreen = sel->screen;

   assert(!debug->debug_message || debug->async);
   assert(thread_index >= 0);
   assert(thread_index < ARRAY_SIZE(sscreen->compiler));
   compiler = &sscreen->compiler[thread_index];

   if (!compiler->passes)
      si_init_compiler(sscreen, compiler);

   assert(program->ir_type == PIPE_SHADER_IR_NIR);
   si_nir_scan_shader(sel->nir, &sel->info);

   si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
                            &sel->active_samplers_and_images);

   program->shader.is_monolithic = true;

   /* Variable block sizes need 10 bits (1 + log2(SI_MAX_VARIABLE_THREADS_PER_BLOCK)) per dim.
    * We pack them into a single user SGPR.
    */
   unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
                         (sel->info.uses_variable_block_size ? 1 : 0) +
                         sel->info.base.cs.user_data_components_amd;

   /* Fast path for compute shaders - some descriptors passed via user SGPRs. */
   /* Shader buffers in user SGPRs. */
   for (unsigned i = 0; i < MIN2(3, sel->info.base.num_ssbos) && user_sgprs <= 12; i++) {
      user_sgprs = align(user_sgprs, 4);
      if (i == 0)
         sel->cs_shaderbufs_sgpr_index = user_sgprs;
      user_sgprs += 4;
      sel->cs_num_shaderbufs_in_user_sgprs++;
   }

   /* Images in user SGPRs. */
   unsigned non_msaa_images = u_bit_consecutive(0, sel->info.base.num_images) &
                              ~sel->info.base.msaa_images;

   for (unsigned i = 0; i < 3 && non_msaa_images & (1 << i); i++) {
      unsigned num_sgprs = sel->info.base.image_buffers & (1 << i) ? 4 : 8;

      if (align(user_sgprs, num_sgprs) + num_sgprs > 16)
         break;

      user_sgprs = align(user_sgprs, num_sgprs);
      if (i == 0)
         sel->cs_images_sgpr_index = user_sgprs;
      user_sgprs += num_sgprs;
      sel->cs_num_images_in_user_sgprs++;
   }
   sel->cs_images_num_sgprs = user_sgprs - sel->cs_images_sgpr_index;
   assert(user_sgprs <= 16);

   unsigned char ir_sha1_cache_key[20];
   si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);

   /* Try to load the shader from the shader cache. */
   simple_mtx_lock(&sscreen->shader_cache_mutex);

   if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
      simple_mtx_unlock(&sscreen->shader_cache_mutex);

      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
      si_shader_dump(sscreen, shader, debug, stderr, true);

      if (!si_shader_binary_upload(sscreen, shader, 0))
         program->shader.compilation_failed = true;
   } else {
      simple_mtx_unlock(&sscreen->shader_cache_mutex);

      if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) {
         program->shader.compilation_failed = true;
         return;
      }

      bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;

      shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
                                            ((sscreen->compute_wave_size == 32 ||
                                              sscreen->info.wave64_vgpr_alloc_granularity == 8) ? 8 : 4)) |
                             S_00B848_DX10_CLAMP(1) |
                             S_00B848_MEM_ORDERED(si_shader_mem_ordered(shader)) |
                             S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
                             S_00B848_FLOAT_MODE(shader->config.float_mode);

      if (sscreen->info.chip_class < GFX10) {
         shader->config.rsrc1 |= S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8);
      }

      shader->config.rsrc2 = S_00B84C_USER_SGPR(user_sgprs) | S_00B84C_SCRATCH_EN(scratch_enabled) |
                             S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) |
                             S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) |
                             S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) |
                             S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) |
                             S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2]
                                                        ? 2
                                                        : sel->info.uses_thread_id[1] ? 1 : 0) |
                             S_00B84C_LDS_SIZE(shader->config.lds_size);

      simple_mtx_lock(&sscreen->shader_cache_mutex);
      si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
      simple_mtx_unlock(&sscreen->shader_cache_mutex);
   }

   ralloc_free(sel->nir);
   sel->nir = NULL;
}

static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
   struct si_compute *program = CALLOC_STRUCT(si_compute);
   struct si_shader_selector *sel = &program->sel;

   pipe_reference_init(&sel->base.reference, 1);
   sel->info.stage = MESA_SHADER_COMPUTE;
   sel->screen = sscreen;
   sel->const_and_shader_buf_descriptors_index =
      si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_COMPUTE);
   sel->sampler_and_images_descriptors_index =
      si_sampler_and_image_descriptors_idx(PIPE_SHADER_COMPUTE);
   sel->info.base.shared_size = cso->req_local_mem;
   program->shader.selector = &program->sel;
   program->ir_type = cso->ir_type;
   program->private_size = cso->req_private_mem;
   program->input_size = cso->req_input_mem;

   if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
      if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
         program->ir_type = PIPE_SHADER_IR_NIR;
         sel->nir = tgsi_to_nir(cso->prog, ctx->screen, true);
      } else {
         assert(cso->ir_type == PIPE_SHADER_IR_NIR);
         sel->nir = (struct nir_shader *)cso->prog;
      }

      sel->compiler_ctx_state.debug = sctx->debug;
      sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
      p_atomic_inc(&sscreen->num_shaders_created);

      si_schedule_initial_compile(sctx, MESA_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state,
                                  program, si_create_compute_state_async);
   } else {
      const struct pipe_binary_program_header *header;
      header = cso->prog;

      program->shader.binary.elf_size = header->num_bytes;
      program->shader.binary.elf_buffer = malloc(header->num_bytes);
      if (!program->shader.binary.elf_buffer) {
         FREE(program);
         return NULL;
      }
      memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes);

      const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0);
      code_object_to_config(code_object, &program->shader.config);

      si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
      if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) {
         fprintf(stderr, "LLVM failed to upload shader\n");
         free((void *)program->shader.binary.elf_buffer);
         FREE(program);
         return NULL;
      }
   }

   return program;
}
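/* Worked example (illustrative) of the user-SGPR budget computed in
 * si_create_compute_state_async() above: with SI_NUM_RESOURCE_SGPRS resource
 * pointers, a shader that uses the grid size (+3 SGPRs) and a variable block
 * size (+1 SGPR) starts promoting SSBO descriptors at the next multiple of 4
 * user SGPRs, adding 4 per buffer; promotion stops as soon as the 16-entry
 * user-SGPR budget that the assert enforces would be exceeded.
 */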
static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_compute *program = (struct si_compute *)state;
   struct si_shader_selector *sel = &program->sel;

   sctx->cs_shader_state.program = program;
   if (!program)
      return;

   /* Wait because we need active slot usage masks. */
   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
      util_queue_fence_wait(&sel->ready);

   si_set_active_descriptors(sctx,
                             SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
                             sel->active_const_and_shader_buffers);
   si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
                             sel->active_samplers_and_images);

   sctx->compute_shaderbuf_sgprs_dirty = true;
   sctx->compute_image_sgprs_dirty = true;

   if (unlikely((sctx->screen->debug_flags & DBG(SQTT)) && sctx->thread_trace)) {
      uint32_t pipeline_code_hash = _mesa_hash_data_with_seed(
         program->shader.binary.elf_buffer,
         program->shader.binary.elf_size,
         0);
      uint64_t base_address = program->shader.bo->gpu_address;

      struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
      if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
         si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, true);
      }

      si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 1);
   }
}

static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n,
                                  struct pipe_resource **resources, uint32_t **handles)
{
   unsigned i;
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_compute *program = sctx->cs_shader_state.program;

   if (first + n > program->max_global_buffers) {
      unsigned old_max = program->max_global_buffers;
      program->max_global_buffers = first + n;
      program->global_buffers = realloc(
         program->global_buffers, program->max_global_buffers * sizeof(program->global_buffers[0]));
      if (!program->global_buffers) {
         fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n");
         return;
      }

      memset(&program->global_buffers[old_max], 0,
             (program->max_global_buffers - old_max) * sizeof(program->global_buffers[0]));
   }

   if (!resources) {
      for (i = 0; i < n; i++) {
         pipe_resource_reference(&program->global_buffers[first + i], NULL);
      }
      return;
   }

   for (i = 0; i < n; i++) {
      uint64_t va;
      uint32_t offset;
      pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
      va = si_resource(resources[i])->gpu_address;
      offset = util_le32_to_cpu(*handles[i]);
      va += offset;
      va = util_cpu_to_le64(va);
      memcpy(handles[i], &va, sizeof(va));
   }
}
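/* Illustrative note on si_set_global_binding() above (assumption about the
 * caller rather than something guaranteed by this file): the frontend passes
 * each handle as a 32-bit little-endian offset into the global buffer, and the
 * loop rewrites it in place with the full 64-bit GPU virtual address
 * (buffer base + offset), which is what the kernel binary dereferences.
 */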
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI,
                     S_00B834_DATA(sctx->screen->info.address32_hi >> 8));

   radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
   /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
    * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
   radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
   radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));

   if (sctx->chip_class == GFX6) {
      /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
       * and is now per pipe, so it should be handled in the
       * kernel if we want to use something other than the default value.
       *
       * TODO: This should be:
       * (number of compute units) * 4 * (waves per simd) - 1
       */
      radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);

      if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
         uint64_t bc_va = sctx->border_color_buffer->gpu_address;

         radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
      }
   }

   if (sctx->chip_class >= GFX7) {
      /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
      radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
      radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
      radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));

      /* Disable profiling on compute queues. */
      if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) {
         radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
         radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
      }

      /* Set the pointer to border colors. */
      /* Aldebaran doesn't support border colors. */
      if (sctx->border_color_buffer) {
         uint64_t bc_va = sctx->border_color_buffer->gpu_address;

         radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false);
         radeon_emit(bc_va >> 8);                    /* R_030E00_TA_CS_BC_BASE_ADDR */
         radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
      }
   }

   /* cs_preamble_state initializes this for the gfx queue, so only do this
    * if we are on a compute queue.
    */
   if (sctx->chip_class >= GFX9 &&
       (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) {
      radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY,
                             sctx->chip_class >= GFX10 ? 0x20 : 0);
   }

   if (sctx->chip_class >= GFX10) {
      radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 5);
      radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */
      radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */
      radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */
      radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */
      radeon_emit(0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */

      radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
   }
   radeon_end();
}

static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
                                            struct ac_shader_config *config)
{
   uint64_t scratch_bo_size, scratch_needed;
   scratch_bo_size = 0;
   scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
   if (sctx->compute_scratch_buffer)
      scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;

   if (scratch_bo_size < scratch_needed) {
      si_resource_reference(&sctx->compute_scratch_buffer, NULL);

      sctx->compute_scratch_buffer =
         si_aligned_buffer_create(&sctx->screen->b,
                                  SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
                                  PIPE_USAGE_DEFAULT,
                                  scratch_needed, sctx->screen->info.pte_fragment_size);

      if (!sctx->compute_scratch_buffer)
         return false;
   }

   if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
      uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

      if (!si_shader_binary_upload(sctx->screen, shader, scratch_va))
         return false;

      si_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer);
   }

   return true;
}

static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program,
                                     struct si_shader *shader, const amd_kernel_code_t *code_object,
                                     unsigned offset, bool *prefetch)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct ac_shader_config inline_config = {0};
   struct ac_shader_config *config;
   uint64_t shader_va;

   *prefetch = false;

   if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset)
      return true;

   if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
      config = &shader->config;
   } else {
      unsigned lds_blocks;

      config = &inline_config;
      code_object_to_config(code_object, config);

      lds_blocks = config->lds_size;
      /* XXX: We are over allocating LDS. For GFX6, the shader reports
       * LDS in blocks of 256 bytes, so if there are 4 bytes lds
       * allocated in the shader and 4 bytes allocated by the state
       * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
       */
      if (sctx->chip_class <= GFX6) {
         lds_blocks += align(program->sel.info.base.shared_size, 256) >> 8;
      } else {
         lds_blocks += align(program->sel.info.base.shared_size, 512) >> 9;
      }

      /* TODO: use si_multiwave_lds_size_workaround */
      assert(lds_blocks <= 0xFF);

      config->rsrc2 &= C_00B84C_LDS_SIZE;
      config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
   }

   if (!si_setup_compute_scratch_buffer(sctx, shader, config))
      return false;

   if (shader->scratch_bo) {
      COMPUTE_DBG(sctx->screen,
                  "Waves: %u; Scratch per wave: %u bytes; "
                  "Total Scratch: %u bytes\n",
                  sctx->scratch_waves, config->scratch_bytes_per_wave,
                  config->scratch_bytes_per_wave * sctx->scratch_waves);

      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE,
                                RADEON_PRIO_SCRATCH_BUFFER);
   }

   shader_va = shader->bo->gpu_address + offset;
   if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
      /* Shader code is placed after the amd_kernel_code_t
       * struct. */
      shader_va += sizeof(amd_kernel_code_t);
   }

   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo, RADEON_USAGE_READ,
                             RADEON_PRIO_SHADER_BINARY);

   radeon_begin(cs);
   radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);

   radeon_set_sh_reg_seq(R_00B848_COMPUTE_PGM_RSRC1, 2);
   radeon_emit(config->rsrc1);
   radeon_emit(config->rsrc2);

   COMPUTE_DBG(sctx->screen,
               "COMPUTE_PGM_RSRC1: 0x%08x "
               "COMPUTE_PGM_RSRC2: 0x%08x\n",
               config->rsrc1, config->rsrc2);

   sctx->max_seen_compute_scratch_bytes_per_wave =
      MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);

   radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
                     S_00B860_WAVES(sctx->scratch_waves) |
                        S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
   radeon_end();

   sctx->cs_shader_state.emitted_program = program;
   sctx->cs_shader_state.offset = offset;
   sctx->cs_shader_state.uses_scratch = config->scratch_bytes_per_wave != 0;

   *prefetch = true;
   return true;
}
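/* Illustrative summary of si_switch_compute_shader() above (hedged reading of
 * the shifts, not an authoritative hardware description): COMPUTE_PGM_LO takes
 * the shader VA in 256-byte units (hence >> 8), and COMPUTE_TMPRING_SIZE takes
 * the per-wave scratch size in 1 KiB units (hence >> 10), so a shader needing
 * 2048 scratch bytes per wave would be programmed with WAVESIZE = 2. The value
 * only ever grows via max_seen_compute_scratch_bytes_per_wave, so the scratch
 * ring never shrinks within a command stream.
 */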
static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
                                          const amd_kernel_code_t *code_object, unsigned user_sgpr)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;

   unsigned max_private_element_size =
      AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);

   uint32_t scratch_dword0 = scratch_va & 0xffffffff;
   uint32_t scratch_dword1 =
      S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);

   /* Disable address clamping */
   uint32_t scratch_dword2 = 0xffffffff;
   uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(3) | S_008F0C_ADD_TID_ENABLE(1);

   if (sctx->chip_class >= GFX9) {
      assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
   } else {
      scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);

      if (sctx->chip_class < GFX8) {
         /* BUF_DATA_FORMAT is ignored, but it cannot be
          * BUF_DATA_FORMAT_INVALID. */
         scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
      }
   }

   radeon_begin(cs);
   radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
   radeon_emit(scratch_dword0);
   radeon_emit(scratch_dword1);
   radeon_emit(scratch_dword2);
   radeon_emit(scratch_dword3);
   radeon_end();
}

static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object,
                                      const struct pipe_grid_info *info, uint64_t kernel_args_va)
{
   struct si_compute *program = sctx->cs_shader_state.program;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   static const enum amd_code_property_mask_t workgroup_count_masks[] = {
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z};

   unsigned i, user_sgpr = 0;
   if (AMD_HSA_BITS_GET(code_object->code_properties,
                        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
      if (code_object->workitem_private_segment_byte_size > 0) {
         setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr);
      }
      user_sgpr += 4;
   }

   radeon_begin(cs);

   if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
      struct dispatch_packet dispatch;
      unsigned dispatch_offset;
      struct si_resource *dispatch_buf = NULL;
      uint64_t dispatch_va;

      /* Upload dispatch ptr */
      memset(&dispatch, 0, sizeof(dispatch));

      dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
      dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
      dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);

      dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
      dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
      dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);

      dispatch.private_segment_size = util_cpu_to_le32(program->private_size);
      dispatch.group_segment_size = util_cpu_to_le32(program->sel.info.base.shared_size);

      dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);

      u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset,
                    (struct pipe_resource **)&dispatch_buf);

      if (!dispatch_buf) {
         fprintf(stderr, "Error: Failed to allocate dispatch "
                         "packet.");
      }
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, dispatch_buf, RADEON_USAGE_READ,
                                RADEON_PRIO_CONST_BUFFER);

      dispatch_va = dispatch_buf->gpu_address + dispatch_offset;

      radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
      radeon_emit(dispatch_va);
      radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));

      si_resource_reference(&dispatch_buf, NULL);
      user_sgpr += 2;
   }

   if (AMD_HSA_BITS_GET(code_object->code_properties,
                        AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
      radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
      radeon_emit(kernel_args_va);
      radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
      user_sgpr += 2;
   }

   for (i = 0; i < 3 && user_sgpr < 16; i++) {
      if (code_object->code_properties & workgroup_count_masks[i]) {
         radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
         radeon_emit(info->grid[i]);
         user_sgpr += 1;
      }
   }
   radeon_end();
}
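/* Illustrative note on the dispatch pointer uploaded above (hedged
 * interpretation): the struct appears to mirror the HSA AQL kernel dispatch
 * packet layout, which is why grid_size_* is expressed in work-items
 * (grid[] workgroups * block[] threads) rather than in workgroups, and why
 * every field is converted to little-endian before being copied into
 * GPU-visible memory.
 */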
static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object,
                                    const struct pipe_grid_info *info)
{
   struct si_compute *program = sctx->cs_shader_state.program;
   struct si_resource *input_buffer = NULL;
   uint32_t kernel_args_offset = 0;
   uint32_t *kernel_args;
   void *kernel_args_ptr;
   uint64_t kernel_args_va;

   u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
                  sctx->screen->info.tcc_cache_line_size, &kernel_args_offset,
                  (struct pipe_resource **)&input_buffer, &kernel_args_ptr);

   if (unlikely(!kernel_args_ptr))
      return false;

   kernel_args = (uint32_t *)kernel_args_ptr;
   kernel_args_va = input_buffer->gpu_address + kernel_args_offset;

   memcpy(kernel_args, info->input, program->input_size);

   for (unsigned i = 0; i < program->input_size / 4; i++) {
      COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]);
   }

   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, input_buffer, RADEON_USAGE_READ,
                             RADEON_PRIO_CONST_BUFFER);

   si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
   si_resource_reference(&input_buffer, NULL);
   return true;
}

static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info)
{
   struct si_compute *program = sctx->cs_shader_state.program;
   struct si_shader_selector *sel = &program->sel;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + 4 * SI_NUM_RESOURCE_SGPRS;
   unsigned block_size_reg = grid_size_reg +
                             /* 12 bytes = 3 dwords. */
                             12 * sel->info.uses_grid_size;
   unsigned cs_user_data_reg = block_size_reg + 4 * program->sel.info.uses_variable_block_size;

   radeon_begin(cs);

   if (sel->info.uses_grid_size) {
      if (info->indirect) {
         radeon_end();

         for (unsigned i = 0; i < 3; ++i) {
            si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
                            COPY_DATA_SRC_MEM, si_resource(info->indirect),
                            info->indirect_offset + 4 * i);
         }
         radeon_begin_again(cs);
      } else {
         radeon_set_sh_reg_seq(grid_size_reg, 3);
         radeon_emit(info->grid[0]);
         radeon_emit(info->grid[1]);
         radeon_emit(info->grid[2]);
      }
   }

   if (sel->info.uses_variable_block_size) {
      radeon_set_sh_reg(block_size_reg,
                        info->block[0] | (info->block[1] << 10) | (info->block[2] << 20));
   }

   if (sel->info.base.cs.user_data_components_amd) {
      radeon_set_sh_reg_seq(cs_user_data_reg, sel->info.base.cs.user_data_components_amd);
      radeon_emit_array(sctx->cs_user_data, sel->info.base.cs.user_data_components_amd);
   }
   radeon_end();
}
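/* Worked example (illustrative) for si_setup_nir_user_data() above: a variable
 * block size of 8x8x4 is packed into a single user SGPR as
 * 8 | (8 << 10) | (4 << 20), matching the 10-bits-per-dimension layout
 * described in si_create_compute_state_async().
 */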
static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info)
{
   struct si_screen *sscreen = sctx->screen;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   bool render_cond_bit = sctx->render_cond_enabled;
   unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2];
   unsigned waves_per_threadgroup =
      DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
   unsigned threadgroups_per_cu = 1;

   if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
      threadgroups_per_cu = 2;

   if (unlikely(sctx->thread_trace_enabled)) {
      si_write_event_with_dims_marker(sctx, &sctx->gfx_cs,
                                      info->indirect ? EventCmdDispatchIndirect : EventCmdDispatch,
                                      info->grid[0], info->grid[1], info->grid[2]);
   }

   radeon_begin(cs);
   radeon_set_sh_reg(
      R_00B854_COMPUTE_RESOURCE_LIMITS,
      ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup,
                                     sctx->cs_max_waves_per_sh, threadgroups_per_cu));

   unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
                                 /* If the KMD allows it (there is a KMD hw register for it),
                                  * allow launching waves out-of-order. (same as Vulkan) */
                                 S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
                                 S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);

   const uint *last_block = info->last_block;
   bool partial_block_en = last_block[0] || last_block[1] || last_block[2];

   radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3);

   if (partial_block_en) {
      unsigned partial[3];

      /* If no partial_block, these should be an entire block size, not 0. */
      partial[0] = last_block[0] ? last_block[0] : info->block[0];
      partial[1] = last_block[1] ? last_block[1] : info->block[1];
      partial[2] = last_block[2] ? last_block[2] : info->block[2];

      radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]) |
                  S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
      radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]) |
                  S_00B820_NUM_THREAD_PARTIAL(partial[1]));
      radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]) |
                  S_00B824_NUM_THREAD_PARTIAL(partial[2]));

      dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
   } else {
      radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]));
      radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]));
      radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]));
   }

   if (info->indirect) {
      uint64_t base_va = si_resource(info->indirect)->gpu_address;

      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ,
                                RADEON_PRIO_DRAW_INDIRECT);

      radeon_emit(PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
      radeon_emit(1);
      radeon_emit(base_va);
      radeon_emit(base_va >> 32);

      radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
      radeon_emit(info->indirect_offset);
      radeon_emit(dispatch_initiator);
   } else {
      radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
      radeon_emit(info->grid[0]);
      radeon_emit(info->grid[1]);
      radeon_emit(info->grid[2]);
      radeon_emit(dispatch_initiator);
   }

   if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }
   radeon_end();
}

static bool si_check_needs_implicit_sync(struct si_context *sctx)
{
   /* If the compute shader is going to read from a texture/image written by a
    * previous draw, we must wait for its completion before continuing.
    * Buffers and image stores (from the draw) are not taken into consideration
    * because that's the app responsibility.
    *
    * The OpenGL 4.6 spec says:
    *
    *    buffer object and texture stores performed by shaders are not
    *    automatically synchronized
    *
    * TODO: Bindless textures are not handled, and thus are not synchronized.
    */
   struct si_shader_info *info = &sctx->cs_shader_state.program->sel.info;
   struct si_samplers *samplers = &sctx->samplers[PIPE_SHADER_COMPUTE];
   unsigned mask = samplers->enabled_mask & info->base.textures_used[0];

   while (mask) {
      int i = u_bit_scan(&mask);
      struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i];

      struct si_resource *res = si_resource(sview->base.texture);
      if (sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, res->buf,
                                            RADEON_USAGE_NEEDS_IMPLICIT_SYNC))
         return true;
   }

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   mask = u_bit_consecutive(0, info->base.num_images) & images->enabled_mask;

   while (mask) {
      int i = u_bit_scan(&mask);
      struct pipe_image_view *sview = &images->views[i];

      struct si_resource *res = si_resource(sview->resource);
      if (sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, res->buf,
                                            RADEON_USAGE_NEEDS_IMPLICIT_SYNC))
         return true;
   }
   return false;
}
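/* Worked example (illustrative) of the partial-workgroup path in
 * si_emit_dispatch_packets() above: a launch covering 224 threads in X with
 * block[0] = 64 would use grid[0] = 4 and last_block[0] = 32; the code then
 * programs NUM_THREAD_FULL = 64 and NUM_THREAD_PARTIAL = 32 for X (full sizes
 * for Y/Z) and sets PARTIAL_TG_EN in the dispatch initiator so only the
 * trailing workgroup runs with the reduced size.
 */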
static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_screen *sscreen = sctx->screen;
   struct si_compute *program = sctx->cs_shader_state.program;
   const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
   int i;
   bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
                           info->block[0] * info->block[1] * info->block[2] > 256;

   if (cs_regalloc_hang)
      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;

   if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
      return;

   if (sctx->has_graphics) {
      if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
         si_update_fb_dirtiness_after_rendering(sctx);
         sctx->last_num_draw_calls = sctx->num_draw_calls;

         if (sctx->force_cb_shader_coherent || si_check_needs_implicit_sync(sctx))
            si_make_CB_shader_coherent(sctx, 0,
                                       sctx->framebuffer.CB_has_shader_readable_metadata,
                                       sctx->framebuffer.all_DCC_pipe_aligned);
      }

      si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
   }

   /* Add buffer sizes for memory checking in need_cs_space. */
   si_context_add_resource_size(sctx, &program->shader.bo->b.b);
   /* TODO: add the scratch buffer */

   if (info->indirect) {
      si_context_add_resource_size(sctx, info->indirect);

      /* Indirect buffers use TC L2 on GFX9, but not older hw. */
      if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
         sctx->flags |= SI_CONTEXT_WB_L2;
         si_resource(info->indirect)->TC_L2_dirty = false;
      }
   }

   si_need_gfx_cs_space(sctx, 0);

   /* If we're using a secure context, determine if cs must be secure or not */
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
      bool secure = si_compute_resources_check_encrypted(sctx);
      if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
         si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
                                  RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION,
                         NULL);
      }
   }

   if (sctx->bo_list_add_all_compute_resources)
      si_compute_resources_add_all_to_bo_list(sctx);

   if (!sctx->cs_shader_state.initialized) {
      si_emit_initial_compute_regs(sctx, &sctx->gfx_cs);

      sctx->cs_shader_state.emitted_program = NULL;
      sctx->cs_shader_state.initialized = true;
   }

   /* First emit registers. */
   bool prefetch;
   if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch))
      return;

   si_upload_compute_shader_descriptors(sctx);
   si_emit_compute_shader_pointers(sctx);

   if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
       unlikely(!si_upload_compute_input(sctx, code_object, info)))
      return;

   /* Global buffers */
   for (i = 0; i < program->max_global_buffers; i++) {
      struct si_resource *buffer = si_resource(program->global_buffers[i]);
      if (!buffer) {
         continue;
      }
      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, buffer, RADEON_USAGE_READWRITE,
                                RADEON_PRIO_COMPUTE_GLOBAL);
   }

   /* Registers that are not read from memory should be set before this: */
   if (sctx->flags)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);

   if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
      sctx->atoms.s.render_cond.emit(sctx);
      si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
   }

   /* Prefetch the compute shader to L2. */
   if (sctx->chip_class >= GFX7 && prefetch)
      si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);

   if (program->ir_type != PIPE_SHADER_IR_NATIVE)
      si_setup_nir_user_data(sctx, info);

   si_emit_dispatch_packets(sctx, info);

   if (unlikely(sctx->current_saved_cs)) {
      si_trace_emit(sctx);
      si_log_compute_state(sctx, sctx->log);
   }

   /* Mark displayable DCC as dirty for bound images. */
   unsigned display_dcc_store_mask = sctx->images[PIPE_SHADER_COMPUTE].display_dcc_store_mask &
                                     BITFIELD_MASK(program->sel.info.base.num_images);
   while (display_dcc_store_mask) {
      struct si_texture *tex = (struct si_texture *)
         sctx->images[PIPE_SHADER_COMPUTE].views[u_bit_scan(&display_dcc_store_mask)].resource;

      si_mark_display_dcc_dirty(sctx, tex);
   }

   /* TODO: Bindless images don't set displayable_dcc_dirty after image stores. */

   sctx->compute_is_busy = true;
   sctx->num_compute_calls++;

   if (cs_regalloc_hang)
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
}

void si_destroy_compute(struct si_compute *program)
{
   struct si_shader_selector *sel = &program->sel;

   if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
      util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready);
      util_queue_fence_destroy(&sel->ready);
   }

   for (unsigned i = 0; i < program->max_global_buffers; i++)
      pipe_resource_reference(&program->global_buffers[i], NULL);
   FREE(program->global_buffers);

   si_shader_destroy(&program->shader);
   ralloc_free(program->sel.nir);
   FREE(program);
}

static void si_delete_compute_state(struct pipe_context *ctx, void *state)
{
   struct si_compute *program = (struct si_compute *)state;
   struct si_context *sctx = (struct si_context *)ctx;

   if (!state)
      return;

   if (program == sctx->cs_shader_state.program)
      sctx->cs_shader_state.program = NULL;

   if (program == sctx->cs_shader_state.emitted_program)
      sctx->cs_shader_state.emitted_program = NULL;

   si_compute_reference(&program, NULL);
}

static void si_set_compute_resources(struct pipe_context *ctx_, unsigned start, unsigned count,
                                     struct pipe_surface **surfaces)
{
}

void si_init_compute_functions(struct si_context *sctx)
{
   sctx->b.create_compute_state = si_create_compute_state;
   sctx->b.delete_compute_state = si_delete_compute_state;
   sctx->b.bind_compute_state = si_bind_compute_state;
   sctx->b.set_compute_resources = si_set_compute_resources;
   sctx->b.set_global_binding = si_set_global_binding;
   sctx->b.launch_grid = si_launch_grid;
}
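/* Usage sketch (illustrative; describes assumptions about the gallium caller,
 * not behavior guaranteed by this file): a frontend typically drives the hooks
 * installed by si_init_compute_functions() as create_compute_state() ->
 * bind_compute_state() -> launch_grid() for each dispatch, with
 * delete_compute_state() on teardown; set_global_binding() is only exercised
 * by OpenCL-style frontends that use global buffers.
 */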