17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2021 Collabora, Ltd. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#include <stdio.h> 267ec681f3Smrg#include "pan_bo.h" 277ec681f3Smrg#include "pan_shader.h" 287ec681f3Smrg#include "pan_scoreboard.h" 297ec681f3Smrg#include "pan_encoder.h" 307ec681f3Smrg#include "pan_indirect_dispatch.h" 317ec681f3Smrg#include "pan_pool.h" 327ec681f3Smrg#include "pan_util.h" 337ec681f3Smrg#include "panfrost-quirks.h" 347ec681f3Smrg#include "compiler/nir/nir_builder.h" 357ec681f3Smrg#include "util/u_memory.h" 367ec681f3Smrg#include "util/macros.h" 377ec681f3Smrg 387ec681f3Smrgstruct indirect_dispatch_inputs { 397ec681f3Smrg mali_ptr job; 407ec681f3Smrg mali_ptr indirect_dim; 417ec681f3Smrg mali_ptr num_wg_sysval[3]; 427ec681f3Smrg}; 437ec681f3Smrg 447ec681f3Smrgstatic nir_ssa_def * 457ec681f3Smrgget_input_data(nir_builder *b, unsigned offset, unsigned size) 467ec681f3Smrg{ 477ec681f3Smrg assert(!(offset & 0x3)); 487ec681f3Smrg assert(size && !(size & 0x3)); 497ec681f3Smrg 507ec681f3Smrg return nir_load_ubo(b, 1, size, 517ec681f3Smrg nir_imm_int(b, 0), 527ec681f3Smrg nir_imm_int(b, offset), 537ec681f3Smrg .align_mul = 4, 547ec681f3Smrg .align_offset = 0, 557ec681f3Smrg .range_base = 0, 567ec681f3Smrg .range = ~0); 577ec681f3Smrg} 587ec681f3Smrg 597ec681f3Smrg#define get_input_field(b, name) \ 607ec681f3Smrg get_input_data(b, offsetof(struct indirect_dispatch_inputs, name), \ 617ec681f3Smrg sizeof(((struct indirect_dispatch_inputs *)0)->name) * 8) 627ec681f3Smrg 637ec681f3Smrgstatic mali_ptr 647ec681f3Smrgget_rsd(const struct panfrost_device *dev) 657ec681f3Smrg{ 667ec681f3Smrg return dev->indirect_dispatch.descs->ptr.gpu; 677ec681f3Smrg} 687ec681f3Smrg 697ec681f3Smrgstatic mali_ptr 707ec681f3Smrgget_tls(const struct panfrost_device *dev) 717ec681f3Smrg{ 727ec681f3Smrg return dev->indirect_dispatch.descs->ptr.gpu + 737ec681f3Smrg pan_size(RENDERER_STATE); 747ec681f3Smrg} 757ec681f3Smrg 767ec681f3Smrgstatic mali_ptr 777ec681f3Smrgget_ubos(struct pan_pool *pool, 787ec681f3Smrg const struct indirect_dispatch_inputs *inputs) 797ec681f3Smrg{ 807ec681f3Smrg struct panfrost_ptr inputs_buf = 817ec681f3Smrg pan_pool_alloc_aligned(pool, ALIGN_POT(sizeof(*inputs), 16), 16); 827ec681f3Smrg 837ec681f3Smrg memcpy(inputs_buf.cpu, inputs, sizeof(*inputs)); 847ec681f3Smrg 857ec681f3Smrg struct panfrost_ptr ubos_buf = 867ec681f3Smrg pan_pool_alloc_desc(pool, UNIFORM_BUFFER); 877ec681f3Smrg 887ec681f3Smrg pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) { 897ec681f3Smrg cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16); 907ec681f3Smrg cfg.pointer = inputs_buf.gpu; 917ec681f3Smrg } 927ec681f3Smrg 937ec681f3Smrg return ubos_buf.gpu; 947ec681f3Smrg} 957ec681f3Smrg 967ec681f3Smrgstatic mali_ptr 977ec681f3Smrgget_push_uniforms(struct pan_pool *pool, 987ec681f3Smrg const struct indirect_dispatch_inputs *inputs) 997ec681f3Smrg{ 1007ec681f3Smrg const struct panfrost_device *dev = pool->dev; 1017ec681f3Smrg struct panfrost_ptr push_consts_buf = 1027ec681f3Smrg pan_pool_alloc_aligned(pool, 1037ec681f3Smrg ALIGN(dev->indirect_dispatch.push.count * 4, 16), 1047ec681f3Smrg 16); 1057ec681f3Smrg uint32_t *out = push_consts_buf.cpu; 1067ec681f3Smrg uint8_t *in = (uint8_t *)inputs; 1077ec681f3Smrg 1087ec681f3Smrg for (unsigned i = 0; i < dev->indirect_dispatch.push.count; ++i) 1097ec681f3Smrg memcpy(out + i, in + dev->indirect_dispatch.push.words[i].offset, 4); 1107ec681f3Smrg 1117ec681f3Smrg return push_consts_buf.gpu; 1127ec681f3Smrg} 1137ec681f3Smrg 1147ec681f3Smrgunsigned 1157ec681f3SmrgGENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, 1167ec681f3Smrg struct pan_scoreboard *scoreboard, 1177ec681f3Smrg const struct pan_indirect_dispatch_info *dispatch_info) 1187ec681f3Smrg{ 1197ec681f3Smrg struct panfrost_device *dev = pool->dev; 1207ec681f3Smrg struct panfrost_ptr job = 1217ec681f3Smrg pan_pool_alloc_desc(pool, COMPUTE_JOB); 1227ec681f3Smrg void *invocation = 1237ec681f3Smrg pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); 1247ec681f3Smrg struct indirect_dispatch_inputs inputs = { 1257ec681f3Smrg .job = dispatch_info->job, 1267ec681f3Smrg .indirect_dim = dispatch_info->indirect_dim, 1277ec681f3Smrg .num_wg_sysval = { 1287ec681f3Smrg dispatch_info->num_wg_sysval[0], 1297ec681f3Smrg dispatch_info->num_wg_sysval[1], 1307ec681f3Smrg dispatch_info->num_wg_sysval[2], 1317ec681f3Smrg }, 1327ec681f3Smrg }; 1337ec681f3Smrg 1347ec681f3Smrg panfrost_pack_work_groups_compute(invocation, 1357ec681f3Smrg 1, 1, 1, 1, 1, 1, 1367ec681f3Smrg false, false); 1377ec681f3Smrg 1387ec681f3Smrg pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { 1397ec681f3Smrg cfg.job_task_split = 2; 1407ec681f3Smrg } 1417ec681f3Smrg 1427ec681f3Smrg pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { 1437ec681f3Smrg cfg.draw_descriptor_is_64b = true; 1447ec681f3Smrg cfg.state = get_rsd(dev); 1457ec681f3Smrg cfg.thread_storage = get_tls(pool->dev); 1467ec681f3Smrg cfg.uniform_buffers = get_ubos(pool, &inputs); 1477ec681f3Smrg cfg.push_uniforms = get_push_uniforms(pool, &inputs); 1487ec681f3Smrg } 1497ec681f3Smrg 1507ec681f3Smrg return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, 1517ec681f3Smrg false, true, 0, 0, &job, false); 1527ec681f3Smrg} 1537ec681f3Smrg 1547ec681f3Smrgvoid 1557ec681f3SmrgGENX(pan_indirect_dispatch_init)(struct panfrost_device *dev) 1567ec681f3Smrg{ 1577ec681f3Smrg nir_builder b = 1587ec681f3Smrg nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, 1597ec681f3Smrg GENX(pan_shader_get_compiler_options)(), 1607ec681f3Smrg "%s", "indirect_dispatch"); 1617ec681f3Smrg b.shader->info.internal = true; 1627ec681f3Smrg nir_variable_create(b.shader, nir_var_mem_ubo, 1637ec681f3Smrg glsl_uint_type(), "inputs"); 1647ec681f3Smrg b.shader->info.num_ubos++; 1657ec681f3Smrg 1667ec681f3Smrg nir_ssa_def *zero = nir_imm_int(&b, 0); 1677ec681f3Smrg nir_ssa_def *one = nir_imm_int(&b, 1); 1687ec681f3Smrg nir_ssa_def *num_wg = nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32); 1697ec681f3Smrg nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0); 1707ec681f3Smrg nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1); 1717ec681f3Smrg nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2); 1727ec681f3Smrg 1737ec681f3Smrg nir_ssa_def *job_hdr_ptr = get_input_field(&b, job); 1747ec681f3Smrg nir_ssa_def *num_wg_flat = nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z)); 1757ec681f3Smrg 1767ec681f3Smrg nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero)); 1777ec681f3Smrg { 1787ec681f3Smrg nir_ssa_def *type_ptr = nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4)); 1797ec681f3Smrg nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8); 1807ec681f3Smrg nir_store_global(&b, type_ptr, 1, ntype, 1); 1817ec681f3Smrg } 1827ec681f3Smrg nir_push_else(&b, NULL); 1837ec681f3Smrg { 1847ec681f3Smrg nir_ssa_def *job_dim_ptr = nir_iadd(&b, job_hdr_ptr, 1857ec681f3Smrg nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION))); 1867ec681f3Smrg nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one); 1877ec681f3Smrg nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one); 1887ec681f3Smrg nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one); 1897ec681f3Smrg nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32); 1907ec681f3Smrg nir_ssa_def *dims = nir_channel(&b, job_dim, 0); 1917ec681f3Smrg nir_ssa_def *split = nir_channel(&b, job_dim, 1); 1927ec681f3Smrg nir_ssa_def *num_wg_x_split = nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f); 1937ec681f3Smrg nir_ssa_def *num_wg_y_split = nir_iadd(&b, num_wg_x_split, 1947ec681f3Smrg nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1))); 1957ec681f3Smrg nir_ssa_def *num_wg_z_split = nir_iadd(&b, num_wg_y_split, 1967ec681f3Smrg nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1))); 1977ec681f3Smrg split = nir_ior(&b, split, 1987ec681f3Smrg nir_ior(&b, 1997ec681f3Smrg nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)), 2007ec681f3Smrg nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22)))); 2017ec681f3Smrg dims = nir_ior(&b, dims, 2027ec681f3Smrg nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split), 2037ec681f3Smrg nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split), 2047ec681f3Smrg nir_ishl(&b, num_wg_z_m1, num_wg_z_split)))); 2057ec681f3Smrg 2067ec681f3Smrg nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3); 2077ec681f3Smrg 2087ec681f3Smrg nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]); 2097ec681f3Smrg 2107ec681f3Smrg nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0))); 2117ec681f3Smrg { 2127ec681f3Smrg nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1); 2137ec681f3Smrg nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, num_wg_y, 1); 2147ec681f3Smrg nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, num_wg_z, 1); 2157ec681f3Smrg } 2167ec681f3Smrg nir_pop_if(&b, NULL); 2177ec681f3Smrg } 2187ec681f3Smrg 2197ec681f3Smrg nir_pop_if(&b, NULL); 2207ec681f3Smrg 2217ec681f3Smrg struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id }; 2227ec681f3Smrg struct pan_shader_info shader_info; 2237ec681f3Smrg struct util_dynarray binary; 2247ec681f3Smrg 2257ec681f3Smrg util_dynarray_init(&binary, NULL); 2267ec681f3Smrg GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); 2277ec681f3Smrg 2287ec681f3Smrg ralloc_free(b.shader); 2297ec681f3Smrg 2307ec681f3Smrg assert(!shader_info.tls_size); 2317ec681f3Smrg assert(!shader_info.wls_size); 2327ec681f3Smrg assert(!shader_info.sysvals.sysval_count); 2337ec681f3Smrg 2347ec681f3Smrg dev->indirect_dispatch.bin = 2357ec681f3Smrg panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE, 2367ec681f3Smrg "Indirect dispatch shader"); 2377ec681f3Smrg 2387ec681f3Smrg memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size); 2397ec681f3Smrg util_dynarray_fini(&binary); 2407ec681f3Smrg 2417ec681f3Smrg dev->indirect_dispatch.push = shader_info.push; 2427ec681f3Smrg dev->indirect_dispatch.descs = 2437ec681f3Smrg panfrost_bo_create(dev, 2447ec681f3Smrg pan_size(RENDERER_STATE) + 2457ec681f3Smrg pan_size(LOCAL_STORAGE), 2467ec681f3Smrg 0, "Indirect dispatch descriptors"); 2477ec681f3Smrg 2487ec681f3Smrg mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu; 2497ec681f3Smrg 2507ec681f3Smrg#if PAN_ARCH <= 5 2517ec681f3Smrg address |= shader_info.midgard.first_tag; 2527ec681f3Smrg#endif 2537ec681f3Smrg 2547ec681f3Smrg void *rsd = dev->indirect_dispatch.descs->ptr.cpu; 2557ec681f3Smrg pan_pack(rsd, RENDERER_STATE, cfg) { 2567ec681f3Smrg pan_shader_prepare_rsd(&shader_info, address, &cfg); 2577ec681f3Smrg } 2587ec681f3Smrg 2597ec681f3Smrg void *tsd = dev->indirect_dispatch.descs->ptr.cpu + 2607ec681f3Smrg pan_size(RENDERER_STATE); 2617ec681f3Smrg pan_pack(tsd, LOCAL_STORAGE, ls) { 2627ec681f3Smrg ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; 2637ec681f3Smrg }; 2647ec681f3Smrg} 2657ec681f3Smrg 2667ec681f3Smrgvoid 2677ec681f3SmrgGENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev) 2687ec681f3Smrg{ 2697ec681f3Smrg panfrost_bo_unreference(dev->indirect_dispatch.bin); 2707ec681f3Smrg panfrost_bo_unreference(dev->indirect_dispatch.descs); 2717ec681f3Smrg} 272