1/* 2 * Copyright (C) 2021 Collabora, Ltd. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 */ 24 25#include <stdio.h> 26#include "pan_bo.h" 27#include "pan_shader.h" 28#include "pan_scoreboard.h" 29#include "pan_encoder.h" 30#include "pan_indirect_dispatch.h" 31#include "pan_pool.h" 32#include "pan_util.h" 33#include "panfrost-quirks.h" 34#include "compiler/nir/nir_builder.h" 35#include "util/u_memory.h" 36#include "util/macros.h" 37 38struct indirect_dispatch_inputs { 39 mali_ptr job; 40 mali_ptr indirect_dim; 41 mali_ptr num_wg_sysval[3]; 42}; 43 44static nir_ssa_def * 45get_input_data(nir_builder *b, unsigned offset, unsigned size) 46{ 47 assert(!(offset & 0x3)); 48 assert(size && !(size & 0x3)); 49 50 return nir_load_ubo(b, 1, size, 51 nir_imm_int(b, 0), 52 nir_imm_int(b, offset), 53 .align_mul = 4, 54 .align_offset = 0, 55 .range_base = 0, 56 .range = ~0); 57} 58 59#define get_input_field(b, name) \ 60 get_input_data(b, offsetof(struct indirect_dispatch_inputs, name), \ 61 sizeof(((struct indirect_dispatch_inputs *)0)->name) * 8) 62 63static mali_ptr 64get_rsd(const struct panfrost_device *dev) 65{ 66 return dev->indirect_dispatch.descs->ptr.gpu; 67} 68 69static mali_ptr 70get_tls(const struct panfrost_device *dev) 71{ 72 return dev->indirect_dispatch.descs->ptr.gpu + 73 pan_size(RENDERER_STATE); 74} 75 76static mali_ptr 77get_ubos(struct pan_pool *pool, 78 const struct indirect_dispatch_inputs *inputs) 79{ 80 struct panfrost_ptr inputs_buf = 81 pan_pool_alloc_aligned(pool, ALIGN_POT(sizeof(*inputs), 16), 16); 82 83 memcpy(inputs_buf.cpu, inputs, sizeof(*inputs)); 84 85 struct panfrost_ptr ubos_buf = 86 pan_pool_alloc_desc(pool, UNIFORM_BUFFER); 87 88 pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) { 89 cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16); 90 cfg.pointer = inputs_buf.gpu; 91 } 92 93 return ubos_buf.gpu; 94} 95 96static mali_ptr 97get_push_uniforms(struct pan_pool *pool, 98 const struct indirect_dispatch_inputs *inputs) 99{ 100 const struct panfrost_device *dev = pool->dev; 101 struct panfrost_ptr push_consts_buf = 102 pan_pool_alloc_aligned(pool, 103 ALIGN(dev->indirect_dispatch.push.count * 4, 16), 104 16); 105 uint32_t *out = push_consts_buf.cpu; 106 uint8_t *in = (uint8_t *)inputs; 107 108 for (unsigned i = 0; i < dev->indirect_dispatch.push.count; ++i) 109 memcpy(out + i, in + dev->indirect_dispatch.push.words[i].offset, 4); 110 111 return push_consts_buf.gpu; 112} 113 114unsigned 115GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool, 116 struct pan_scoreboard *scoreboard, 117 const struct pan_indirect_dispatch_info *dispatch_info) 118{ 119 struct panfrost_device *dev = pool->dev; 120 struct panfrost_ptr job = 121 pan_pool_alloc_desc(pool, COMPUTE_JOB); 122 void *invocation = 123 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION); 124 struct indirect_dispatch_inputs inputs = { 125 .job = dispatch_info->job, 126 .indirect_dim = dispatch_info->indirect_dim, 127 .num_wg_sysval = { 128 dispatch_info->num_wg_sysval[0], 129 dispatch_info->num_wg_sysval[1], 130 dispatch_info->num_wg_sysval[2], 131 }, 132 }; 133 134 panfrost_pack_work_groups_compute(invocation, 135 1, 1, 1, 1, 1, 1, 136 false, false); 137 138 pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { 139 cfg.job_task_split = 2; 140 } 141 142 pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { 143 cfg.draw_descriptor_is_64b = true; 144 cfg.state = get_rsd(dev); 145 cfg.thread_storage = get_tls(pool->dev); 146 cfg.uniform_buffers = get_ubos(pool, &inputs); 147 cfg.push_uniforms = get_push_uniforms(pool, &inputs); 148 } 149 150 return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE, 151 false, true, 0, 0, &job, false); 152} 153 154void 155GENX(pan_indirect_dispatch_init)(struct panfrost_device *dev) 156{ 157 nir_builder b = 158 nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, 159 GENX(pan_shader_get_compiler_options)(), 160 "%s", "indirect_dispatch"); 161 b.shader->info.internal = true; 162 nir_variable_create(b.shader, nir_var_mem_ubo, 163 glsl_uint_type(), "inputs"); 164 b.shader->info.num_ubos++; 165 166 nir_ssa_def *zero = nir_imm_int(&b, 0); 167 nir_ssa_def *one = nir_imm_int(&b, 1); 168 nir_ssa_def *num_wg = nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32); 169 nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0); 170 nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1); 171 nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2); 172 173 nir_ssa_def *job_hdr_ptr = get_input_field(&b, job); 174 nir_ssa_def *num_wg_flat = nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z)); 175 176 nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero)); 177 { 178 nir_ssa_def *type_ptr = nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4)); 179 nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8); 180 nir_store_global(&b, type_ptr, 1, ntype, 1); 181 } 182 nir_push_else(&b, NULL); 183 { 184 nir_ssa_def *job_dim_ptr = nir_iadd(&b, job_hdr_ptr, 185 nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION))); 186 nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one); 187 nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one); 188 nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one); 189 nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32); 190 nir_ssa_def *dims = nir_channel(&b, job_dim, 0); 191 nir_ssa_def *split = nir_channel(&b, job_dim, 1); 192 nir_ssa_def *num_wg_x_split = nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f); 193 nir_ssa_def *num_wg_y_split = nir_iadd(&b, num_wg_x_split, 194 nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1))); 195 nir_ssa_def *num_wg_z_split = nir_iadd(&b, num_wg_y_split, 196 nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1))); 197 split = nir_ior(&b, split, 198 nir_ior(&b, 199 nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)), 200 nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22)))); 201 dims = nir_ior(&b, dims, 202 nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split), 203 nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split), 204 nir_ishl(&b, num_wg_z_m1, num_wg_z_split)))); 205 206 nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3); 207 208 nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]); 209 210 nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0))); 211 { 212 nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1); 213 nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, num_wg_y, 1); 214 nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, num_wg_z, 1); 215 } 216 nir_pop_if(&b, NULL); 217 } 218 219 nir_pop_if(&b, NULL); 220 221 struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id }; 222 struct pan_shader_info shader_info; 223 struct util_dynarray binary; 224 225 util_dynarray_init(&binary, NULL); 226 GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); 227 228 ralloc_free(b.shader); 229 230 assert(!shader_info.tls_size); 231 assert(!shader_info.wls_size); 232 assert(!shader_info.sysvals.sysval_count); 233 234 dev->indirect_dispatch.bin = 235 panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE, 236 "Indirect dispatch shader"); 237 238 memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size); 239 util_dynarray_fini(&binary); 240 241 dev->indirect_dispatch.push = shader_info.push; 242 dev->indirect_dispatch.descs = 243 panfrost_bo_create(dev, 244 pan_size(RENDERER_STATE) + 245 pan_size(LOCAL_STORAGE), 246 0, "Indirect dispatch descriptors"); 247 248 mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu; 249 250#if PAN_ARCH <= 5 251 address |= shader_info.midgard.first_tag; 252#endif 253 254 void *rsd = dev->indirect_dispatch.descs->ptr.cpu; 255 pan_pack(rsd, RENDERER_STATE, cfg) { 256 pan_shader_prepare_rsd(&shader_info, address, &cfg); 257 } 258 259 void *tsd = dev->indirect_dispatch.descs->ptr.cpu + 260 pan_size(RENDERER_STATE); 261 pan_pack(tsd, LOCAL_STORAGE, ls) { 262 ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; 263 }; 264} 265 266void 267GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev) 268{ 269 panfrost_bo_unreference(dev->indirect_dispatch.bin); 270 panfrost_bo_unreference(dev->indirect_dispatch.descs); 271} 272