/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
247ec681f3Smrg
#include <stdio.h>
#include <string.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_dispatch.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "panfrost-quirks.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"
377ec681f3Smrg
387ec681f3Smrgstruct indirect_dispatch_inputs {
397ec681f3Smrg        mali_ptr job;
407ec681f3Smrg        mali_ptr indirect_dim;
417ec681f3Smrg        mali_ptr num_wg_sysval[3];
427ec681f3Smrg};
437ec681f3Smrg
447ec681f3Smrgstatic nir_ssa_def *
457ec681f3Smrgget_input_data(nir_builder *b, unsigned offset, unsigned size)
467ec681f3Smrg{
477ec681f3Smrg        assert(!(offset & 0x3));
487ec681f3Smrg        assert(size && !(size & 0x3));
497ec681f3Smrg
507ec681f3Smrg        return nir_load_ubo(b, 1, size,
517ec681f3Smrg                            nir_imm_int(b, 0),
527ec681f3Smrg                            nir_imm_int(b, offset),
537ec681f3Smrg                            .align_mul = 4,
547ec681f3Smrg                            .align_offset = 0,
557ec681f3Smrg                            .range_base = 0,
567ec681f3Smrg                            .range = ~0);
577ec681f3Smrg}
587ec681f3Smrg
/*
 * Load the member `name` of struct indirect_dispatch_inputs from UBO 0.
 * The byte offset comes from offsetof() and the bit width from the member's
 * sizeof(), so the load always matches the CPU-side layout. `name` may be an
 * array element designator such as num_wg_sysval[1].
 */
#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_dispatch_inputs, name), \
                       sizeof(((struct indirect_dispatch_inputs *)0)->name) * 8)
627ec681f3Smrg
637ec681f3Smrgstatic mali_ptr
647ec681f3Smrgget_rsd(const struct panfrost_device *dev)
657ec681f3Smrg{
667ec681f3Smrg        return dev->indirect_dispatch.descs->ptr.gpu;
677ec681f3Smrg}
687ec681f3Smrg
697ec681f3Smrgstatic mali_ptr
707ec681f3Smrgget_tls(const struct panfrost_device *dev)
717ec681f3Smrg{
727ec681f3Smrg        return dev->indirect_dispatch.descs->ptr.gpu +
737ec681f3Smrg               pan_size(RENDERER_STATE);
747ec681f3Smrg}
757ec681f3Smrg
767ec681f3Smrgstatic mali_ptr
777ec681f3Smrgget_ubos(struct pan_pool *pool,
787ec681f3Smrg         const struct indirect_dispatch_inputs *inputs)
797ec681f3Smrg{
807ec681f3Smrg        struct panfrost_ptr inputs_buf =
817ec681f3Smrg                pan_pool_alloc_aligned(pool, ALIGN_POT(sizeof(*inputs), 16), 16);
827ec681f3Smrg
837ec681f3Smrg        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));
847ec681f3Smrg
857ec681f3Smrg        struct panfrost_ptr ubos_buf =
867ec681f3Smrg                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);
877ec681f3Smrg
887ec681f3Smrg        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
897ec681f3Smrg                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
907ec681f3Smrg                cfg.pointer = inputs_buf.gpu;
917ec681f3Smrg        }
927ec681f3Smrg
937ec681f3Smrg        return ubos_buf.gpu;
947ec681f3Smrg}
957ec681f3Smrg
967ec681f3Smrgstatic mali_ptr
977ec681f3Smrgget_push_uniforms(struct pan_pool *pool,
987ec681f3Smrg                  const struct indirect_dispatch_inputs *inputs)
997ec681f3Smrg{
1007ec681f3Smrg        const struct panfrost_device *dev = pool->dev;
1017ec681f3Smrg        struct panfrost_ptr push_consts_buf =
1027ec681f3Smrg                pan_pool_alloc_aligned(pool,
1037ec681f3Smrg                                       ALIGN(dev->indirect_dispatch.push.count * 4, 16),
1047ec681f3Smrg                                       16);
1057ec681f3Smrg        uint32_t *out = push_consts_buf.cpu;
1067ec681f3Smrg        uint8_t *in = (uint8_t *)inputs;
1077ec681f3Smrg
1087ec681f3Smrg        for (unsigned i = 0; i < dev->indirect_dispatch.push.count; ++i)
1097ec681f3Smrg                memcpy(out + i, in +  dev->indirect_dispatch.push.words[i].offset, 4);
1107ec681f3Smrg
1117ec681f3Smrg        return push_consts_buf.gpu;
1127ec681f3Smrg}
1137ec681f3Smrg
1147ec681f3Smrgunsigned
1157ec681f3SmrgGENX(pan_indirect_dispatch_emit)(struct pan_pool *pool,
1167ec681f3Smrg                                 struct pan_scoreboard *scoreboard,
1177ec681f3Smrg                                 const struct pan_indirect_dispatch_info *dispatch_info)
1187ec681f3Smrg{
1197ec681f3Smrg        struct panfrost_device *dev = pool->dev;
1207ec681f3Smrg        struct panfrost_ptr job =
1217ec681f3Smrg                pan_pool_alloc_desc(pool, COMPUTE_JOB);
1227ec681f3Smrg        void *invocation =
1237ec681f3Smrg                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
1247ec681f3Smrg        struct indirect_dispatch_inputs inputs = {
1257ec681f3Smrg                .job = dispatch_info->job,
1267ec681f3Smrg                .indirect_dim = dispatch_info->indirect_dim,
1277ec681f3Smrg                .num_wg_sysval = {
1287ec681f3Smrg                        dispatch_info->num_wg_sysval[0],
1297ec681f3Smrg                        dispatch_info->num_wg_sysval[1],
1307ec681f3Smrg                        dispatch_info->num_wg_sysval[2],
1317ec681f3Smrg                },
1327ec681f3Smrg        };
1337ec681f3Smrg
1347ec681f3Smrg        panfrost_pack_work_groups_compute(invocation,
1357ec681f3Smrg                                          1, 1, 1, 1, 1, 1,
1367ec681f3Smrg                                          false, false);
1377ec681f3Smrg
1387ec681f3Smrg        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
1397ec681f3Smrg                cfg.job_task_split = 2;
1407ec681f3Smrg        }
1417ec681f3Smrg
1427ec681f3Smrg        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
1437ec681f3Smrg                cfg.draw_descriptor_is_64b = true;
1447ec681f3Smrg                cfg.state = get_rsd(dev);
1457ec681f3Smrg                cfg.thread_storage = get_tls(pool->dev);
1467ec681f3Smrg                cfg.uniform_buffers = get_ubos(pool, &inputs);
1477ec681f3Smrg                cfg.push_uniforms = get_push_uniforms(pool, &inputs);
1487ec681f3Smrg        }
1497ec681f3Smrg
1507ec681f3Smrg        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
1517ec681f3Smrg                                false, true, 0, 0, &job, false);
1527ec681f3Smrg}
1537ec681f3Smrg
1547ec681f3Smrgvoid
1557ec681f3SmrgGENX(pan_indirect_dispatch_init)(struct panfrost_device *dev)
1567ec681f3Smrg{
1577ec681f3Smrg        nir_builder b =
1587ec681f3Smrg                nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1597ec681f3Smrg                                               GENX(pan_shader_get_compiler_options)(),
1607ec681f3Smrg                                               "%s", "indirect_dispatch");
1617ec681f3Smrg        b.shader->info.internal = true;
1627ec681f3Smrg        nir_variable_create(b.shader, nir_var_mem_ubo,
1637ec681f3Smrg                            glsl_uint_type(), "inputs");
1647ec681f3Smrg        b.shader->info.num_ubos++;
1657ec681f3Smrg
1667ec681f3Smrg        nir_ssa_def *zero = nir_imm_int(&b, 0);
1677ec681f3Smrg        nir_ssa_def *one = nir_imm_int(&b, 1);
1687ec681f3Smrg        nir_ssa_def *num_wg = nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32);
1697ec681f3Smrg        nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0);
1707ec681f3Smrg        nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1);
1717ec681f3Smrg        nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2);
1727ec681f3Smrg
1737ec681f3Smrg        nir_ssa_def *job_hdr_ptr = get_input_field(&b, job);
1747ec681f3Smrg        nir_ssa_def *num_wg_flat = nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z));
1757ec681f3Smrg
1767ec681f3Smrg        nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero));
1777ec681f3Smrg        {
1787ec681f3Smrg                nir_ssa_def *type_ptr = nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4));
1797ec681f3Smrg                nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8);
1807ec681f3Smrg                nir_store_global(&b, type_ptr, 1, ntype, 1);
1817ec681f3Smrg        }
1827ec681f3Smrg        nir_push_else(&b, NULL);
1837ec681f3Smrg        {
1847ec681f3Smrg                nir_ssa_def *job_dim_ptr = nir_iadd(&b, job_hdr_ptr,
1857ec681f3Smrg                                nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION)));
1867ec681f3Smrg                nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one);
1877ec681f3Smrg                nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one);
1887ec681f3Smrg                nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one);
1897ec681f3Smrg                nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32);
1907ec681f3Smrg                nir_ssa_def *dims = nir_channel(&b, job_dim, 0);
1917ec681f3Smrg                nir_ssa_def *split = nir_channel(&b, job_dim, 1);
1927ec681f3Smrg                nir_ssa_def *num_wg_x_split = nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f);
1937ec681f3Smrg                nir_ssa_def *num_wg_y_split = nir_iadd(&b, num_wg_x_split,
1947ec681f3Smrg                                nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1)));
1957ec681f3Smrg                nir_ssa_def *num_wg_z_split = nir_iadd(&b, num_wg_y_split,
1967ec681f3Smrg                                nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1)));
1977ec681f3Smrg                split = nir_ior(&b, split,
1987ec681f3Smrg                                nir_ior(&b,
1997ec681f3Smrg                                        nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)),
2007ec681f3Smrg                                        nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22))));
2017ec681f3Smrg                dims = nir_ior(&b, dims,
2027ec681f3Smrg                               nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split),
2037ec681f3Smrg                                       nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split),
2047ec681f3Smrg                                               nir_ishl(&b, num_wg_z_m1, num_wg_z_split))));
2057ec681f3Smrg
2067ec681f3Smrg                nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3);
2077ec681f3Smrg
2087ec681f3Smrg                nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]);
2097ec681f3Smrg
2107ec681f3Smrg                nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0)));
2117ec681f3Smrg                {
2127ec681f3Smrg                        nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1);
2137ec681f3Smrg                        nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, num_wg_y, 1);
2147ec681f3Smrg                        nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, num_wg_z, 1);
2157ec681f3Smrg                }
2167ec681f3Smrg                nir_pop_if(&b, NULL);
2177ec681f3Smrg        }
2187ec681f3Smrg
2197ec681f3Smrg        nir_pop_if(&b, NULL);
2207ec681f3Smrg
2217ec681f3Smrg        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
2227ec681f3Smrg        struct pan_shader_info shader_info;
2237ec681f3Smrg        struct util_dynarray binary;
2247ec681f3Smrg
2257ec681f3Smrg        util_dynarray_init(&binary, NULL);
2267ec681f3Smrg        GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info);
2277ec681f3Smrg
2287ec681f3Smrg        ralloc_free(b.shader);
2297ec681f3Smrg
2307ec681f3Smrg        assert(!shader_info.tls_size);
2317ec681f3Smrg        assert(!shader_info.wls_size);
2327ec681f3Smrg        assert(!shader_info.sysvals.sysval_count);
2337ec681f3Smrg
2347ec681f3Smrg        dev->indirect_dispatch.bin =
2357ec681f3Smrg                panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE,
2367ec681f3Smrg                                "Indirect dispatch shader");
2377ec681f3Smrg
2387ec681f3Smrg        memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size);
2397ec681f3Smrg        util_dynarray_fini(&binary);
2407ec681f3Smrg
2417ec681f3Smrg        dev->indirect_dispatch.push = shader_info.push;
2427ec681f3Smrg        dev->indirect_dispatch.descs =
2437ec681f3Smrg                panfrost_bo_create(dev,
2447ec681f3Smrg                                   pan_size(RENDERER_STATE) +
2457ec681f3Smrg                                   pan_size(LOCAL_STORAGE),
2467ec681f3Smrg                                   0, "Indirect dispatch descriptors");
2477ec681f3Smrg
2487ec681f3Smrg        mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu;
2497ec681f3Smrg
2507ec681f3Smrg#if PAN_ARCH <= 5
2517ec681f3Smrg        address |= shader_info.midgard.first_tag;
2527ec681f3Smrg#endif
2537ec681f3Smrg
2547ec681f3Smrg        void *rsd = dev->indirect_dispatch.descs->ptr.cpu;
2557ec681f3Smrg        pan_pack(rsd, RENDERER_STATE, cfg) {
2567ec681f3Smrg                pan_shader_prepare_rsd(&shader_info, address, &cfg);
2577ec681f3Smrg        }
2587ec681f3Smrg
2597ec681f3Smrg        void *tsd = dev->indirect_dispatch.descs->ptr.cpu +
2607ec681f3Smrg                    pan_size(RENDERER_STATE);
2617ec681f3Smrg        pan_pack(tsd, LOCAL_STORAGE, ls) {
2627ec681f3Smrg                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
2637ec681f3Smrg        };
2647ec681f3Smrg}
2657ec681f3Smrg
2667ec681f3Smrgvoid
2677ec681f3SmrgGENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev)
2687ec681f3Smrg{
2697ec681f3Smrg        panfrost_bo_unreference(dev->indirect_dispatch.bin);
2707ec681f3Smrg        panfrost_bo_unreference(dev->indirect_dispatch.descs);
2717ec681f3Smrg}
272