17e102996Smaya/*
27e102996Smaya * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
37e102996Smaya *
47e102996Smaya * Permission is hereby granted, free of charge, to any person obtaining a
57e102996Smaya * copy of this software and associated documentation files (the "Software"),
67e102996Smaya * to deal in the Software without restriction, including without limitation
77e102996Smaya * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87e102996Smaya * and/or sell copies of the Software, and to permit persons to whom the
97e102996Smaya * Software is furnished to do so, subject to the following conditions:
107e102996Smaya *
117e102996Smaya * The above copyright notice and this permission notice (including the next
127e102996Smaya * paragraph) shall be included in all copies or substantial portions of the
137e102996Smaya * Software.
147e102996Smaya *
157e102996Smaya * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167e102996Smaya * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177e102996Smaya * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187e102996Smaya * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197e102996Smaya * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207e102996Smaya * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217e102996Smaya * SOFTWARE.
227e102996Smaya *
237e102996Smaya * Authors:
247e102996Smaya *    Rob Clark <robclark@freedesktop.org>
257e102996Smaya */
267e102996Smaya
277e102996Smaya#include "util/ralloc.h"
287e102996Smaya
297ec681f3Smrg#include "freedreno_dev_info.h"
307ec681f3Smrg
317e102996Smaya#include "ir3_compiler.h"
327e102996Smaya
337e102996Smayastatic const struct debug_named_value shader_debug_options[] = {
347ec681f3Smrg   /* clang-format off */
357ec681f3Smrg   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
367ec681f3Smrg   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
377ec681f3Smrg   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
387ec681f3Smrg   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
397ec681f3Smrg   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
407ec681f3Smrg   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
417ec681f3Smrg   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
427ec681f3Smrg   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
437ec681f3Smrg   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
447ec681f3Smrg   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
457ec681f3Smrg   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
467ec681f3Smrg   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
477ec681f3Smrg   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
487ec681f3Smrg#ifdef DEBUG
497ec681f3Smrg   /* DEBUG-only options: */
507ec681f3Smrg   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
517ec681f3Smrg   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
527ec681f3Smrg#endif
537ec681f3Smrg   DEBUG_NAMED_VALUE_END
547ec681f3Smrg   /* clang-format on */
557e102996Smaya};
567e102996Smaya
577ec681f3SmrgDEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
587ec681f3Smrg                            shader_debug_options, 0)
597ec681f3SmrgDEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
607ec681f3Smrg                      NULL)
617e102996Smaya
627e102996Smayaenum ir3_shader_debug ir3_shader_debug = 0;
637ec681f3Smrgconst char *ir3_shader_override_path = NULL;
647e102996Smaya
657ec681f3Smrgvoid
667ec681f3Smrgir3_compiler_destroy(struct ir3_compiler *compiler)
677e102996Smaya{
687ec681f3Smrg   disk_cache_destroy(compiler->disk_cache);
697ec681f3Smrg   ralloc_free(compiler);
707ec681f3Smrg}
717ec681f3Smrg
727ec681f3Smrgstruct ir3_compiler *
737ec681f3Smrgir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
747ec681f3Smrg                    bool robust_ubo_access)
757ec681f3Smrg{
767ec681f3Smrg   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
777ec681f3Smrg
787ec681f3Smrg   ir3_shader_debug = debug_get_option_ir3_shader_debug();
797ec681f3Smrg   ir3_shader_override_path =
807ec681f3Smrg      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
817ec681f3Smrg
827ec681f3Smrg   if (ir3_shader_override_path) {
837ec681f3Smrg      ir3_shader_debug |= IR3_DBG_NOCACHE;
847ec681f3Smrg   }
857ec681f3Smrg
867ec681f3Smrg   compiler->dev = dev;
877ec681f3Smrg   compiler->dev_id = dev_id;
887ec681f3Smrg   compiler->gen = fd_dev_gen(dev_id);
897ec681f3Smrg   compiler->robust_ubo_access = robust_ubo_access;
907ec681f3Smrg
917ec681f3Smrg   /* All known GPU's have 32k local memory (aka shared) */
927ec681f3Smrg   compiler->local_mem_size = 32 * 1024;
937ec681f3Smrg   /* TODO see if older GPU's were different here */
947ec681f3Smrg   compiler->branchstack_size = 64;
957ec681f3Smrg   compiler->wave_granularity = 2;
967ec681f3Smrg   compiler->max_waves = 16;
977ec681f3Smrg
987ec681f3Smrg   if (compiler->gen >= 6) {
997ec681f3Smrg      compiler->samgq_workaround = true;
1007ec681f3Smrg      /* a6xx split the pipeline state into geometry and fragment state, in
1017ec681f3Smrg       * order to let the VS run ahead of the FS. As a result there are now
1027ec681f3Smrg       * separate const files for the the fragment shader and everything
1037ec681f3Smrg       * else, and separate limits. There seems to be a shared limit, but
1047ec681f3Smrg       * it's higher than the vert or frag limits.
1057ec681f3Smrg       *
1067ec681f3Smrg       * TODO: The shared limit seems to be different on different on
1077ec681f3Smrg       * different models.
1087ec681f3Smrg       */
1097ec681f3Smrg      compiler->max_const_pipeline = 640;
1107ec681f3Smrg      compiler->max_const_frag = 512;
1117ec681f3Smrg      compiler->max_const_geom = 512;
1127ec681f3Smrg      compiler->max_const_safe = 128;
1137ec681f3Smrg
1147ec681f3Smrg      /* Compute shaders don't share a const file with the FS. Instead they
1157ec681f3Smrg       * have their own file, which is smaller than the FS one.
1167ec681f3Smrg       *
1177ec681f3Smrg       * TODO: is this true on earlier gen's?
1187ec681f3Smrg       */
1197ec681f3Smrg      compiler->max_const_compute = 256;
1207ec681f3Smrg
1217ec681f3Smrg      /* TODO: implement clip+cull distances on earlier gen's */
1227ec681f3Smrg      compiler->has_clip_cull = true;
1237ec681f3Smrg
1247ec681f3Smrg      /* TODO: implement private memory on earlier gen's */
1257ec681f3Smrg      compiler->has_pvtmem = true;
1267ec681f3Smrg
1277ec681f3Smrg      compiler->tess_use_shared =
1287ec681f3Smrg            fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared;
1297ec681f3Smrg
1307ec681f3Smrg      compiler->storage_16bit =
1317ec681f3Smrg            fd_dev_info(compiler->dev_id)->a6xx.storage_16bit;
1327ec681f3Smrg   } else {
1337ec681f3Smrg      compiler->max_const_pipeline = 512;
1347ec681f3Smrg      compiler->max_const_geom = 512;
1357ec681f3Smrg      compiler->max_const_frag = 512;
1367ec681f3Smrg      compiler->max_const_compute = 512;
1377ec681f3Smrg
1387ec681f3Smrg      /* Note: this will have to change if/when we support tess+GS on
1397ec681f3Smrg       * earlier gen's.
1407ec681f3Smrg       */
1417ec681f3Smrg      compiler->max_const_safe = 256;
1427ec681f3Smrg   }
1437ec681f3Smrg
1447ec681f3Smrg   if (compiler->gen >= 6) {
1457ec681f3Smrg      compiler->reg_size_vec4 =
1467ec681f3Smrg            fd_dev_info(compiler->dev_id)->a6xx.reg_size_vec4;
1477ec681f3Smrg   } else if (compiler->gen >= 4) {
1487ec681f3Smrg      /* On a4xx-a5xx, using r24.x and above requires using the smallest
1497ec681f3Smrg       * threadsize.
1507ec681f3Smrg       */
1517ec681f3Smrg      compiler->reg_size_vec4 = 48;
1527ec681f3Smrg   } else {
1537ec681f3Smrg      /* TODO: confirm this */
1547ec681f3Smrg      compiler->reg_size_vec4 = 96;
1557ec681f3Smrg   }
1567ec681f3Smrg
1577ec681f3Smrg   if (compiler->gen >= 6) {
1587ec681f3Smrg      compiler->threadsize_base = 64;
1597ec681f3Smrg   } else if (compiler->gen >= 4) {
1607ec681f3Smrg      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
1617ec681f3Smrg       * 1.1 subgroupSize which is 32.
1627ec681f3Smrg       */
1637ec681f3Smrg      compiler->threadsize_base = 32;
1647ec681f3Smrg   } else {
1657ec681f3Smrg      compiler->threadsize_base = 8;
1667ec681f3Smrg   }
1677ec681f3Smrg
1687ec681f3Smrg   if (compiler->gen >= 4) {
1697ec681f3Smrg      /* need special handling for "flat" */
1707ec681f3Smrg      compiler->flat_bypass = true;
1717ec681f3Smrg      compiler->levels_add_one = false;
1727ec681f3Smrg      compiler->unminify_coords = false;
1737ec681f3Smrg      compiler->txf_ms_with_isaml = false;
1747ec681f3Smrg      compiler->array_index_add_half = true;
1757ec681f3Smrg      compiler->instr_align = 16;
1767ec681f3Smrg      compiler->const_upload_unit = 4;
1777ec681f3Smrg   } else {
1787ec681f3Smrg      /* no special handling for "flat" */
1797ec681f3Smrg      compiler->flat_bypass = false;
1807ec681f3Smrg      compiler->levels_add_one = true;
1817ec681f3Smrg      compiler->unminify_coords = true;
1827ec681f3Smrg      compiler->txf_ms_with_isaml = true;
1837ec681f3Smrg      compiler->array_index_add_half = false;
1847ec681f3Smrg      compiler->instr_align = 4;
1857ec681f3Smrg      compiler->const_upload_unit = 8;
1867ec681f3Smrg   }
1877ec681f3Smrg
1887ec681f3Smrg   ir3_disk_cache_init(compiler);
1897ec681f3Smrg
1907ec681f3Smrg   return compiler;
1917e102996Smaya}
192