17e102996Smaya/* 27e102996Smaya * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org> 37e102996Smaya * 47e102996Smaya * Permission is hereby granted, free of charge, to any person obtaining a 57e102996Smaya * copy of this software and associated documentation files (the "Software"), 67e102996Smaya * to deal in the Software without restriction, including without limitation 77e102996Smaya * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87e102996Smaya * and/or sell copies of the Software, and to permit persons to whom the 97e102996Smaya * Software is furnished to do so, subject to the following conditions: 107e102996Smaya * 117e102996Smaya * The above copyright notice and this permission notice (including the next 127e102996Smaya * paragraph) shall be included in all copies or substantial portions of the 137e102996Smaya * Software. 147e102996Smaya * 157e102996Smaya * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167e102996Smaya * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177e102996Smaya * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187e102996Smaya * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197e102996Smaya * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207e102996Smaya * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217e102996Smaya * SOFTWARE. 227e102996Smaya * 237e102996Smaya * Authors: 247e102996Smaya * Rob Clark <robclark@freedesktop.org> 257e102996Smaya */ 267e102996Smaya 277e102996Smaya#include "util/ralloc.h" 287e102996Smaya 297ec681f3Smrg#include "freedreno_dev_info.h" 307ec681f3Smrg 317e102996Smaya#include "ir3_compiler.h" 327e102996Smaya 337e102996Smayastatic const struct debug_named_value shader_debug_options[] = { 347ec681f3Smrg /* clang-format off */ 357ec681f3Smrg {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"}, 367ec681f3Smrg {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"}, 377ec681f3Smrg {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"}, 387ec681f3Smrg {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"}, 397ec681f3Smrg {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"}, 407ec681f3Smrg {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"}, 417ec681f3Smrg {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"}, 427ec681f3Smrg {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"}, 437ec681f3Smrg {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"}, 447ec681f3Smrg {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"}, 457ec681f3Smrg {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, 467ec681f3Smrg {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, 477ec681f3Smrg {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, 487ec681f3Smrg#ifdef DEBUG 497ec681f3Smrg /* DEBUG-only options: */ 507ec681f3Smrg {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, 517ec681f3Smrg {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"}, 527ec681f3Smrg#endif 537ec681f3Smrg DEBUG_NAMED_VALUE_END 547ec681f3Smrg /* clang-format on */ 557e102996Smaya}; 567e102996Smaya 577ec681f3SmrgDEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", 587ec681f3Smrg shader_debug_options, 0) 597ec681f3SmrgDEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH", 607ec681f3Smrg NULL) 617e102996Smaya 627e102996Smayaenum ir3_shader_debug ir3_shader_debug = 0; 637ec681f3Smrgconst char *ir3_shader_override_path = NULL; 647e102996Smaya 657ec681f3Smrgvoid 667ec681f3Smrgir3_compiler_destroy(struct ir3_compiler *compiler) 677e102996Smaya{ 687ec681f3Smrg disk_cache_destroy(compiler->disk_cache); 697ec681f3Smrg ralloc_free(compiler); 707ec681f3Smrg} 717ec681f3Smrg 727ec681f3Smrgstruct ir3_compiler * 737ec681f3Smrgir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, 747ec681f3Smrg bool robust_ubo_access) 757ec681f3Smrg{ 767ec681f3Smrg struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); 777ec681f3Smrg 787ec681f3Smrg ir3_shader_debug = debug_get_option_ir3_shader_debug(); 797ec681f3Smrg ir3_shader_override_path = 807ec681f3Smrg !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL; 817ec681f3Smrg 827ec681f3Smrg if (ir3_shader_override_path) { 837ec681f3Smrg ir3_shader_debug |= IR3_DBG_NOCACHE; 847ec681f3Smrg } 857ec681f3Smrg 867ec681f3Smrg compiler->dev = dev; 877ec681f3Smrg compiler->dev_id = dev_id; 887ec681f3Smrg compiler->gen = fd_dev_gen(dev_id); 897ec681f3Smrg compiler->robust_ubo_access = robust_ubo_access; 907ec681f3Smrg 917ec681f3Smrg /* All known GPU's have 32k local memory (aka shared) */ 927ec681f3Smrg compiler->local_mem_size = 32 * 1024; 937ec681f3Smrg /* TODO see if older GPU's were different here */ 947ec681f3Smrg compiler->branchstack_size = 64; 957ec681f3Smrg compiler->wave_granularity = 2; 967ec681f3Smrg compiler->max_waves = 16; 977ec681f3Smrg 987ec681f3Smrg if (compiler->gen >= 6) { 997ec681f3Smrg compiler->samgq_workaround = true; 1007ec681f3Smrg /* a6xx split the pipeline state into geometry and fragment state, in 1017ec681f3Smrg * order to let the VS run ahead of the FS. As a result there are now 1027ec681f3Smrg * separate const files for the the fragment shader and everything 1037ec681f3Smrg * else, and separate limits. There seems to be a shared limit, but 1047ec681f3Smrg * it's higher than the vert or frag limits. 1057ec681f3Smrg * 1067ec681f3Smrg * TODO: The shared limit seems to be different on different on 1077ec681f3Smrg * different models. 1087ec681f3Smrg */ 1097ec681f3Smrg compiler->max_const_pipeline = 640; 1107ec681f3Smrg compiler->max_const_frag = 512; 1117ec681f3Smrg compiler->max_const_geom = 512; 1127ec681f3Smrg compiler->max_const_safe = 128; 1137ec681f3Smrg 1147ec681f3Smrg /* Compute shaders don't share a const file with the FS. Instead they 1157ec681f3Smrg * have their own file, which is smaller than the FS one. 1167ec681f3Smrg * 1177ec681f3Smrg * TODO: is this true on earlier gen's? 1187ec681f3Smrg */ 1197ec681f3Smrg compiler->max_const_compute = 256; 1207ec681f3Smrg 1217ec681f3Smrg /* TODO: implement clip+cull distances on earlier gen's */ 1227ec681f3Smrg compiler->has_clip_cull = true; 1237ec681f3Smrg 1247ec681f3Smrg /* TODO: implement private memory on earlier gen's */ 1257ec681f3Smrg compiler->has_pvtmem = true; 1267ec681f3Smrg 1277ec681f3Smrg compiler->tess_use_shared = 1287ec681f3Smrg fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared; 1297ec681f3Smrg 1307ec681f3Smrg compiler->storage_16bit = 1317ec681f3Smrg fd_dev_info(compiler->dev_id)->a6xx.storage_16bit; 1327ec681f3Smrg } else { 1337ec681f3Smrg compiler->max_const_pipeline = 512; 1347ec681f3Smrg compiler->max_const_geom = 512; 1357ec681f3Smrg compiler->max_const_frag = 512; 1367ec681f3Smrg compiler->max_const_compute = 512; 1377ec681f3Smrg 1387ec681f3Smrg /* Note: this will have to change if/when we support tess+GS on 1397ec681f3Smrg * earlier gen's. 1407ec681f3Smrg */ 1417ec681f3Smrg compiler->max_const_safe = 256; 1427ec681f3Smrg } 1437ec681f3Smrg 1447ec681f3Smrg if (compiler->gen >= 6) { 1457ec681f3Smrg compiler->reg_size_vec4 = 1467ec681f3Smrg fd_dev_info(compiler->dev_id)->a6xx.reg_size_vec4; 1477ec681f3Smrg } else if (compiler->gen >= 4) { 1487ec681f3Smrg /* On a4xx-a5xx, using r24.x and above requires using the smallest 1497ec681f3Smrg * threadsize. 1507ec681f3Smrg */ 1517ec681f3Smrg compiler->reg_size_vec4 = 48; 1527ec681f3Smrg } else { 1537ec681f3Smrg /* TODO: confirm this */ 1547ec681f3Smrg compiler->reg_size_vec4 = 96; 1557ec681f3Smrg } 1567ec681f3Smrg 1577ec681f3Smrg if (compiler->gen >= 6) { 1587ec681f3Smrg compiler->threadsize_base = 64; 1597ec681f3Smrg } else if (compiler->gen >= 4) { 1607ec681f3Smrg /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan 1617ec681f3Smrg * 1.1 subgroupSize which is 32. 1627ec681f3Smrg */ 1637ec681f3Smrg compiler->threadsize_base = 32; 1647ec681f3Smrg } else { 1657ec681f3Smrg compiler->threadsize_base = 8; 1667ec681f3Smrg } 1677ec681f3Smrg 1687ec681f3Smrg if (compiler->gen >= 4) { 1697ec681f3Smrg /* need special handling for "flat" */ 1707ec681f3Smrg compiler->flat_bypass = true; 1717ec681f3Smrg compiler->levels_add_one = false; 1727ec681f3Smrg compiler->unminify_coords = false; 1737ec681f3Smrg compiler->txf_ms_with_isaml = false; 1747ec681f3Smrg compiler->array_index_add_half = true; 1757ec681f3Smrg compiler->instr_align = 16; 1767ec681f3Smrg compiler->const_upload_unit = 4; 1777ec681f3Smrg } else { 1787ec681f3Smrg /* no special handling for "flat" */ 1797ec681f3Smrg compiler->flat_bypass = false; 1807ec681f3Smrg compiler->levels_add_one = true; 1817ec681f3Smrg compiler->unminify_coords = true; 1827ec681f3Smrg compiler->txf_ms_with_isaml = true; 1837ec681f3Smrg compiler->array_index_add_half = false; 1847ec681f3Smrg compiler->instr_align = 4; 1857ec681f3Smrg compiler->const_upload_unit = 8; 1867ec681f3Smrg } 1877ec681f3Smrg 1887ec681f3Smrg ir3_disk_cache_init(compiler); 1897ec681f3Smrg 1907ec681f3Smrg return compiler; 1917e102996Smaya} 192