101e04c3fSmrg/*
201e04c3fSmrg * Copyright 2014 Advanced Micro Devices, Inc.
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2001e04c3fSmrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2101e04c3fSmrg * SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#include "ac_binary.h"
2501e04c3fSmrg
267ec681f3Smrg#include "ac_gpu_info.h"
2701e04c3fSmrg#include "util/u_math.h"
2801e04c3fSmrg#include "util/u_memory.h"
2901e04c3fSmrg
3001e04c3fSmrg#include <gelf.h>
3101e04c3fSmrg#include <libelf.h>
3201e04c3fSmrg#include <sid.h>
337ec681f3Smrg#include <stdio.h>
3401e04c3fSmrg
357ec681f3Smrg#define SPILLED_SGPRS 0x4
367ec681f3Smrg#define SPILLED_VGPRS 0x8
3701e04c3fSmrg
387ec681f3Smrg/* Parse configuration data in .AMDGPU.config section format. */
397ec681f3Smrgvoid ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
407ec681f3Smrg                                   bool really_needs_scratch, const struct radeon_info *info,
417ec681f3Smrg                                   struct ac_shader_config *conf)
4201e04c3fSmrg{
437ec681f3Smrg   uint32_t scratch_size = 0;
447ec681f3Smrg
457ec681f3Smrg   for (size_t i = 0; i < nbytes; i += 8) {
467ec681f3Smrg      unsigned reg = util_le32_to_cpu(*(uint32_t *)(data + i));
477ec681f3Smrg      unsigned value = util_le32_to_cpu(*(uint32_t *)(data + i + 4));
487ec681f3Smrg      switch (reg) {
497ec681f3Smrg      case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
507ec681f3Smrg      case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
517ec681f3Smrg      case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
527ec681f3Smrg      case R_00B848_COMPUTE_PGM_RSRC1:
537ec681f3Smrg      case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
547ec681f3Smrg         if (wave_size == 32 || info->wave64_vgpr_alloc_granularity == 8)
557ec681f3Smrg            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
567ec681f3Smrg         else
577ec681f3Smrg            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
587ec681f3Smrg
597ec681f3Smrg         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
607ec681f3Smrg         /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
617ec681f3Smrg         conf->float_mode = G_00B028_FLOAT_MODE(value);
627ec681f3Smrg         conf->rsrc1 = value;
637ec681f3Smrg         break;
647ec681f3Smrg      case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
657ec681f3Smrg         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
667ec681f3Smrg         /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
677ec681f3Smrg         conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
687ec681f3Smrg         conf->rsrc2 = value;
697ec681f3Smrg         break;
707ec681f3Smrg      case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
717ec681f3Smrg         conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
727ec681f3Smrg         conf->rsrc2 = value;
737ec681f3Smrg         break;
747ec681f3Smrg      case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
757ec681f3Smrg         conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
767ec681f3Smrg         conf->rsrc2 = value;
777ec681f3Smrg         break;
787ec681f3Smrg      case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
797ec681f3Smrg         conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
807ec681f3Smrg         conf->rsrc2 = value;
817ec681f3Smrg         break;
827ec681f3Smrg      case R_00B84C_COMPUTE_PGM_RSRC2:
837ec681f3Smrg         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
847ec681f3Smrg         conf->rsrc2 = value;
857ec681f3Smrg         break;
867ec681f3Smrg      case R_00B8A0_COMPUTE_PGM_RSRC3:
877ec681f3Smrg         conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
887ec681f3Smrg         conf->rsrc3 = value;
897ec681f3Smrg         break;
907ec681f3Smrg      case R_0286CC_SPI_PS_INPUT_ENA:
917ec681f3Smrg         conf->spi_ps_input_ena = value;
927ec681f3Smrg         break;
937ec681f3Smrg      case R_0286D0_SPI_PS_INPUT_ADDR:
947ec681f3Smrg         conf->spi_ps_input_addr = value;
957ec681f3Smrg         break;
967ec681f3Smrg      case R_0286E8_SPI_TMPRING_SIZE:
977ec681f3Smrg      case R_00B860_COMPUTE_TMPRING_SIZE:
987ec681f3Smrg         /* WAVESIZE is in units of 256 dwords. */
997ec681f3Smrg         scratch_size = value;
1007ec681f3Smrg         break;
1017ec681f3Smrg      case SPILLED_SGPRS:
1027ec681f3Smrg         conf->spilled_sgprs = value;
1037ec681f3Smrg         break;
1047ec681f3Smrg      case SPILLED_VGPRS:
1057ec681f3Smrg         conf->spilled_vgprs = value;
1067ec681f3Smrg         break;
1077ec681f3Smrg      default: {
1087ec681f3Smrg         static bool printed;
1097ec681f3Smrg
1107ec681f3Smrg         if (!printed) {
1117ec681f3Smrg            fprintf(stderr,
1127ec681f3Smrg                    "Warning: LLVM emitted unknown "
1137ec681f3Smrg                    "config register: 0x%x\n",
1147ec681f3Smrg                    reg);
1157ec681f3Smrg            printed = true;
1167ec681f3Smrg         }
1177ec681f3Smrg      } break;
1187ec681f3Smrg      }
1197ec681f3Smrg   }
1207ec681f3Smrg
1217ec681f3Smrg   if (!conf->spi_ps_input_addr)
1227ec681f3Smrg      conf->spi_ps_input_addr = conf->spi_ps_input_ena;
1237ec681f3Smrg
1247ec681f3Smrg   if (really_needs_scratch) {
1257ec681f3Smrg      /* sgprs spills aren't spilling */
1267ec681f3Smrg      conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
1277ec681f3Smrg   }
1287ec681f3Smrg
1297ec681f3Smrg   /* GFX 10.3 internally:
1307ec681f3Smrg    * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
1317ec681f3Smrg    * - aligns LDS to 1024
1327ec681f3Smrg    *
1337ec681f3Smrg    * For shader-db stats, set num_vgprs that the hw actually uses.
1347ec681f3Smrg    */
1357ec681f3Smrg   if (info->chip_class >= GFX10_3) {
1367ec681f3Smrg      conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
1377ec681f3Smrg   }
1387ec681f3Smrg
1397ec681f3Smrg   /* Enable 64-bit and 16-bit denormals, because there is no performance
1407ec681f3Smrg    * cost.
1417ec681f3Smrg    *
1427ec681f3Smrg    * Don't enable denormals for 32-bit floats, because:
1437ec681f3Smrg    * - denormals disable output modifiers
1447ec681f3Smrg    * - denormals break v_mad_f32
1457ec681f3Smrg    * - GFX6 & GFX7 would be very slow
1467ec681f3Smrg    */
1477ec681f3Smrg   conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
1487ec681f3Smrg   conf->float_mode |= V_00B028_FP_64_DENORMS;
14901e04c3fSmrg}
150