1/* 2 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org> 3 * Copyright © 2018 Google, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 * 24 * Authors: 25 * Rob Clark <robclark@freedesktop.org> 26 */ 27 28#include "pipe/p_state.h" 29#include "util/bitset.h" 30#include "util/format/u_format.h" 31#include "util/u_inlines.h" 32#include "util/u_memory.h" 33#include "util/u_string.h" 34 35#include "freedreno_program.h" 36 37#include "fd6_const.h" 38#include "fd6_emit.h" 39#include "fd6_format.h" 40#include "fd6_pack.h" 41#include "fd6_program.h" 42#include "fd6_texture.h" 43 44void 45fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, 46 const struct ir3_shader_variant *so) 47{ 48 enum a6xx_state_block sb = fd6_stage2shadersb(so->type); 49 50 uint32_t first_exec_offset = 0; 51 uint32_t instrlen = 0; 52 uint32_t hw_stack_offset = 0; 53 54 switch (so->type) { 55 case MESA_SHADER_VERTEX: 56 first_exec_offset = REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET; 57 instrlen = REG_A6XX_SP_VS_INSTRLEN; 58 hw_stack_offset = REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET; 59 break; 60 case MESA_SHADER_TESS_CTRL: 61 first_exec_offset = REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET; 62 instrlen = REG_A6XX_SP_HS_INSTRLEN; 63 hw_stack_offset = REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET; 64 break; 65 case MESA_SHADER_TESS_EVAL: 66 first_exec_offset = REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET; 67 instrlen = REG_A6XX_SP_DS_INSTRLEN; 68 hw_stack_offset = REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET; 69 break; 70 case MESA_SHADER_GEOMETRY: 71 first_exec_offset = REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET; 72 instrlen = REG_A6XX_SP_GS_INSTRLEN; 73 hw_stack_offset = REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET; 74 break; 75 case MESA_SHADER_FRAGMENT: 76 first_exec_offset = REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET; 77 instrlen = REG_A6XX_SP_FS_INSTRLEN; 78 hw_stack_offset = REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET; 79 break; 80 case MESA_SHADER_COMPUTE: 81 case MESA_SHADER_KERNEL: 82 first_exec_offset = REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET; 83 instrlen = REG_A6XX_SP_CS_INSTRLEN; 84 hw_stack_offset = REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET; 85 break; 86 case MESA_SHADER_TASK: 87 case MESA_SHADER_MESH: 88 case MESA_SHADER_RAYGEN: 89 case MESA_SHADER_ANY_HIT: 90 case MESA_SHADER_CLOSEST_HIT: 91 case MESA_SHADER_MISS: 92 case MESA_SHADER_INTERSECTION: 93 case MESA_SHADER_CALLABLE: 94 unreachable("Unsupported shader stage"); 95 case MESA_SHADER_NONE: 96 unreachable(""); 97 } 98 99#ifdef DEBUG 100 /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */ 101 const char *name = so->shader->nir->info.name; 102 if (name) 103 fd_emit_string5(ring, name, strlen(name)); 104#endif 105 106 uint32_t fibers_per_sp = ctx->screen->info->a6xx.fibers_per_sp; 107 uint32_t num_sp_cores = ctx->screen->info->num_sp_cores; 108 109 uint32_t per_fiber_size = ALIGN(so->pvtmem_size, 512); 110 if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) { 111 if (ctx->pvtmem[so->pvtmem_per_wave].bo) 112 fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo); 113 ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size; 114 uint32_t total_size = 115 ALIGN(per_fiber_size * fibers_per_sp, 1 << 12) * num_sp_cores; 116 ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new( 117 ctx->screen->dev, total_size, 0, 118 "pvtmem_%s_%d", so->pvtmem_per_wave ? "per_wave" : "per_fiber", 119 per_fiber_size); 120 } else { 121 per_fiber_size = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size; 122 } 123 124 uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12); 125 126 OUT_PKT4(ring, instrlen, 1); 127 OUT_RING(ring, so->instrlen); 128 129 OUT_PKT4(ring, first_exec_offset, 7); 130 OUT_RING(ring, 0); /* SP_xS_OBJ_FIRST_EXEC_OFFSET */ 131 OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */ 132 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size)); 133 if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ 134 OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0); 135 } else { 136 OUT_RING(ring, 0); 137 OUT_RING(ring, 0); 138 } 139 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | 140 COND(so->pvtmem_per_wave, 141 A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); 142 143 OUT_PKT4(ring, hw_stack_offset, 1); 144 OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size)); 145 146 OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); 147 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | 148 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 149 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | 150 CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 151 CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen)); 152 OUT_RELOC(ring, so->bo, 0, 0, 0); 153} 154 155/** 156 * Build a pre-baked state-obj to disable SO, so that we aren't dynamically 157 * building this at draw time whenever we transition from SO enabled->disabled 158 */ 159static void 160setup_stream_out_disable(struct fd_context *ctx) 161{ 162 unsigned sizedw = 4; 163 164 if (ctx->screen->info->a6xx.tess_use_shared) 165 sizedw += 2; 166 167 struct fd_ringbuffer *ring = 168 fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); 169 170 OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); 171 OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); 172 OUT_RING(ring, 0); 173 OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); 174 OUT_RING(ring, 0); 175 176 if (ctx->screen->info->a6xx.tess_use_shared) { 177 OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); 178 OUT_RING(ring, 0); 179 } 180 181 fd6_context(ctx)->streamout_disable_stateobj = ring; 182} 183 184static void 185setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, 186 const struct ir3_shader_variant *v, 187 struct ir3_shader_linkage *l) 188{ 189 const struct ir3_stream_output_info *strmout = &v->shader->stream_output; 190 191 uint32_t ncomp[PIPE_MAX_SO_BUFFERS]; 192 uint32_t prog[256 / 2]; 193 uint32_t prog_count; 194 195 memset(ncomp, 0, sizeof(ncomp)); 196 memset(prog, 0, sizeof(prog)); 197 198 prog_count = align(l->max_loc, 2) / 2; 199 200 debug_assert(prog_count < ARRAY_SIZE(prog)); 201 202 for (unsigned i = 0; i < strmout->num_outputs; i++) { 203 const struct ir3_stream_output *out = &strmout->output[i]; 204 unsigned k = out->register_index; 205 unsigned idx; 206 207 ncomp[out->output_buffer] += out->num_components; 208 209 /* linkage map sorted by order frag shader wants things, so 210 * a bit less ideal here.. 211 */ 212 for (idx = 0; idx < l->cnt; idx++) 213 if (l->var[idx].regid == v->outputs[k].regid) 214 break; 215 216 debug_assert(idx < l->cnt); 217 218 for (unsigned j = 0; j < out->num_components; j++) { 219 unsigned c = j + out->start_component; 220 unsigned loc = l->var[idx].loc + c; 221 unsigned off = j + out->dst_offset; /* in dwords */ 222 223 if (loc & 1) { 224 prog[loc / 2] |= A6XX_VPC_SO_PROG_B_EN | 225 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | 226 A6XX_VPC_SO_PROG_B_OFF(off * 4); 227 } else { 228 prog[loc / 2] |= A6XX_VPC_SO_PROG_A_EN | 229 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | 230 A6XX_VPC_SO_PROG_A_OFF(off * 4); 231 } 232 } 233 } 234 235 unsigned sizedw = 12 + (2 * prog_count); 236 if (ctx->screen->info->a6xx.tess_use_shared) 237 sizedw += 2; 238 239 struct fd_ringbuffer *ring = 240 fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); 241 242 OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); 243 OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); 244 OUT_RING(ring, 245 A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) | 246 COND(ncomp[0] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1)) | 247 COND(ncomp[1] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1)) | 248 COND(ncomp[2] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1)) | 249 COND(ncomp[3] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1))); 250 OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(0)); 251 OUT_RING(ring, ncomp[0]); 252 OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(1)); 253 OUT_RING(ring, ncomp[1]); 254 OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(2)); 255 OUT_RING(ring, ncomp[2]); 256 OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(3)); 257 OUT_RING(ring, ncomp[3]); 258 OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); 259 OUT_RING(ring, A6XX_VPC_SO_CNTL_RESET); 260 for (unsigned i = 0; i < prog_count; i++) { 261 OUT_RING(ring, REG_A6XX_VPC_SO_PROG); 262 OUT_RING(ring, prog[i]); 263 } 264 if (ctx->screen->info->a6xx.tess_use_shared) { 265 /* Possibly not tess_use_shared related, but the combination of 266 * tess + xfb fails some tests if we don't emit this. 267 */ 268 OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); 269 OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE); 270 } 271 272 state->streamout_stateobj = ring; 273} 274 275static void 276setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state) 277{ 278 struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4); 279 280 OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, 281 .ds_state = true, .gs_state = true, 282 .fs_state = true, .cs_state = true, 283 .gfx_ibo = true, .cs_ibo = true, )); 284 285 debug_assert(state->vs->constlen >= state->bs->constlen); 286 287 OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); 288 OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) | 289 A6XX_HLSQ_VS_CNTL_ENABLED); 290 OUT_RING(ring, COND(state->hs, 291 A6XX_HLSQ_HS_CNTL_ENABLED | 292 A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen))); 293 OUT_RING(ring, COND(state->ds, 294 A6XX_HLSQ_DS_CNTL_ENABLED | 295 A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen))); 296 OUT_RING(ring, COND(state->gs, 297 A6XX_HLSQ_GS_CNTL_ENABLED | 298 A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen))); 299 OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); 300 OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) | 301 A6XX_HLSQ_FS_CNTL_ENABLED); 302 303 OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); 304 OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) | 305 A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | 306 A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | 307 A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); 308 309 OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); 310 OUT_RING(ring, COND(state->hs, 311 A6XX_SP_HS_CONFIG_ENABLED | 312 A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | 313 A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | 314 A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); 315 316 OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); 317 OUT_RING(ring, COND(state->ds, 318 A6XX_SP_DS_CONFIG_ENABLED | 319 A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | 320 A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | 321 A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); 322 323 OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); 324 OUT_RING(ring, COND(state->gs, 325 A6XX_SP_GS_CONFIG_ENABLED | 326 A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | 327 A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | 328 A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); 329 330 OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); 331 OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | 332 A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | 333 A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | 334 A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); 335 336 OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); 337 OUT_RING(ring, ir3_shader_nibo(state->fs)); 338 339 state->config_stateobj = ring; 340} 341 342static inline uint32_t 343next_regid(uint32_t reg, uint32_t increment) 344{ 345 if (VALIDREG(reg)) 346 return reg + increment; 347 else 348 return regid(63, 0); 349} 350 351static void 352setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, 353 struct fd6_program_state *state, 354 const struct ir3_cache_key *cache_key, 355 bool binning_pass) assert_dt 356{ 357 const struct ir3_shader_key *key = &cache_key->key; 358 uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; 359 uint32_t clip0_regid, clip1_regid; 360 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; 361 uint32_t smask_in_regid, smask_regid; 362 uint32_t stencilref_regid; 363 uint32_t vertex_regid, instance_regid, layer_regid, vs_primitive_regid; 364 uint32_t hs_invocation_regid; 365 uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_rel_patch_regid, 366 ds_rel_patch_regid, ds_primitive_regid; 367 uint32_t ij_regid[IJ_COUNT]; 368 uint32_t gs_header_regid; 369 enum a6xx_threadsize fssz; 370 uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0; 371 uint8_t clip0_loc, clip1_loc; 372 int i, j; 373 374 static const struct ir3_shader_variant dummy_fs = {0}; 375 const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs; 376 const struct ir3_shader_variant *hs = state->hs; 377 const struct ir3_shader_variant *ds = state->ds; 378 const struct ir3_shader_variant *gs = state->gs; 379 const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs; 380 381 /* binning VS is wrong when GS is present, so use nonbinning VS 382 * TODO: compile both binning VS/GS variants correctly 383 */ 384 if (binning_pass && state->gs) 385 vs = state->vs; 386 387 bool sample_shading = fs->per_samp | key->sample_shading; 388 389 fssz = fs->info.double_threadsize ? THREAD128 : THREAD64; 390 391 pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS); 392 psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); 393 clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0); 394 clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1); 395 layer_regid = ir3_find_output_regid(vs, VARYING_SLOT_LAYER); 396 vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); 397 instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); 398 if (hs) 399 vs_primitive_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); 400 else if (gs) 401 vs_primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); 402 else 403 vs_primitive_regid = regid(63, 0); 404 405 bool hs_reads_primid = false, ds_reads_primid = false; 406 if (hs) { 407 tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); 408 tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); 409 hs_reads_primid = VALIDREG(ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID)); 410 ds_reads_primid = VALIDREG(ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID)); 411 hs_rel_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3); 412 ds_rel_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3); 413 ds_primitive_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); 414 hs_invocation_regid = 415 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); 416 417 pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); 418 psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); 419 clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0); 420 clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1); 421 } else { 422 tess_coord_x_regid = regid(63, 0); 423 tess_coord_y_regid = regid(63, 0); 424 hs_rel_patch_regid = regid(63, 0); 425 ds_rel_patch_regid = regid(63, 0); 426 ds_primitive_regid = regid(63, 0); 427 hs_invocation_regid = regid(63, 0); 428 } 429 430 bool gs_reads_primid = false; 431 if (gs) { 432 gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); 433 gs_reads_primid = VALIDREG(ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID)); 434 pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); 435 psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); 436 clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0); 437 clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1); 438 layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER); 439 } else { 440 gs_header_regid = regid(63, 0); 441 } 442 443 if (fs->color0_mrt) { 444 color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = 445 color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = 446 ir3_find_output_regid(fs, FRAG_RESULT_COLOR); 447 } else { 448 color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0); 449 color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1); 450 color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2); 451 color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3); 452 color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4); 453 color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5); 454 color_regid[6] = ir3_find_output_regid(fs, FRAG_RESULT_DATA6); 455 color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7); 456 } 457 458 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); 459 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); 460 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); 461 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); 462 zwcoord_regid = next_regid(coord_regid, 2); 463 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); 464 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); 465 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); 466 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) 467 ij_regid[i] = 468 ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); 469 470 /* If we have pre-dispatch texture fetches, then ij_pix should not 471 * be DCE'd, even if not actually used in the shader itself: 472 */ 473 if (fs->num_sampler_prefetch > 0) { 474 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); 475 /* also, it seems like ij_pix is *required* to be r0.x */ 476 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); 477 } 478 479 /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we 480 * end up masking the single sample!! 481 */ 482 if (!key->msaa) 483 smask_regid = regid(63, 0); 484 485 /* we could probably divide this up into things that need to be 486 * emitted if frag-prog is dirty vs if vert-prog is dirty.. 487 */ 488 489 OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); 490 OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | 491 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | 492 0x7000); // XXX 493 for (int i = 0; i < fs->num_sampler_prefetch; i++) { 494 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; 495 OUT_RING(ring, 496 A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | 497 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | 498 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | 499 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | 500 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | 501 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | 502 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); 503 } 504 505 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); 506 OUT_RING(ring, 0); 507 508 OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1); 509 OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); 510 511 bool fs_has_dual_src_color = 512 !binning_pass && fs->shader->nir->info.fs.color_is_dual_source; 513 514 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); 515 OUT_RING(ring, 516 A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 517 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 518 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | 519 COND(fs_has_dual_src_color, 520 A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); 521 522 OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); 523 OUT_RING( 524 ring, 525 A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | 526 A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vs->info.max_half_reg + 1) | 527 COND(vs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) | 528 A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(vs))); 529 530 fd6_emit_shader(ctx, ring, vs); 531 fd6_emit_immediates(ctx->screen, vs, ring); 532 533 struct ir3_shader_linkage l = {0}; 534 const struct ir3_shader_variant *last_shader = fd6_last_shader(state); 535 536 bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0); 537 uint8_t clip_mask = last_shader->clip_mask, 538 cull_mask = last_shader->cull_mask; 539 uint8_t clip_cull_mask = clip_mask | cull_mask; 540 541 clip_mask &= cache_key->clip_plane_enable; 542 543 /* If we have streamout, link against the real FS, rather than the 544 * dummy FS used for binning pass state, to ensure the OUTLOC's 545 * match. Depending on whether we end up doing sysmem or gmem, 546 * the actual streamout could happen with either the binning pass 547 * or draw pass program, but the same streamout stateobj is used 548 * in either case: 549 */ 550 ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true); 551 552 bool primid_passthru = l.primid_loc != 0xff; 553 clip0_loc = l.clip0_loc; 554 clip1_loc = l.clip1_loc; 555 556 OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); 557 OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ 558 OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ 559 OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ 560 OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ 561 562 /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ 563 ir3_link_stream_out(&l, last_shader); 564 565 if (VALIDREG(layer_regid)) { 566 layer_loc = l.max_loc; 567 ir3_link_add(&l, layer_regid, 0x1, l.max_loc); 568 } 569 570 if (VALIDREG(pos_regid)) { 571 pos_loc = l.max_loc; 572 ir3_link_add(&l, pos_regid, 0xf, l.max_loc); 573 } 574 575 if (VALIDREG(psize_regid)) { 576 psize_loc = l.max_loc; 577 ir3_link_add(&l, psize_regid, 0x1, l.max_loc); 578 } 579 580 /* Handle the case where clip/cull distances aren't read by the FS. Make 581 * sure to avoid adding an output with an empty writemask if the user 582 * disables all the clip distances in the API so that the slot is unused. 583 */ 584 if (clip0_loc == 0xff && VALIDREG(clip0_regid) && 585 (clip_cull_mask & 0xf) != 0) { 586 clip0_loc = l.max_loc; 587 ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc); 588 } 589 590 if (clip1_loc == 0xff && VALIDREG(clip1_regid) && 591 (clip_cull_mask >> 4) != 0) { 592 clip1_loc = l.max_loc; 593 ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc); 594 } 595 596 /* If we have stream-out, we use the full shader for binning 597 * pass, rather than the optimized binning pass one, so that we 598 * have all the varying outputs available for xfb. So streamout 599 * state should always be derived from the non-binning pass 600 * program: 601 */ 602 if (do_streamout && !binning_pass) { 603 setup_stream_out(ctx, state, last_shader, &l); 604 605 if (!fd6_context(ctx)->streamout_disable_stateobj) 606 setup_stream_out_disable(ctx); 607 } 608 609 debug_assert(l.cnt <= 32); 610 if (gs) 611 OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 612 else if (ds) 613 OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 614 else 615 OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); 616 617 for (j = 0; j < l.cnt;) { 618 uint32_t reg = 0; 619 620 reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); 621 reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); 622 j++; 623 624 reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); 625 reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); 626 j++; 627 628 OUT_RING(ring, reg); 629 } 630 631 if (gs) 632 OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 633 else if (ds) 634 OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 635 else 636 OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); 637 638 for (j = 0; j < l.cnt;) { 639 uint32_t reg = 0; 640 641 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); 642 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); 643 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); 644 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); 645 646 OUT_RING(ring, reg); 647 } 648 649 if (hs) { 650 assert(vs->mergedregs == hs->mergedregs); 651 OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); 652 OUT_RING( 653 ring, 654 A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) | 655 A6XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(hs->info.max_half_reg + 1) | 656 A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(hs))); 657 658 fd6_emit_shader(ctx, ring, hs); 659 fd6_emit_immediates(ctx->screen, hs, ring); 660 fd6_emit_link_map(ctx->screen, vs, hs, ring); 661 662 OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1); 663 OUT_RING( 664 ring, 665 A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) | 666 A6XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(ds->info.max_half_reg + 1) | 667 COND(ds->mergedregs, A6XX_SP_DS_CTRL_REG0_MERGEDREGS) | 668 A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(ds))); 669 670 fd6_emit_shader(ctx, ring, ds); 671 fd6_emit_immediates(ctx->screen, ds, ring); 672 fd6_emit_link_map(ctx->screen, hs, ds, ring); 673 674 shader_info *hs_info = &hs->shader->nir->info; 675 OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); 676 OUT_RING(ring, hs_info->tess.tcs_vertices_out); 677 678 if (ctx->screen->info->a6xx.tess_use_shared) { 679 unsigned hs_input_size = 6 + (3 * (vs->output_size - 1)); 680 unsigned wave_input_size = 681 MIN2(64, DIV_ROUND_UP(hs_input_size * 4, 682 hs_info->tess.tcs_vertices_out)); 683 684 OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); 685 OUT_RING(ring, hs_input_size); 686 687 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 688 OUT_RING(ring, wave_input_size); 689 } else { 690 uint32_t hs_input_size = 691 hs_info->tess.tcs_vertices_out * vs->output_size / 4; 692 693 /* Total attribute slots in HS incoming patch. */ 694 OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); 695 OUT_RING(ring, hs_input_size); 696 697 const uint32_t wavesize = 64; 698 const uint32_t max_wave_input_size = 64; 699 const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; 700 701 /* note: if HS is really just the VS extended, then this 702 * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) 703 * however that doesn't match the blob, and fails some dEQP tests. 704 */ 705 uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; 706 uint32_t max_prims_per_wave = max_wave_input_size * wavesize / 707 (vs->output_size * patch_control_points); 708 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); 709 710 uint32_t total_size = 711 vs->output_size * patch_control_points * prims_per_wave; 712 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); 713 714 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 715 OUT_RING(ring, wave_input_size); 716 } 717 718 shader_info *ds_info = &ds->shader->nir->info; 719 OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); 720 uint32_t output; 721 if (ds_info->tess.point_mode) 722 output = TESS_POINTS; 723 else if (ds_info->tess.primitive_mode == GL_ISOLINES) 724 output = TESS_LINES; 725 else if (ds_info->tess.ccw) 726 output = TESS_CCW_TRIS; 727 else 728 output = TESS_CW_TRIS; 729 730 OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING( 731 fd6_gl2spacing(ds_info->tess.spacing)) | 732 A6XX_PC_TESS_CNTL_OUTPUT(output)); 733 734 OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1); 735 OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 736 A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 737 A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 738 739 OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1); 740 OUT_RING(ring, 0x0000ffff); 741 742 OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1); 743 OUT_RING(ring, 0x0); 744 745 OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1); 746 OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) | 747 A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask)); 748 749 OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); 750 OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | 751 A6XX_VPC_VS_PACK_PSIZELOC(255) | 752 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); 753 754 OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1); 755 OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) | 756 A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) | 757 A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc)); 758 759 OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); 760 OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt)); 761 762 OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1); 763 OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 764 CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) | 765 COND(ds_reads_primid, A6XX_PC_DS_OUT_CNTL_PRIMITIVE_ID) | 766 A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 767 768 OUT_PKT4(ring, REG_A6XX_PC_HS_OUT_CNTL, 1); 769 OUT_RING(ring, COND(hs_reads_primid, A6XX_PC_HS_OUT_CNTL_PRIMITIVE_ID)); 770 } else { 771 OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); 772 OUT_RING(ring, 0); 773 } 774 775 OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1); 776 OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt)); 777 778 bool enable_varyings = fs->total_in > 0; 779 780 OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); 781 OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | 782 COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) | 783 A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) | 784 A6XX_VPC_CNTL_0_VIEWIDLOC(0xff)); 785 786 OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1); 787 OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 788 CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | 789 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | 790 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 791 792 OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 793 OUT_RING(ring, 0x7); /* XXX */ 794 OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 795 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 796 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 797 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); 798 OUT_RING( 799 ring, 800 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | 801 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | 802 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( 803 ij_regid[IJ_PERSP_CENTROID]) | 804 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID( 805 ij_regid[IJ_LINEAR_CENTROID])); 806 OUT_RING( 807 ring, 808 A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 809 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 810 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | 811 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); 812 OUT_RING(ring, 0xfcfc); /* line length (?), foveation quality */ 813 814 OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1); 815 OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(fssz) | 816 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); 817 818 OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1); 819 OUT_RING( 820 ring, 821 A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | 822 COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) | 0x1000000 | 823 A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | 824 A6XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fs->info.max_half_reg + 1) | 825 COND(fs->mergedregs, A6XX_SP_FS_CTRL_REG0_MERGEDREGS) | 826 A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(fs)) | 827 COND(fs->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE)); 828 829 OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1); 830 OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | 831 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(0xff)); 832 833 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; 834 bool need_size_persamp = false; 835 if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) { 836 if (sample_shading) 837 need_size_persamp = true; 838 else 839 need_size = true; 840 } 841 842 OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); 843 OUT_RING( 844 ring, 845 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | 846 CONDREG(ij_regid[IJ_PERSP_CENTROID], 847 A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | 848 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | 849 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 850 CONDREG(ij_regid[IJ_LINEAR_CENTROID], 851 A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | 852 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 853 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | 854 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | 855 COND(fs->fragcoord_compmask != 0, 856 A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); 857 858 OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); 859 OUT_RING( 860 ring, 861 CONDREG(ij_regid[IJ_PERSP_PIXEL], 862 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | 863 CONDREG(ij_regid[IJ_PERSP_CENTROID], 864 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | 865 CONDREG(ij_regid[IJ_PERSP_SAMPLE], 866 A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | 867 CONDREG(ij_regid[IJ_LINEAR_PIXEL], 868 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 869 CONDREG(ij_regid[IJ_LINEAR_CENTROID], 870 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | 871 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], 872 A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 873 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | 874 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 875 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | 876 COND(fs->fragcoord_compmask != 0, 877 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); 878 879 OUT_RING(ring, 880 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 881 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 882 CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) | 883 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 884 885 OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); 886 OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 887 888 OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); 889 OUT_RING(ring, 890 CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | 891 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( 892 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); 893 894 OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 895 OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 896 897 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); 898 for (i = 0; i < 8; i++) { 899 OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | 900 COND(color_regid[i] & HALF_REG_ID, 901 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); 902 if (VALIDREG(color_regid[i])) { 903 state->mrt_components |= 0xf << (i * 4); 904 } 905 } 906 907 /* dual source blending has an extra fs output in the 2nd slot */ 908 if (fs_has_dual_src_color) { 909 state->mrt_components |= 0xf << 4; 910 } 911 912 OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); 913 OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | 914 A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) | 915 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); 916 917 if (gs) { 918 assert(gs->mergedregs == (ds ? ds->mergedregs : vs->mergedregs)); 919 OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); 920 OUT_RING( 921 ring, 922 A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | 923 A6XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(gs->info.max_half_reg + 1) | 924 A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(gs))); 925 926 fd6_emit_shader(ctx, ring, gs); 927 fd6_emit_immediates(ctx->screen, gs, ring); 928 if (ds) 929 fd6_emit_link_map(ctx->screen, ds, gs, ring); 930 else 931 fd6_emit_link_map(ctx->screen, vs, gs, ring); 932 933 OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1); 934 OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) | 935 A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) | 936 A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc)); 937 938 OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1); 939 OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | 0xff00); 940 941 OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1); 942 OUT_RING(ring, 943 CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER)); 944 945 uint32_t flags_regid = 946 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); 947 948 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1); 949 OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) | 950 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); 951 952 OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1); 953 OUT_RING(ring, 954 A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | 955 CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) | 956 CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) | 957 COND(gs_reads_primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | 958 A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); 959 960 uint32_t output; 961 switch (gs->shader->nir->info.gs.output_primitive) { 962 case GL_POINTS: 963 output = TESS_POINTS; 964 break; 965 case GL_LINE_STRIP: 966 output = TESS_LINES; 967 break; 968 case GL_TRIANGLE_STRIP: 969 output = TESS_CW_TRIS; 970 break; 971 default: 972 unreachable(""); 973 } 974 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); 975 OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT( 976 gs->shader->nir->info.gs.vertices_out - 1) | 977 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | 978 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS( 979 gs->shader->nir->info.gs.invocations - 1)); 980 981 OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1); 982 OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) | 983 A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask)); 984 985 OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1); 986 OUT_RING(ring, 0xff); 987 988 OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1); 989 OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 990 A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 991 A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 992 993 const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; 994 995 /* Size of per-primitive alloction in ldlw memory in vec4s. */ 996 uint32_t vec4_size = gs->shader->nir->info.gs.vertices_in * 997 DIV_ROUND_UP(prev->output_size, 4); 998 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 999 OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); 1000 1001 OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); 1002 OUT_RING(ring, 0); 1003 1004 uint32_t prim_size = prev->output_size; 1005 if (prim_size > 64) 1006 prim_size = 64; 1007 else if (prim_size == 64) 1008 prim_size = 63; 1009 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1010 OUT_RING(ring, prim_size); 1011 } else { 1012 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); 1013 OUT_RING(ring, 0); 1014 OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); 1015 OUT_RING(ring, 0); 1016 1017 OUT_PKT4(ring, REG_A6XX_GRAS_VS_LAYER_CNTL, 1); 1018 OUT_RING(ring, 1019 CONDREG(layer_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_LAYER)); 1020 } 1021 1022 OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1); 1023 OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | 1024 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | 1025 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); 1026 1027 OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1); 1028 OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | 1029 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); 1030 1031 OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); 1032 OUT_RING(ring, 0); 1033 1034 if (fs->instrlen) 1035 fd6_emit_shader(ctx, ring, fs); 1036 1037 OUT_REG(ring, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); 1038 1039 uint32_t non_sysval_input_count = 0; 1040 for (uint32_t i = 0; i < vs->inputs_count; i++) 1041 if (!vs->inputs[i].sysval) 1042 non_sysval_input_count++; 1043 1044 OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); 1045 OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) | 1046 A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count)); 1047 1048 OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count); 1049 for (uint32_t i = 0; i < non_sysval_input_count; i++) { 1050 assert(vs->inputs[i].compmask); 1051 OUT_RING(ring, 1052 A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | 1053 A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); 1054 } 1055 1056 OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); 1057 OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | 1058 A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | 1059 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitive_regid) | 1060 0xfc000000); 1061 OUT_RING(ring, 1062 A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | 1063 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); 1064 OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | 1065 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | 1066 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | 1067 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitive_regid)); 1068 OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ 1069 OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | 1070 0xfc00); /* VFD_CONTROL_5 */ 1071 OUT_RING(ring, COND(primid_passthru, 1072 A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ 1073 1074 if (!binning_pass) 1075 fd6_emit_immediates(ctx->screen, fs, ring); 1076} 1077 1078static void emit_interp_state(struct fd_ringbuffer *ring, 1079 struct ir3_shader_variant *fs, bool rasterflat, 1080 bool sprite_coord_mode, 1081 uint32_t sprite_coord_enable); 1082 1083static struct fd_ringbuffer * 1084create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state) 1085{ 1086 struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); 1087 1088 emit_interp_state(ring, state->fs, false, false, 0); 1089 1090 return ring; 1091} 1092 1093/* build the program streaming state which is not part of the pre- 1094 * baked stateobj because of dependency on other gl state (rasterflat 1095 * or sprite-coord-replacement) 1096 */ 1097struct fd_ringbuffer * 1098fd6_program_interp_state(struct fd6_emit *emit) 1099{ 1100 const struct fd6_program_state *state = fd6_emit_get_prog(emit); 1101 1102 if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) { 1103 /* fastpath: */ 1104 return fd_ringbuffer_ref(state->interp_stateobj); 1105 } else { 1106 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( 1107 emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING); 1108 1109 emit_interp_state(ring, state->fs, emit->rasterflat, 1110 emit->sprite_coord_mode, emit->sprite_coord_enable); 1111 1112 return ring; 1113 } 1114} 1115 1116static void 1117emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs, 1118 bool rasterflat, bool sprite_coord_mode, 1119 uint32_t sprite_coord_enable) 1120{ 1121 uint32_t vinterp[8], vpsrepl[8]; 1122 1123 memset(vinterp, 0, sizeof(vinterp)); 1124 memset(vpsrepl, 0, sizeof(vpsrepl)); 1125 1126 for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) { 1127 1128 /* NOTE: varyings are packed, so if compmask is 0xb 1129 * then first, third, and fourth component occupy 1130 * three consecutive varying slots: 1131 */ 1132 unsigned compmask = fs->inputs[j].compmask; 1133 1134 uint32_t inloc = fs->inputs[j].inloc; 1135 1136 if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) { 1137 uint32_t loc = inloc; 1138 1139 for (int i = 0; i < 4; i++) { 1140 if (compmask & (1 << i)) { 1141 vinterp[loc / 16] |= 1 << ((loc % 16) * 2); 1142 loc++; 1143 } 1144 } 1145 } 1146 1147 bool coord_mode = sprite_coord_mode; 1148 if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) { 1149 /* mask is two 2-bit fields, where: 1150 * '01' -> S 1151 * '10' -> T 1152 * '11' -> 1 - T (flip mode) 1153 */ 1154 unsigned mask = coord_mode ? 0b1101 : 0b1001; 1155 uint32_t loc = inloc; 1156 if (compmask & 0x1) { 1157 vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); 1158 loc++; 1159 } 1160 if (compmask & 0x2) { 1161 vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); 1162 loc++; 1163 } 1164 if (compmask & 0x4) { 1165 /* .z <- 0.0f */ 1166 vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); 1167 loc++; 1168 } 1169 if (compmask & 0x8) { 1170 /* .w <- 1.0f */ 1171 vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); 1172 loc++; 1173 } 1174 } 1175 } 1176 1177 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); 1178 for (int i = 0; i < 8; i++) 1179 OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ 1180 1181 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); 1182 for (int i = 0; i < 8; i++) 1183 OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ 1184} 1185 1186static struct ir3_program_state * 1187fd6_program_create(void *data, struct ir3_shader_variant *bs, 1188 struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, 1189 struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, 1190 struct ir3_shader_variant *fs, 1191 const struct ir3_cache_key *key) in_dt 1192{ 1193 struct fd_context *ctx = fd_context(data); 1194 struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state); 1195 1196 tc_assert_driver_thread(ctx->tc); 1197 1198 /* if we have streamout, use full VS in binning pass, as the 1199 * binning pass VS will have outputs on other than position/psize 1200 * stripped out: 1201 */ 1202 state->bs = vs->shader->stream_output.num_outputs ? vs : bs; 1203 state->vs = vs; 1204 state->hs = hs; 1205 state->ds = ds; 1206 state->gs = gs; 1207 state->fs = fs; 1208 state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); 1209 state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); 1210 1211#ifdef DEBUG 1212 if (!ds) { 1213 for (unsigned i = 0; i < bs->inputs_count; i++) { 1214 if (vs->inputs[i].sysval) 1215 continue; 1216 debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); 1217 } 1218 } 1219#endif 1220 1221 setup_config_stateobj(ctx, state); 1222 setup_stateobj(state->binning_stateobj, ctx, state, key, true); 1223 setup_stateobj(state->stateobj, ctx, state, key, false); 1224 state->interp_stateobj = create_interp_stateobj(ctx, state); 1225 1226 struct ir3_stream_output_info *stream_output = 1227 &fd6_last_shader(state)->shader->stream_output; 1228 if (stream_output->num_outputs > 0) 1229 state->stream_output = stream_output; 1230 1231 return &state->base; 1232} 1233 1234static void 1235fd6_program_destroy(void *data, struct ir3_program_state *state) 1236{ 1237 struct fd6_program_state *so = fd6_program_state(state); 1238 fd_ringbuffer_del(so->stateobj); 1239 fd_ringbuffer_del(so->binning_stateobj); 1240 fd_ringbuffer_del(so->config_stateobj); 1241 fd_ringbuffer_del(so->interp_stateobj); 1242 if (so->streamout_stateobj) 1243 fd_ringbuffer_del(so->streamout_stateobj); 1244 free(so); 1245} 1246 1247static const struct ir3_cache_funcs cache_funcs = { 1248 .create_state = fd6_program_create, 1249 .destroy_state = fd6_program_destroy, 1250}; 1251 1252void 1253fd6_prog_init(struct pipe_context *pctx) 1254{ 1255 struct fd_context *ctx = fd_context(pctx); 1256 1257 ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); 1258 1259 ir3_prog_init(pctx); 1260 1261 fd_prog_init(pctx); 1262} 1263