1/* 2 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org> 3 * Copyright © 2018 Google, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 * 24 * Authors: 25 * Rob Clark <robclark@freedesktop.org> 26 */ 27 28#include "pipe/p_state.h" 29#include "util/u_string.h" 30#include "util/u_memory.h" 31#include "util/u_inlines.h" 32#include "util/u_format.h" 33#include "util/bitset.h" 34 35#include "freedreno_program.h" 36 37#include "fd6_program.h" 38#include "fd6_emit.h" 39#include "fd6_texture.h" 40#include "fd6_format.h" 41 42static struct ir3_shader * 43create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso, 44 gl_shader_stage type) 45{ 46 struct fd_context *ctx = fd_context(pctx); 47 struct ir3_compiler *compiler = ctx->screen->compiler; 48 return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen); 49} 50 51static void * 52fd6_fp_state_create(struct pipe_context *pctx, 53 const struct pipe_shader_state *cso) 54{ 55 return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT); 56} 57 58static void 59fd6_fp_state_delete(struct pipe_context *pctx, void *hwcso) 60{ 61 struct ir3_shader *so = hwcso; 62 struct fd_context *ctx = fd_context(pctx); 63 ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso); 64 ir3_shader_destroy(so); 65} 66 67static void * 68fd6_vp_state_create(struct pipe_context *pctx, 69 const struct pipe_shader_state *cso) 70{ 71 return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX); 72} 73 74static void 75fd6_vp_state_delete(struct pipe_context *pctx, void *hwcso) 76{ 77 struct ir3_shader *so = hwcso; 78 struct fd_context *ctx = fd_context(pctx); 79 ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso); 80 ir3_shader_destroy(so); 81} 82 83void 84fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) 85{ 86 const struct ir3_info *si = &so->info; 87 enum a6xx_state_block sb = fd6_stage2shadersb(so->type); 88 enum a6xx_state_src src; 89 uint32_t i, sz, *bin; 90 unsigned opcode; 91 92 if (fd_mesa_debug & FD_DBG_DIRECT) { 93 sz = si->sizedwords; 94 src = SS6_DIRECT; 95 bin = fd_bo_map(so->bo); 96 } else { 97 sz = 0; 98 src = SS6_INDIRECT; 99 bin = NULL; 100 } 101 102 switch (so->type) { 103 case MESA_SHADER_VERTEX: 104 opcode = CP_LOAD_STATE6_GEOM; 105 break; 106 case MESA_SHADER_FRAGMENT: 107 case MESA_SHADER_COMPUTE: 108 case MESA_SHADER_KERNEL: 109 opcode = CP_LOAD_STATE6_FRAG; 110 break; 111 default: 112 unreachable("bad shader type"); 113 } 114 115 OUT_PKT7(ring, opcode, 3 + sz); 116 OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | 117 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | 118 CP_LOAD_STATE6_0_STATE_SRC(src) | 119 CP_LOAD_STATE6_0_STATE_BLOCK(sb) | 120 CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen)); 121 if (bin) { 122 OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); 123 OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); 124 } else { 125 OUT_RELOCD(ring, so->bo, 0, 0, 0); 126 } 127 128 /* for how clever coverity is, it is sometimes rather dull, and 129 * doesn't realize that the only case where bin==NULL, sz==0: 130 */ 131 assume(bin || (sz == 0)); 132 133 for (i = 0; i < sz; i++) { 134 OUT_RING(ring, bin[i]); 135 } 136} 137 138/* Add any missing varyings needed for stream-out. Otherwise varyings not 139 * used by fragment shader will be stripped out. 140 */ 141static void 142link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v) 143{ 144 const struct ir3_stream_output_info *strmout = &v->shader->stream_output; 145 146 /* 147 * First, any stream-out varyings not already in linkage map (ie. also 148 * consumed by frag shader) need to be added: 149 */ 150 for (unsigned i = 0; i < strmout->num_outputs; i++) { 151 const struct ir3_stream_output *out = &strmout->output[i]; 152 unsigned k = out->register_index; 153 unsigned compmask = 154 (1 << (out->num_components + out->start_component)) - 1; 155 unsigned idx, nextloc = 0; 156 157 /* psize/pos need to be the last entries in linkage map, and will 158 * get added link_stream_out, so skip over them: 159 */ 160 if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) || 161 (v->outputs[k].slot == VARYING_SLOT_POS)) 162 continue; 163 164 for (idx = 0; idx < l->cnt; idx++) { 165 if (l->var[idx].regid == v->outputs[k].regid) 166 break; 167 nextloc = MAX2(nextloc, l->var[idx].loc + 4); 168 } 169 170 /* add if not already in linkage map: */ 171 if (idx == l->cnt) 172 ir3_link_add(l, v->outputs[k].regid, compmask, nextloc); 173 174 /* expand component-mask if needed, ie streaming out all components 175 * but frag shader doesn't consume all components: 176 */ 177 if (compmask & ~l->var[idx].compmask) { 178 l->var[idx].compmask |= compmask; 179 l->max_loc = MAX2(l->max_loc, 180 l->var[idx].loc + util_last_bit(l->var[idx].compmask)); 181 } 182 } 183} 184 185static void 186setup_stream_out(struct fd6_program_state *state, const struct ir3_shader_variant *v, 187 struct ir3_shader_linkage *l) 188{ 189 const struct ir3_stream_output_info *strmout = &v->shader->stream_output; 190 struct fd6_streamout_state *tf = &state->tf; 191 192 memset(tf, 0, sizeof(*tf)); 193 194 tf->prog_count = align(l->max_loc, 2) / 2; 195 196 debug_assert(tf->prog_count < ARRAY_SIZE(tf->prog)); 197 198 for (unsigned i = 0; i < strmout->num_outputs; i++) { 199 const struct ir3_stream_output *out = &strmout->output[i]; 200 unsigned k = out->register_index; 201 unsigned idx; 202 203 tf->ncomp[out->output_buffer] += out->num_components; 204 205 /* linkage map sorted by order frag shader wants things, so 206 * a bit less ideal here.. 207 */ 208 for (idx = 0; idx < l->cnt; idx++) 209 if (l->var[idx].regid == v->outputs[k].regid) 210 break; 211 212 debug_assert(idx < l->cnt); 213 214 for (unsigned j = 0; j < out->num_components; j++) { 215 unsigned c = j + out->start_component; 216 unsigned loc = l->var[idx].loc + c; 217 unsigned off = j + out->dst_offset; /* in dwords */ 218 219 if (loc & 1) { 220 tf->prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN | 221 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | 222 A6XX_VPC_SO_PROG_B_OFF(off * 4); 223 } else { 224 tf->prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN | 225 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | 226 A6XX_VPC_SO_PROG_A_OFF(off * 4); 227 } 228 } 229 } 230 231 tf->vpc_so_buf_cntl = A6XX_VPC_SO_BUF_CNTL_ENABLE | 232 COND(tf->ncomp[0] > 0, A6XX_VPC_SO_BUF_CNTL_BUF0) | 233 COND(tf->ncomp[1] > 0, A6XX_VPC_SO_BUF_CNTL_BUF1) | 234 COND(tf->ncomp[2] > 0, A6XX_VPC_SO_BUF_CNTL_BUF2) | 235 COND(tf->ncomp[3] > 0, A6XX_VPC_SO_BUF_CNTL_BUF3); 236} 237 238struct stage { 239 const struct ir3_shader_variant *v; 240 const struct ir3_info *i; 241 /* const sizes are in units of vec4, aligned to 4*vec4 */ 242 uint16_t constlen; 243 /* instr sizes are in units of 16 instructions */ 244 uint16_t instrlen; 245}; 246 247enum { 248 VS = 0, 249 FS = 1, 250 HS = 2, 251 DS = 3, 252 GS = 4, 253 MAX_STAGES 254}; 255 256static void 257setup_stages(struct fd6_program_state *state, struct stage *s, bool binning_pass) 258{ 259 unsigned i; 260 261 if (binning_pass) { 262 static const struct ir3_shader_variant dummy_fs = {0}; 263 264 s[VS].v = state->bs; 265 s[FS].v = &dummy_fs; 266 } else { 267 s[VS].v = state->vs; 268 s[FS].v = state->fs; 269 } 270 271 s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ 272 273 for (i = 0; i < MAX_STAGES; i++) { 274 if (s[i].v) { 275 s[i].i = &s[i].v->info; 276 s[i].constlen = align(s[i].v->constlen, 4); 277 /* instrlen is already in units of 16 instr.. although 278 * probably we should ditch that and not make the compiler 279 * care about instruction group size of a3xx vs a5xx 280 */ 281 s[i].instrlen = s[i].v->instrlen; 282 } else { 283 s[i].i = NULL; 284 s[i].constlen = 0; 285 s[i].instrlen = 0; 286 } 287 } 288} 289 290static inline uint32_t 291next_regid(uint32_t reg, uint32_t increment) 292{ 293 if (reg == regid(63,0)) 294 return regid(63,0); 295 else 296 return reg + increment; 297} 298 299#define VALIDREG(r) ((r) != regid(63,0)) 300#define CONDREG(r, val) COND(VALIDREG(r), (val)) 301 302static void 303setup_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *state, 304 const struct ir3_shader_key *key, bool binning_pass) 305{ 306 struct stage s[MAX_STAGES]; 307 uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; 308 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; 309 uint32_t smask_in_regid, smask_regid; 310 uint32_t vertex_regid, instance_regid; 311 uint32_t ij_pix_regid, ij_samp_regid, ij_cent_regid, ij_size_regid; 312 enum a3xx_threadsize fssz; 313 uint8_t psize_loc = ~0; 314 int i, j; 315 316 setup_stages(state, s, binning_pass); 317 318 bool sample_shading = s[FS].v->per_samp | key->sample_shading; 319 320 fssz = FOUR_QUADS; 321 322 pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); 323 psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ); 324 vertex_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID); 325 instance_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_INSTANCE_ID); 326 327 if (s[FS].v->color0_mrt) { 328 color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = 329 color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = 330 ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR); 331 } else { 332 color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0); 333 color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1); 334 color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2); 335 color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3); 336 color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4); 337 color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5); 338 color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6); 339 color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7); 340 } 341 342 samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID); 343 smask_in_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN); 344 face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); 345 coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); 346 zwcoord_regid = next_regid(coord_regid, 2); 347 ij_pix_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PIXEL); 348 ij_samp_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_SAMPLE); 349 ij_cent_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_CENTROID); 350 ij_size_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_SIZE); 351 posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH); 352 smask_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_SAMPLE_MASK); 353 354 /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we 355 * end up masking the single sample!! 356 */ 357 if (!key->msaa) 358 smask_regid = regid(63, 0); 359 360 /* we could probably divide this up into things that need to be 361 * emitted if frag-prog is dirty vs if vert-prog is dirty.. 362 */ 363 364 OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 2); 365 OUT_RING(ring, COND(s[VS].v, A6XX_SP_VS_CONFIG_ENABLED) | 366 A6XX_SP_VS_CONFIG_NIBO(s[VS].v->image_mapping.num_ibo) | 367 A6XX_SP_VS_CONFIG_NTEX(s[VS].v->num_samp) | 368 A6XX_SP_VS_CONFIG_NSAMP(s[VS].v->num_samp)); /* SP_VS_CONFIG */ 369 OUT_RING(ring, s[VS].instrlen); /* SP_VS_INSTRLEN */ 370 371 OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1); 372 OUT_RING(ring, 0); 373 374 OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 2); 375 OUT_RING(ring, COND(s[HS].v, A6XX_SP_HS_CONFIG_ENABLED)); /* SP_HS_CONFIG */ 376 OUT_RING(ring, s[HS].instrlen); /* SP_HS_INSTRLEN */ 377 378 OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 2); 379 OUT_RING(ring, COND(s[DS].v, A6XX_SP_DS_CONFIG_ENABLED)); /* SP_DS_CONFIG */ 380 OUT_RING(ring, s[DS].instrlen); /* SP_DS_INSTRLEN */ 381 382 OUT_PKT4(ring, REG_A6XX_SP_GS_UNKNOWN_A871, 1); 383 OUT_RING(ring, 0); 384 385 OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 2); 386 OUT_RING(ring, COND(s[GS].v, A6XX_SP_GS_CONFIG_ENABLED)); /* SP_GS_CONFIG */ 387 OUT_RING(ring, s[GS].instrlen); /* SP_GS_INSTRLEN */ 388 389 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A99E, 1); 390 OUT_RING(ring, 0x7fc0); 391 392 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); 393 OUT_RING(ring, 0); 394 395 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_AB00, 1); 396 OUT_RING(ring, 0x5); 397 398 OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 2); 399 OUT_RING(ring, COND(s[FS].v, A6XX_SP_FS_CONFIG_ENABLED) | 400 A6XX_SP_FS_CONFIG_NIBO(s[FS].v->image_mapping.num_ibo) | 401 A6XX_SP_FS_CONFIG_NTEX(s[FS].v->num_samp) | 402 A6XX_SP_FS_CONFIG_NSAMP(s[FS].v->num_samp)); /* SP_FS_CONFIG */ 403 OUT_RING(ring, s[FS].instrlen); /* SP_FS_INSTRLEN */ 404 405 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); 406 OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | 407 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | 408 0xfc000000); 409 410 OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); 411 OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(s[VS].constlen) | 412 A6XX_HLSQ_VS_CNTL_ENABLED); 413 OUT_RING(ring, A6XX_HLSQ_HS_CNTL_CONSTLEN(s[HS].constlen)); /* HLSQ_HS_CONSTLEN */ 414 OUT_RING(ring, A6XX_HLSQ_DS_CNTL_CONSTLEN(s[DS].constlen)); /* HLSQ_DS_CONSTLEN */ 415 OUT_RING(ring, A6XX_HLSQ_GS_CNTL_CONSTLEN(s[GS].constlen)); /* HLSQ_GS_CONSTLEN */ 416 417 OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); 418 OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(s[FS].constlen) | 419 A6XX_HLSQ_FS_CNTL_ENABLED); 420 421 OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); 422 OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_THREADSIZE(fssz) | 423 A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) | 424 A6XX_SP_VS_CTRL_REG0_MERGEDREGS | 425 A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(s[VS].v->branchstack) | 426 COND(s[VS].v->need_pixlod, A6XX_SP_VS_CTRL_REG0_PIXLODENABLE)); 427 428 struct ir3_shader_linkage l = {0}; 429 ir3_link_shaders(&l, s[VS].v, s[FS].v); 430 431 if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass) 432 link_stream_out(&l, s[VS].v); 433 434 BITSET_DECLARE(varbs, 128) = {0}; 435 uint32_t *varmask = (uint32_t *)varbs; 436 437 for (i = 0; i < l.cnt; i++) 438 for (j = 0; j < util_last_bit(l.var[i].compmask); j++) 439 BITSET_SET(varbs, l.var[i].loc + j); 440 441 OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); 442 OUT_RING(ring, ~varmask[0]); /* VPC_VAR[0].DISABLE */ 443 OUT_RING(ring, ~varmask[1]); /* VPC_VAR[1].DISABLE */ 444 OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */ 445 OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */ 446 447 /* a6xx appends pos/psize to end of the linkage map: */ 448 if (VALIDREG(pos_regid)) 449 ir3_link_add(&l, pos_regid, 0xf, l.max_loc); 450 451 if (VALIDREG(psize_regid)) { 452 psize_loc = l.max_loc; 453 ir3_link_add(&l, psize_regid, 0x1, l.max_loc); 454 } 455 456 if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass) { 457 setup_stream_out(state, s[VS].v, &l); 458 } 459 460 for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { 461 uint32_t reg = 0; 462 463 OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(i), 1); 464 465 reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); 466 reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); 467 j++; 468 469 reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); 470 reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); 471 j++; 472 473 OUT_RING(ring, reg); 474 } 475 476 for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { 477 uint32_t reg = 0; 478 479 OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(i), 1); 480 481 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); 482 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); 483 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); 484 reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); 485 486 OUT_RING(ring, reg); 487 } 488 489 OUT_PKT4(ring, REG_A6XX_SP_VS_OBJ_START_LO, 2); 490 OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_LO/HI */ 491 492 if (s[VS].instrlen) 493 fd6_emit_shader(ring, s[VS].v); 494 495 // TODO depending on other bits in this reg (if any) set somewhere else? 496#if 0 497 OUT_PKT4(ring, REG_A6XX_PC_PRIM_VTX_CNTL, 1); 498 OUT_RING(ring, COND(s[VS].v->writes_psize, A6XX_PC_PRIM_VTX_CNTL_PSIZE)); 499#endif 500 501 OUT_PKT4(ring, REG_A6XX_SP_PRIMITIVE_CNTL, 1); 502 OUT_RING(ring, A6XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt)); 503 504 bool enable_varyings = s[FS].v->total_in > 0; 505 506 OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); 507 OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(s[FS].v->total_in) | 508 COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) | 509 0xff00ff00); 510 511 OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1); 512 OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(l.max_loc) | 513 CONDREG(psize_regid, 0x100)); 514 515 if (binning_pass) { 516 OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2); 517 OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_LO */ 518 OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_HI */ 519 } else { 520 OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2); 521 OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_LO/HI */ 522 } 523 524 OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); 525 OUT_RING(ring, 0x7); /* XXX */ 526 OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | 527 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | 528 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | 529 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_size_regid)); 530 OUT_RING(ring, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(ij_pix_regid) | 531 A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(ij_cent_regid) | 532 0xfc00fc00); /* XXX */ 533 OUT_RING(ring, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | 534 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | 535 A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(ij_samp_regid) | 536 0x0000fc00); /* XXX */ 537 OUT_RING(ring, 0xfc); /* XXX */ 538 539 OUT_PKT4(ring, REG_A6XX_HLSQ_UNKNOWN_B980, 1); 540 OUT_RING(ring, enable_varyings ? 3 : 1); 541 542 OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1); 543 OUT_RING(ring, A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | 544 COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) | 545 COND(s[FS].v->frag_coord, A6XX_SP_FS_CTRL_REG0_VARYING) | 546 0x1000000 | 547 A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | 548 A6XX_SP_FS_CTRL_REG0_MERGEDREGS | 549 A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(s[FS].v->branchstack) | 550 COND(s[FS].v->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE)); 551 552 OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A982, 1); 553 OUT_RING(ring, 0); /* XXX */ 554 555 OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1); 556 OUT_RING(ring, 0xff); /* XXX */ 557 558 OUT_PKT4(ring, REG_A6XX_VPC_GS_SIV_CNTL, 1); 559 OUT_RING(ring, 0x0000ffff); /* XXX */ 560 561#if 0 562 OUT_PKT4(ring, REG_A6XX_SP_SP_CNTL, 1); 563 OUT_RING(ring, 0x00000010); /* XXX */ 564#endif 565 566 OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); 567 OUT_RING(ring, 568 CONDREG(ij_pix_regid, A6XX_GRAS_CNTL_VARYING) | 569 CONDREG(ij_cent_regid, A6XX_GRAS_CNTL_CENTROID) | 570 CONDREG(ij_samp_regid, A6XX_GRAS_CNTL_PERSAMP_VARYING) | 571 COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_GRAS_CNTL_SIZE) | 572 COND(VALIDREG(ij_size_regid) && sample_shading, A6XX_GRAS_CNTL_SIZE_PERSAMP) | 573 COND(s[FS].v->frag_coord, 574 A6XX_GRAS_CNTL_SIZE | 575 A6XX_GRAS_CNTL_XCOORD | 576 A6XX_GRAS_CNTL_YCOORD | 577 A6XX_GRAS_CNTL_ZCOORD | 578 A6XX_GRAS_CNTL_WCOORD) | 579 COND(s[FS].v->frag_face, A6XX_GRAS_CNTL_SIZE)); 580 581 OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); 582 OUT_RING(ring, 583 CONDREG(ij_pix_regid, A6XX_RB_RENDER_CONTROL0_VARYING) | 584 CONDREG(ij_cent_regid, A6XX_RB_RENDER_CONTROL0_CENTROID) | 585 CONDREG(ij_samp_regid, A6XX_RB_RENDER_CONTROL0_PERSAMP_VARYING) | 586 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | 587 COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE) | 588 COND(VALIDREG(ij_size_regid) && sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | 589 COND(s[FS].v->frag_coord, 590 A6XX_RB_RENDER_CONTROL0_SIZE | 591 A6XX_RB_RENDER_CONTROL0_XCOORD | 592 A6XX_RB_RENDER_CONTROL0_YCOORD | 593 A6XX_RB_RENDER_CONTROL0_ZCOORD | 594 A6XX_RB_RENDER_CONTROL0_WCOORD) | 595 COND(s[FS].v->frag_face, A6XX_RB_RENDER_CONTROL0_SIZE)); 596 597 OUT_RING(ring, 598 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | 599 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | 600 CONDREG(ij_size_regid, A6XX_RB_RENDER_CONTROL1_SIZE) | 601 COND(s[FS].v->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); 602 603 OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); 604 OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); 605 606 OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8101, 1); 607 OUT_RING(ring, COND(sample_shading, 0x6)); // XXX 608 609 OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); 610 OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); 611 612 OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); 613 for (i = 0; i < 8; i++) { 614 // TODO we could have a mix of half and full precision outputs, 615 // we really need to figure out half-precision from IR3_REG_HALF 616 OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | 617 COND(false, 618 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); 619 } 620 621 OUT_PKT4(ring, REG_A6XX_VPC_PACK, 1); 622 OUT_RING(ring, A6XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) | 623 A6XX_VPC_PACK_PSIZELOC(psize_loc) | 624 A6XX_VPC_PACK_STRIDE_IN_VPC(l.max_loc)); 625 626 if (!binning_pass) { 627 /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ 628 for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) { 629 /* NOTE: varyings are packed, so if compmask is 0xb 630 * then first, third, and fourth component occupy 631 * three consecutive varying slots: 632 */ 633 unsigned compmask = s[FS].v->inputs[j].compmask; 634 635 uint32_t inloc = s[FS].v->inputs[j].inloc; 636 637 if (s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) { 638 uint32_t loc = inloc; 639 640 for (i = 0; i < 4; i++) { 641 if (compmask & (1 << i)) { 642 state->vinterp[loc / 16] |= 1 << ((loc % 16) * 2); 643 loc++; 644 } 645 } 646 } 647 } 648 } 649 650 if (!binning_pass) 651 if (s[FS].instrlen) 652 fd6_emit_shader(ring, s[FS].v); 653 654 OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); 655 OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | 656 A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | 657 0xfcfc0000); 658 OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_2 */ 659 OUT_RING(ring, 0xfcfcfcfc); /* VFD_CONTROL_3 */ 660 OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ 661 OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_5 */ 662 OUT_RING(ring, 0x00000000); /* VFD_CONTROL_6 */ 663 664 bool fragz = s[FS].v->no_earlyz | s[FS].v->writes_pos; 665 666 OUT_PKT4(ring, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1); 667 OUT_RING(ring, COND(fragz, A6XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z)); 668 669 OUT_PKT4(ring, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); 670 OUT_RING(ring, COND(fragz, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z)); 671} 672 673/* emits the program state which is not part of the stateobj because of 674 * dependency on other gl state (rasterflat or sprite-coord-replacement) 675 */ 676void 677fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit) 678{ 679 const struct fd6_program_state *state = fd6_emit_get_prog(emit); 680 681 if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) { 682 /* fastpath: */ 683 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); 684 for (int i = 0; i < 8; i++) 685 OUT_RING(ring, state->vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ 686 687 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); 688 for (int i = 0; i < 8; i++) 689 OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[i] */ 690 } else { 691 /* slow-path: */ 692 struct ir3_shader_variant *fs = state->fs; 693 uint32_t vinterp[8], vpsrepl[8]; 694 695 memset(vinterp, 0, sizeof(vinterp)); 696 memset(vpsrepl, 0, sizeof(vpsrepl)); 697 698 for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) { 699 700 /* NOTE: varyings are packed, so if compmask is 0xb 701 * then first, third, and fourth component occupy 702 * three consecutive varying slots: 703 */ 704 unsigned compmask = fs->inputs[j].compmask; 705 706 uint32_t inloc = fs->inputs[j].inloc; 707 708 if ((fs->inputs[j].interpolate == INTERP_MODE_FLAT) || 709 (fs->inputs[j].rasterflat && emit->rasterflat)) { 710 uint32_t loc = inloc; 711 712 for (int i = 0; i < 4; i++) { 713 if (compmask & (1 << i)) { 714 vinterp[loc / 16] |= 1 << ((loc % 16) * 2); 715 loc++; 716 } 717 } 718 } 719 720 gl_varying_slot slot = fs->inputs[j].slot; 721 722 /* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */ 723 if (slot >= VARYING_SLOT_VAR0) { 724 unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0); 725 /* Replace the .xy coordinates with S/T from the point sprite. Set 726 * interpolation bits for .zw such that they become .01 727 */ 728 if (emit->sprite_coord_enable & texmask) { 729 /* mask is two 2-bit fields, where: 730 * '01' -> S 731 * '10' -> T 732 * '11' -> 1 - T (flip mode) 733 */ 734 unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001; 735 uint32_t loc = inloc; 736 if (compmask & 0x1) { 737 vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); 738 loc++; 739 } 740 if (compmask & 0x2) { 741 vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); 742 loc++; 743 } 744 if (compmask & 0x4) { 745 /* .z <- 0.0f */ 746 vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); 747 loc++; 748 } 749 if (compmask & 0x8) { 750 /* .w <- 1.0f */ 751 vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); 752 loc++; 753 } 754 } 755 } 756 } 757 758 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); 759 for (int i = 0; i < 8; i++) 760 OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ 761 762 OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); 763 for (int i = 0; i < 8; i++) 764 OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ 765 } 766} 767 768static struct ir3_program_state * 769fd6_program_create(void *data, struct ir3_shader_variant *bs, 770 struct ir3_shader_variant *vs, 771 struct ir3_shader_variant *fs, 772 const struct ir3_shader_key *key) 773{ 774 struct fd_context *ctx = data; 775 struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state); 776 777 state->bs = bs; 778 state->vs = vs; 779 state->fs = fs; 780 state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); 781 state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); 782 783 setup_stateobj(state->binning_stateobj, state, key, true); 784 setup_stateobj(state->stateobj, state, key, false); 785 786 return &state->base; 787} 788 789static void 790fd6_program_destroy(void *data, struct ir3_program_state *state) 791{ 792 struct fd6_program_state *so = fd6_program_state(state); 793 fd_ringbuffer_del(so->stateobj); 794 fd_ringbuffer_del(so->binning_stateobj); 795 free(so); 796} 797 798static const struct ir3_cache_funcs cache_funcs = { 799 .create_state = fd6_program_create, 800 .destroy_state = fd6_program_destroy, 801}; 802 803void 804fd6_prog_init(struct pipe_context *pctx) 805{ 806 struct fd_context *ctx = fd_context(pctx); 807 808 fd6_context(ctx)->shader_cache = ir3_cache_create(&cache_funcs, ctx); 809 810 pctx->create_fs_state = fd6_fp_state_create; 811 pctx->delete_fs_state = fd6_fp_state_delete; 812 813 pctx->create_vs_state = fd6_vp_state_create; 814 pctx->delete_vs_state = fd6_vp_state_delete; 815 816 fd_prog_init(pctx); 817} 818