/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_viewport.h"

#include "common/freedreno_guardband.h"
#include "freedreno_query_hw.h"
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "freedreno_tracepoints.h"

#include "fd6_blend.h"
#include "fd6_const.h"
#include "fd6_context.h"
#include "fd6_emit.h"
#include "fd6_format.h"
#include "fd6_image.h"
#include "fd6_pack.h"
#include "fd6_program.h"
#include "fd6_rasterizer.h"
#include "fd6_texture.h"
#include "fd6_zsa.h"

/* Border color layout is diff from a4xx/a5xx.. if it turns out to be
 * the same as a6xx then move this somewhere common ;-)
 *
 * Entry layout looks like (total size, 0x80 bytes):
 */

struct PACKED bcolor_entry {
   uint32_t fp32[4];
   uint16_t ui16[4];
   int16_t si16[4];
   uint16_t fp16[4];
   uint16_t rgb565;
   uint16_t rgb5a1;
   uint16_t rgba4;
   uint8_t __pad0[2];
   uint8_t ui8[4];
   int8_t si8[4];
   uint32_t rgb10a2;
   uint32_t z24; /* also s8? */
   uint16_t
      srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
   uint8_t __pad1[56];
};

#define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry)
#define FD6_BORDER_COLOR_UPLOAD_SIZE                                           \
   (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE)

static void
setup_border_colors(struct fd_texture_stateobj *tex,
                    struct bcolor_entry *entries)
{
   unsigned i, j;
   STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);

   for (i = 0; i < tex->num_samplers; i++) {
      struct bcolor_entry *e = &entries[i];
      struct pipe_sampler_state *sampler = tex->samplers[i];
      union pipe_color_union *bc;

      if (!sampler)
         continue;

      bc = &sampler->border_color;

      /*
       * XXX HACK ALERT XXX
       *
       * The border colors need to be swizzled in a particular
       * format-dependent order. Even though samplers don't know about
       * formats, we can assume that with a GL state tracker, there's a
       * 1:1 correspondence between sampler and texture. Take advantage
       * of that knowledge.
       */
      if ((i >= tex->num_textures) || !tex->textures[i])
         continue;

      struct pipe_sampler_view *view = tex->textures[i];
      enum pipe_format format = view->format;
      const struct util_format_description *desc =
         util_format_description(format);
      const struct fd_resource *rsc = fd_resource(view->texture);

      e->rgb565 = 0;
      e->rgb5a1 = 0;
      e->rgba4 = 0;
      e->rgb10a2 = 0;
      e->z24 = 0;

      unsigned char swiz[4];

      fd6_tex_swiz(format, rsc->layout.tile_mode, swiz, view->swizzle_r,
                   view->swizzle_g, view->swizzle_b, view->swizzle_a);

      for (j = 0; j < 4; j++) {
         int c = swiz[j];
         int cd = c;

         /*
          * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
          * stencil border color value in bc->ui[0] but according
          * to desc->swizzle and desc->channel, the .x/.w component
          * is NONE and the stencil value is in the y component.
          * Meanwhile the hardware wants this in the .w component
          * for x24s8 and the .x component for x32_s8x24.
          */
         if ((format == PIPE_FORMAT_X24S8_UINT) ||
             (format == PIPE_FORMAT_X32_S8X24_UINT)) {
            if (j == 0) {
               c = 1;
               cd = (format == PIPE_FORMAT_X32_S8X24_UINT) ? 0 : 3;
            } else {
               continue;
            }
         }

         if (c >= 4)
            continue;

         if (desc->channel[c].pure_integer) {
            uint16_t clamped;
            switch (desc->channel[c].size) {
            case 2:
               assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
               clamped = CLAMP(bc->ui[j], 0, 0x3);
               break;
            case 8:
               if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
                  clamped = CLAMP(bc->i[j], -128, 127);
               else
                  clamped = CLAMP(bc->ui[j], 0, 255);
               break;
            case 10:
               assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
               clamped = CLAMP(bc->ui[j], 0, 0x3ff);
               break;
            case 16:
               if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
                  clamped = CLAMP(bc->i[j], -32768, 32767);
               else
                  clamped = CLAMP(bc->ui[j], 0, 65535);
               break;
            default:
               assert(!"Unexpected bit size");
            case 32:
               clamped = 0;
               break;
            }
            e->fp32[cd] = bc->ui[j];
            e->fp16[cd] = clamped;
         } else {
            float f = bc->f[j];
            float f_u = CLAMP(f, 0, 1);
            float f_s = CLAMP(f, -1, 1);

            e->fp32[c] = fui(f);
            e->fp16[c] = _mesa_float_to_half(f);
            e->srgb[c] = _mesa_float_to_half(f_u);
            e->ui16[c] = f_u * 0xffff;
            e->si16[c] = f_s * 0x7fff;
            e->ui8[c] = f_u * 0xff;
            e->si8[c] = f_s * 0x7f;
            if (c == 1)
               e->rgb565 |= (int)(f_u * 0x3f) << 5;
            else if (c < 3)
               e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
            if (c == 3)
               e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0;
            else
               e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
            if (c == 3)
               e->rgb10a2 |= (int)(f_u * 0x3) << 30;
            else
               e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
            e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
            if (c == 0)
               e->z24 = f_u * 0xffffff;
         }
      }

#ifdef DEBUG
      memset(&e->__pad0, 0, sizeof(e->__pad0));
      memset(&e->__pad1, 0, sizeof(e->__pad1));
#endif
   }
}

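/* A rough worked example of the packing done above (illustrative only,
 * assuming an identity swizzle so c == j):  a border color of
 * (1.0, 0.0, 0.0, 1.0) should come out roughly as
 *
 *    fp32    = { 0x3f800000, 0, 0, 0x3f800000 }
 *    rgb565  = 0x001f       (R in bits [4:0], G in [10:5], B in [15:11])
 *    rgb5a1  = 0x801f       (A > 0.5 sets bit 15)
 *    rgba4   = 0xf00f
 *    rgb10a2 = 0xc00003ff
 *    z24     = 0xffffff     (taken from the .x channel)
 */
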
static void
emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct bcolor_entry *entries;
   unsigned off;
   void *ptr;

   STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);

   u_upload_alloc(fd6_ctx->border_color_uploader, 0,
                  FD6_BORDER_COLOR_UPLOAD_SIZE, FD6_BORDER_COLOR_UPLOAD_SIZE,
                  &off, &fd6_ctx->border_color_buf, &ptr);

   entries = ptr;

   setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]);
   setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
                       &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]);

   OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
   OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0);

   u_upload_unmap(fd6_ctx->border_color_uploader);
}

static void
fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) assert_dt
{
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct pipe_surface *psurf = pfb->cbufs[0];
   struct fd_resource *rsc = fd_resource(psurf->texture);

   OUT_RINGP(state, 0, &ctx->batch->fb_read_patches); /* texconst0, patched in gmem emit */
   OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
                      A6XX_TEX_CONST_1_HEIGHT(pfb->height));
   OUT_RING(state, 0); /* texconst2, patched in gmem emit */
   OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size));
   OUT_RING(state, 0); /* BASE_LO, patched in gmem emit */
   OUT_RING(state, 0); /* BASE_HI, patched in gmem emit */
   OUT_RING(state, 0); /* texconst6 */
   OUT_RING(state, 0); /* texconst7 */
   OUT_RING(state, 0); /* texconst8 */
   OUT_RING(state, 0); /* texconst9 */
   OUT_RING(state, 0); /* texconst10 */
   OUT_RING(state, 0); /* texconst11 */
   OUT_RING(state, 0);
   OUT_RING(state, 0);
   OUT_RING(state, 0);
   OUT_RING(state, 0);
}

bool
fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  enum pipe_shader_type type, struct fd_texture_stateobj *tex,
                  unsigned bcolor_offset,
                  /* can be NULL if no image/SSBO/fb state to merge in: */
                  const struct ir3_shader_variant *v)
{
   bool needs_border = false;
   unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg;
   enum a6xx_state_block sb;

   switch (type) {
   case PIPE_SHADER_VERTEX:
      sb = SB6_VS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_VS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
      break;
   case PIPE_SHADER_TESS_CTRL:
      sb = SB6_HS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_HS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT;
      break;
   case PIPE_SHADER_TESS_EVAL:
      sb = SB6_DS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_DS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT;
      break;
   case PIPE_SHADER_GEOMETRY:
      sb = SB6_GS_TEX;
      opcode = CP_LOAD_STATE6_GEOM;
      tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_GS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT;
      break;
   case PIPE_SHADER_FRAGMENT:
      sb = SB6_FS_TEX;
      opcode = CP_LOAD_STATE6_FRAG;
      tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_FS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
      break;
   case PIPE_SHADER_COMPUTE:
      sb = SB6_CS_TEX;
      opcode = CP_LOAD_STATE6_FRAG;
      tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP;
      tex_const_reg = REG_A6XX_SP_CS_TEX_CONST;
      tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
      break;
   default:
      unreachable("bad state block");
   }

   if (tex->num_samplers > 0) {
      struct fd_ringbuffer *state =
         fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4);
      for (unsigned i = 0; i < tex->num_samplers; i++) {
         static const struct fd6_sampler_stateobj dummy_sampler = {};
         const struct fd6_sampler_stateobj *sampler =
            tex->samplers[i] ? fd6_sampler_stateobj(tex->samplers[i])
                             : &dummy_sampler;
         OUT_RING(state, sampler->texsamp0);
         OUT_RING(state, sampler->texsamp1);
         OUT_RING(state, sampler->texsamp2 |
                            A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset));
         OUT_RING(state, sampler->texsamp3);
         needs_border |= sampler->needs_border;
      }

      /* output sampler state: */
      OUT_PKT7(ring, opcode, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers));
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      OUT_PKT4(ring, tex_samp_reg, 2);
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      fd_ringbuffer_del(state);
   }

   unsigned num_merged_textures = tex->num_textures;
   unsigned num_textures = tex->num_textures;
   if (v) {
      num_merged_textures += v->image_mapping.num_tex;

      if (v->fb_read)
         num_merged_textures++;

      /* There could be more bound textures than what the shader uses,
       * which isn't known at shader compile time.
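       *
       * Roughly, the merged texture state written below ends up laid
       * out as:
       *
       *   [0 .. tex_base)                   sampler-views bound by the API
       *   [tex_base .. tex_base + num_tex)  tex descriptors for images/SSBOs
       *   [+1, optionally]                  the fb_read descriptor
       *                                     (fd6_emit_fb_tex())
       *
       * where tex_base/num_tex come from the variant's ir3_ibo_mapping.
       *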
       * So in the case we are merging tex state, only emit the textures
       * that the shader uses (since the image/SSBO related tex state
       * comes immediately after)
       */
      num_textures = v->image_mapping.tex_base;
   }

   if (num_merged_textures > 0) {
      struct fd_ringbuffer *state =
         fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4);
      for (unsigned i = 0; i < num_textures; i++) {
         const struct fd6_pipe_sampler_view *view;

         if (tex->textures[i]) {
            view = fd6_pipe_sampler_view(tex->textures[i]);
            if (unlikely(view->rsc_seqno !=
                         fd_resource(view->base.texture)->seqno)) {
               fd6_sampler_view_update(ctx,
                                       fd6_pipe_sampler_view(tex->textures[i]));
            }
         } else {
            static const struct fd6_pipe_sampler_view dummy_view = {};
            view = &dummy_view;
         }

         OUT_RING(state, view->texconst0);
         OUT_RING(state, view->texconst1);
         OUT_RING(state, view->texconst2);
         OUT_RING(state, view->texconst3);

         if (view->ptr1) {
            OUT_RELOC(state, view->ptr1->bo, view->offset1,
                      (uint64_t)view->texconst5 << 32, 0);
         } else {
            OUT_RING(state, 0x00000000);
            OUT_RING(state, view->texconst5);
         }

         OUT_RING(state, view->texconst6);

         if (view->ptr2) {
            OUT_RELOC(state, view->ptr2->bo, view->offset2, 0, 0);
         } else {
            OUT_RING(state, 0);
            OUT_RING(state, 0);
         }

         OUT_RING(state, view->texconst9);
         OUT_RING(state, view->texconst10);
         OUT_RING(state, view->texconst11);
         OUT_RING(state, 0);
         OUT_RING(state, 0);
         OUT_RING(state, 0);
         OUT_RING(state, 0);
      }

      if (v) {
         const struct ir3_ibo_mapping *mapping = &v->image_mapping;
         struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type];
         struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type];

         for (unsigned i = 0; i < mapping->num_tex; i++) {
            unsigned idx = mapping->tex_to_image[i];
            if (idx & IBO_SSBO) {
               fd6_emit_ssbo_tex(state, &buf->sb[idx & ~IBO_SSBO]);
            } else {
               fd6_emit_image_tex(state, &img->si[idx]);
            }
         }

         if (v->fb_read) {
            fd6_emit_fb_tex(state, ctx);
         }
      }

      /* emit texture state: */
      OUT_PKT7(ring, opcode, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures));
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      OUT_PKT4(ring, tex_const_reg, 2);
      OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

      fd_ringbuffer_del(state);
   }

   OUT_PKT4(ring, tex_count_reg, 1);
   OUT_RING(ring, num_merged_textures);

   return needs_border;
}

/* Emits combined texture state, which also includes any Image/SSBO
 * related texture state merged in (because we must have all texture
 * state for a given stage in a single buffer).  In the fast-path, if
 * we don't need to merge in any image/ssbo related texture state, we
 * just use cached texture stateobj.  Otherwise we generate a single-
 * use stateobj.
 *
 * TODO Is there some sane way we can still use cached texture stateobj
 * with image/ssbo in use?
 *
 * returns whether border_color is required:
 */
static bool
fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
                           enum pipe_shader_type type,
                           const struct ir3_shader_variant *v) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   bool needs_border = false;

   static const struct {
      enum fd6_state_id state_id;
      unsigned enable_mask;
   } s[PIPE_SHADER_TYPES] = {
      [PIPE_SHADER_VERTEX] = {FD6_GROUP_VS_TEX, ENABLE_ALL},
      [PIPE_SHADER_TESS_CTRL] = {FD6_GROUP_HS_TEX, ENABLE_ALL},
      [PIPE_SHADER_TESS_EVAL] = {FD6_GROUP_DS_TEX, ENABLE_ALL},
      [PIPE_SHADER_GEOMETRY] = {FD6_GROUP_GS_TEX, ENABLE_ALL},
      [PIPE_SHADER_FRAGMENT] = {FD6_GROUP_FS_TEX, ENABLE_DRAW},
   };

   debug_assert(s[type].state_id);

   if (!v->image_mapping.num_tex && !v->fb_read) {
      /* in the fast-path, when we don't have to mix in any image/SSBO
       * related texture state, we can just lookup the stateobj and
       * re-emit that:
       *
       * Also, framebuffer-read is a slow-path because an extra
       * texture needs to be inserted.
       *
       * TODO we can probably simplify things if we also treated
       * border_color as a slow-path.. this way the tex state key
       * wouldn't depend on bcolor_offset.. but fb_read might rather
       * be *somehow* a fast-path if we eventually used it for PLS.
       * I suppose there would be no harm in just *always* inserting
       * an fb_read texture?
       */
      if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
          ctx->tex[type].num_textures > 0) {
         struct fd6_texture_state *tex =
            fd6_texture_state(ctx, type, &ctx->tex[type]);

         needs_border |= tex->needs_border;

         fd6_emit_add_group(emit, tex->stateobj, s[type].state_id,
                            s[type].enable_mask);

         fd6_texture_state_reference(&tex, NULL);
      }
   } else {
      /* In the slow-path, create a one-shot texture state object
       * if either TEX|PROG|SSBO|IMAGE state is dirty:
       */
      if ((ctx->dirty_shader[type] &
           (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE |
            FD_DIRTY_SHADER_SSBO)) ||
          v->fb_read) {
         struct fd_texture_stateobj *tex = &ctx->tex[type];
         struct fd_ringbuffer *stateobj = fd_submit_new_ringbuffer(
            ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
         unsigned bcolor_offset = fd6_border_color_offset(ctx, type, tex);

         needs_border |=
            fd6_emit_textures(ctx, stateobj, type, tex, bcolor_offset, v);

         fd6_emit_take_group(emit, stateobj, s[type].state_id,
                             s[type].enable_mask);
      }
   }

   return needs_border;
}

static struct fd_ringbuffer *
build_vbo_state(struct fd6_emit *emit) assert_dt
{
   const struct fd_vertex_state *vtx = emit->vtx;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 4 * (1 + vtx->vertexbuf.count * 4),
      FD_RINGBUFFER_STREAMING);

   OUT_PKT4(ring, REG_A6XX_VFD_FETCH(0), 4 * vtx->vertexbuf.count);
   for (int32_t j = 0; j < vtx->vertexbuf.count; j++) {
      const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
      struct fd_resource *rsc = fd_resource(vb->buffer.resource);
      if (rsc == NULL) {
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
         OUT_RING(ring, 0);
      } else {
         uint32_t off = vb->buffer_offset;
         uint32_t size = fd_bo_size(rsc->bo) - off;

         OUT_RELOC(ring, rsc->bo, off, 0, 0);
         OUT_RING(ring, size);       /* VFD_FETCH[j].SIZE */
         OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */
      }
   }

   return ring;
}

static enum a6xx_ztest_mode
compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   const struct ir3_shader_variant *fs = emit->fs;

   if (fs->shader->nir->info.fs.early_fragment_tests)
      return A6XX_EARLY_Z;

   if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled ||
       fs->writes_stencilref) {
      return A6XX_LATE_Z;
   } else if ((fs->has_kill || zsa->alpha_test) &&
              (zsa->writes_zs || !pfb->zsbuf)) {
      /* Slightly odd, but seems like the hw wants us to select
       * LATE_Z mode if there is no depth buffer + discard.  Either
       * that, or when occlusion query is enabled.  See:
       *
       * dEQP-GLES31.functional.fbo.no_attachments.*
       */
      return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
   } else {
      return A6XX_EARLY_Z;
   }
}

/**
 * Calculate normalized LRZ state based on zsa/prog/blend state, updating
 * the zsbuf's lrz state as necessary to detect the cases where we need
 * to invalidate lrz.
 */
static struct fd6_lrz_state
compute_lrz_state(struct fd6_emit *emit, bool binning_pass) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct ir3_shader_variant *fs = emit->fs;
   struct fd6_lrz_state lrz;

   if (!pfb->zsbuf) {
      memset(&lrz, 0, sizeof(lrz));
      if (!binning_pass) {
         lrz.z_mode = compute_ztest_mode(emit, false);
      }
      return lrz;
   }

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
   struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
   struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);

   lrz = zsa->lrz;

   /* normalize lrz state: */
   if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill) {
      lrz.write = false;
      if (binning_pass)
         lrz.enable = false;
   }

   /* if we change depthfunc direction, bail out on using LRZ.  The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
       (rsc->lrz_direction != lrz.direction)) {
      rsc->lrz_valid = false;
   }

   if (zsa->invalidate_lrz || !rsc->lrz_valid) {
      rsc->lrz_valid = false;
      memset(&lrz, 0, sizeof(lrz));
   }

   if (fs->no_earlyz || fs->writes_pos) {
      lrz.enable = false;
      lrz.write = false;
      lrz.test = false;
   }

   if (!binning_pass) {
      lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);
   }

   /* Once we start writing to the real depth buffer, we lock in the
    * direction for LRZ.. if we have to skip a LRZ write for any
    * reason, it is still safe to have LRZ until there is a direction
    * reversal.  Prior to the reversal, since we disabled LRZ writes
    * in the "unsafe" cases, this just means that the LRZ test may
    * not early-discard some things that end up not passing a later
    * test (ie. be overly conservative).  But once you have a reversal
    * of direction, it is possible to increase/decrease the z value
    * to the point where the overly-conservative test is incorrect.
    */
   if (zsa->base.depth_writemask) {
      rsc->lrz_direction = lrz.direction;
   }

   return lrz;
}

static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit, bool binning_pass) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd6_lrz_state lrz = compute_lrz_state(emit, binning_pass);

   /* If the LRZ state has not changed, we can skip the emit: */
   if (!ctx->last.dirty &&
       !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz)))
      return NULL;

   fd6_ctx->last.lrz[binning_pass] = lrz;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring,
           A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write,
                              .greater = lrz.direction == FD_LRZ_GREATER,
                              .z_test_enable = lrz.test, ));
   OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));

   OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));

   return ring;
}

static struct fd_ringbuffer *
build_scissor(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      emit->ctx->batch->submit, 3 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(
      ring,
      A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = scissor->minx, .y = scissor->miny),
      A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
                                     .y = MAX2(scissor->maxy, 1) - 1));

   ctx->batch->max_scissor.minx =
      MIN2(ctx->batch->max_scissor.minx, scissor->minx);
   ctx->batch->max_scissor.miny =
      MIN2(ctx->batch->max_scissor.miny, scissor->miny);
   ctx->batch->max_scissor.maxx =
      MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
   ctx->batch->max_scissor.maxy =
      MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);

   return ring;
}

/* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
 * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
 */
static struct fd_ringbuffer *
build_prog_fb_rast(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *fs = emit->fs;

   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);

   unsigned nr = pfb->nr_cbufs;

   if (ctx->rasterizer->rasterizer_discard)
      nr = 0;

   struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);

   if (blend->use_dual_src_blend)
      nr++;

   OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                     COND(fs->writes_smask && pfb->samples > 1,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                     COND(fs->writes_stencilref,
                          A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                     COND(blend->use_dual_src_blend,
                          A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));

   OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
   OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));

   unsigned mrt_components = 0;
   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      if (!pfb->cbufs[i])
         continue;
      mrt_components |= 0xf << (i * 4);
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (blend->use_dual_src_blend)
      mrt_components |= 0xf << 4;

   mrt_components &= prog->mrt_components;

   OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
   OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));

   return ring;
}

static struct fd_ringbuffer *
build_blend_color(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_blend_color *bcolor = &ctx->blend_color;
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);

   OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
           A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
           A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
           A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));

   return ring;
}

static struct fd_ringbuffer *
build_ibo(struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;

   if (emit->hs) {
      debug_assert(ir3_shader_nibo(emit->hs) == 0);
      debug_assert(ir3_shader_nibo(emit->ds) == 0);
   }
   if (emit->gs) {
      debug_assert(ir3_shader_nibo(emit->gs) == 0);
   }

   struct fd_ringbuffer *ibo_state =
      fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT);
   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
      ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING);

   OUT_PKT7(ring, CP_LOAD_STATE6, 3);
   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                     CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
                     CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs)));
   OUT_RB(ring, ibo_state);

   OUT_PKT4(ring, REG_A6XX_SP_IBO, 2);
   OUT_RB(ring, ibo_state);

   /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could
    * de-duplicate this from program->config_stateobj
    */
   OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
   OUT_RING(ring, ir3_shader_nibo(emit->fs));

   fd_ringbuffer_del(ibo_state);

   return ring;
}

static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   struct ir3_stream_output_info *info = prog->stream_output;
   struct fd_streamout_stateobj *so = &ctx->streamout;

   emit->streamout_mask = 0;

   if (!info)
      return;

   for (unsigned i = 0; i < so->num_targets; i++) {
      struct fd_stream_output_target *target =
         fd_stream_output_target(so->targets[i]);

      if (!target)
         continue;

      target->stride = info->stride[i];

      OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
      /* VPC_SO[i].BUFFER_BASE_LO: */
      OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
      OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset);

      struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;

      if (so->reset & (1 << i)) {
         assert(so->offsets[i] == 0);

         OUT_PKT7(ring, CP_MEM_WRITE, 3);
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
         OUT_RING(ring, target->base.buffer_offset);

         OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
         OUT_RING(ring, target->base.buffer_offset);
      } else {
         OUT_PKT7(ring, CP_MEM_TO_REG, 3);
         OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
                           CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
                           CP_MEM_TO_REG_0_CNT(0));
         OUT_RELOC(ring, offset_bo, 0, 0, 0);
      }

      // After a draw HW would write the new offset to offset_bo
      OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      OUT_RELOC(ring, offset_bo, 0, 0, 0);

      so->reset &= ~(1 << i);

      emit->streamout_mask |= (1 << i);
   }

   if (emit->streamout_mask) {
      fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO,
                         ENABLE_ALL);
   } else if (ctx->last.streamout_mask != 0) {
      /* If we transition from a draw with streamout to one without, turn
       * off streamout.
       */
      fd6_emit_add_group(emit, fd6_context(ctx)->streamout_disable_stateobj,
                         FD6_GROUP_SO, ENABLE_ALL);
   }

   /* Make sure that any use of our TFB outputs (indirect draw source or shader
    * UBO reads) comes after the TFB output is written.  From the GL 4.6 core
    * spec:
    *
    *    "Buffers should not be bound or in use for both transform feedback and
    *     other purposes in the GL.  Specifically, if a buffer object is
    *     simultaneously bound to a transform feedback buffer binding point
    *     and elsewhere in the GL, any writes to or reads from the buffer
    *     generate undefined values."
    *
    * So we idle whenever SO buffers change.  Note that this function is called
    * on every draw with TFB enabled, so check the dirty flag for the buffers
    * themselves.
    */
   if (ctx->dirty & FD_DIRTY_STREAMOUT)
      fd_wfi(ctx->batch, ring);

   ctx->last.streamout_mask = emit->streamout_mask;
}

/**
 * Stuff that less frequently changes and isn't (yet) moved into stategroups
 */
static void
fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
{
   struct fd_context *ctx = emit->ctx;
   const enum fd_dirty_3d_state dirty = emit->dirty;

   if (dirty & FD_DIRTY_STENCIL_REF) {
      struct pipe_stencil_ref *sr = &ctx->stencil_ref;

      OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
      OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
                        A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
   }

   if (dirty & FD_DIRTY_VIEWPORT) {
      struct pipe_scissor_state *scissor = &ctx->viewport_scissor;

      OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]),
              A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]),
              A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]),
              A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]),
              A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]),
              A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2]));

      OUT_REG(
         ring,
         A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = scissor->minx,
                                          .y = scissor->miny),
         A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
                                          .y = MAX2(scissor->maxy, 1) - 1));

      unsigned guardband_x = fd_calc_guardband(ctx->viewport.translate[0],
                                               ctx->viewport.scale[0], false);
      unsigned guardband_y = fd_calc_guardband(ctx->viewport.translate[1],
                                               ctx->viewport.scale[1], false);

      OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = guardband_x,
                                                    .vert = guardband_y));
   }

   /* The clamp ranges are only used when the rasterizer wants depth
    * clamping.
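    *
    * E.g. (rough sketch): with GL conventions (clip_halfz == false) and a
    * viewport where translate[2] == 0.5 and scale[2] == 0.5,
    * util_viewport_zmin_zmax() should yield zmin == 0.0 and zmax == 1.0
    * (translate[2] -/+ scale[2]); with clip_halfz the range would be
    * [translate[2], translate[2] + scale[2]] instead.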
    */
   if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) &&
       fd_depth_clamp_enabled(ctx)) {
      float zmin, zmax;
      util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
                              &zmin, &zmax);

      OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin),
              A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax));

      OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
   }
}

void
fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
   struct fd_context *ctx = emit->ctx;
   struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
   const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
   const struct ir3_shader_variant *vs = emit->vs;
   const struct ir3_shader_variant *hs = emit->hs;
   const struct ir3_shader_variant *ds = emit->ds;
   const struct ir3_shader_variant *gs = emit->gs;
   const struct ir3_shader_variant *fs = emit->fs;
   bool needs_border = false;

   emit_marker6(ring, 5);

   /* NOTE: we track fb_read differently than _BLEND_ENABLED since we
    * might decide to do sysmem in some cases when blend is enabled:
    */
   if (fs->fb_read)
      ctx->batch->gmem_reason |= FD_GMEM_FB_READ;

   u_foreach_bit (b, emit->dirty_groups) {
      enum fd6_state_id group = b;
      struct fd_ringbuffer *state = NULL;
      uint32_t enable_mask = ENABLE_ALL;

      switch (group) {
      case FD6_GROUP_VTXSTATE:
         state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_VBO:
         state = build_vbo_state(emit);
         break;
      case FD6_GROUP_ZSA:
         state = fd6_zsa_state(
            ctx,
            util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
            fd_depth_clamp_enabled(ctx));
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_LRZ:
         state = build_lrz(emit, false);
         if (!state)
            continue;
         enable_mask = ENABLE_DRAW;
         break;
      case FD6_GROUP_LRZ_BINNING:
         state = build_lrz(emit, true);
         if (!state)
            continue;
         enable_mask = CP_SET_DRAW_STATE__0_BINNING;
         break;
      case FD6_GROUP_SCISSOR:
         state = build_scissor(emit);
         break;
      case FD6_GROUP_PROG:
         fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG,
                            ENABLE_ALL);
         fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW);
         fd6_emit_add_group(emit, prog->binning_stateobj,
                            FD6_GROUP_PROG_BINNING,
                            CP_SET_DRAW_STATE__0_BINNING);

         /* emit remaining streaming program state, ie. what depends on
          * other emit state, so cannot be pre-baked.
          */
         fd6_emit_take_group(emit, fd6_program_interp_state(emit),
                             FD6_GROUP_PROG_INTERP, ENABLE_DRAW);
         continue;
      case FD6_GROUP_RASTERIZER:
         state = fd6_rasterizer_state(ctx, emit->primitive_restart);
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_PROG_FB_RAST:
         state = build_prog_fb_rast(emit);
         break;
      case FD6_GROUP_BLEND:
         state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)
                    ->stateobj;
         fd_ringbuffer_ref(state);
         break;
      case FD6_GROUP_BLEND_COLOR:
         state = build_blend_color(emit);
         break;
      case FD6_GROUP_IBO:
         state = build_ibo(emit);
         break;
      case FD6_GROUP_CONST:
         state = fd6_build_user_consts(emit);
         break;
      case FD6_GROUP_VS_DRIVER_PARAMS:
         state = fd6_build_vs_driver_params(emit);
         break;
      case FD6_GROUP_PRIMITIVE_PARAMS:
         state = fd6_build_tess_consts(emit);
         break;
      case FD6_GROUP_VS_TEX:
         needs_border |=
            fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs);
         continue;
      case FD6_GROUP_HS_TEX:
         if (hs) {
            needs_border |= fd6_emit_combined_textures(
               ring, emit, PIPE_SHADER_TESS_CTRL, hs);
         }
         continue;
      case FD6_GROUP_DS_TEX:
         if (ds) {
            needs_border |= fd6_emit_combined_textures(
               ring, emit, PIPE_SHADER_TESS_EVAL, ds);
         }
         continue;
      case FD6_GROUP_GS_TEX:
         if (gs) {
            needs_border |=
               fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs);
         }
         continue;
      case FD6_GROUP_FS_TEX:
         needs_border |=
            fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs);
         continue;
      case FD6_GROUP_SO:
         fd6_emit_streamout(ring, emit);
         continue;
      case FD6_GROUP_NON_GROUP:
         fd6_emit_non_ring(ring, emit);
         continue;
      default:
         unreachable("bad state group");
      }

      fd6_emit_take_group(emit, state, group, enable_mask);
   }

   if (needs_border)
      emit_border_color(ctx, ring);

   if (emit->num_groups > 0) {
      OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups);
      for (unsigned i = 0; i < emit->num_groups; i++) {
         struct fd6_state_group *g = &emit->groups[i];
         unsigned n = g->stateobj ? fd_ringbuffer_size(g->stateobj) / 4 : 0;

         debug_assert((g->enable_mask & ~ENABLE_ALL) == 0);

         if (n == 0) {
            OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                              CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask |
                              CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
            OUT_RING(ring, 0x00000000);
            OUT_RING(ring, 0x00000000);
         } else {
            OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask |
                              CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
            OUT_RB(ring, g->stateobj);
         }

         if (g->stateobj)
            fd_ringbuffer_del(g->stateobj);
      }
      emit->num_groups = 0;
   }
}

void
fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                  struct ir3_shader_variant *cp)
{
   enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];

   if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
                FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
      struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE];
      unsigned bcolor_offset =
         fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex);

      bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex,
                                            bcolor_offset, cp);

      if (needs_border)
         emit_border_color(ctx, ring);

      OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1);
      OUT_RING(ring, 0);

      OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1);
      OUT_RING(ring, 0);
   }

   if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
      struct fd_ringbuffer *state =
         fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE);

      OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
                        CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                        CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
                        CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp)));
      OUT_RB(ring, state);

      OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2);
      OUT_RB(ring, state);

      OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
      OUT_RING(ring, ir3_shader_nibo(cp));

      fd_ringbuffer_del(state);
   }
}

/* emit setup at the beginning of a new cmdstream buffer (don't rely on
 * previous state, there could have been a context switch between ioctls):
 */
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_screen *screen = batch->ctx->screen;

   if (!batch->nondraw) {
      trace_start_state_restore(&batch->trace, ring);
   }

   fd6_cache_inv(batch, ring);

   OUT_REG(ring,
           A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true,
                                    .ds_state = true, .gs_state = true,
                                    .fs_state = true, .cs_state = true,
                                    .gfx_ibo = true, .cs_ibo = true,
                                    .gfx_shared_const = true,
                                    .cs_shared_const = true,
                                    .gfx_bindless = 0x1f, .cs_bindless = 0x1f));

   OUT_WFI5(ring);

   WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0);
   WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
   WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0);
   WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

   WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0);
   WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
   WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000);
   WRITE(REG_A6XX_SP_CHICKEN_BITS, 0x1430);
   WRITE(REG_A6XX_SP_IBO_COUNT, 0);
   WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
   WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
   WRITE(REG_A6XX_UCHE_CLIENT_PF, 4);
   WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1);
   WRITE(REG_A6XX_SP_MODE_CONTROL,
         A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
   WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f);

   WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
   WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);

   WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
   WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);

   WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
   WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);

   WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);

   WRITE(REG_A6XX_PC_RASTER_CNTL, 0);

   WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);

   WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);

   WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
   WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
   WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
   WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
   WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
   WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
   WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0);
   /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
    * but this seems to kill texture gather offsets.
    */
   WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
         A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
   WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0);
   WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0);
   WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0);
   WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0);
   WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

   emit_marker6(ring, 7);

   OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */

   WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);

   OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
   OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */

   /* Clear any potential pending state groups to be safe: */
   OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
   OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
   OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */

   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
   OUT_RING(ring, 0x00000000);

   if (!batch->nondraw) {
      trace_end_state_restore(&batch->trace, ring);
   }
}

static void
fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
               unsigned dst_off, struct pipe_resource *src, unsigned src_off,
               unsigned sizedwords)
{
   struct fd_bo *src_bo = fd_resource(src)->bo;
   struct fd_bo *dst_bo = fd_resource(dst)->bo;
   unsigned i;

   for (i = 0; i < sizedwords; i++) {
      OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
      OUT_RING(ring, 0x00000000);
      OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
      OUT_RELOC(ring, src_bo, src_off, 0, 0);

      dst_off += 4;
      src_off += 4;
   }
}

/* this is *almost* the same as fd6_cache_flush().. which I guess
 * could be re-worked to be something a bit more generic w/ param
 * indicating what needs to be flushed.. although that would mean
 * figuring out which events trigger what state to flush..
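 *
 * Roughly, the sequence emitted below is: write an RB_DONE_TS fence and
 * CP_WAIT_REG_MEM on it (so prior rendering has landed), flush the CCU
 * color/depth caches, then a CACHE_FLUSH_TS fence plus WFI and
 * CP_WAIT_MEM_GTE so later work observes the flushed data.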
 */
static void
fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   struct fd_ringbuffer *ring = batch->draw;
   unsigned seqno;

   fd_batch_needs_flush(batch);

   seqno = fd6_event_write(batch, ring, RB_DONE_TS, true);

   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                     CP_WAIT_REG_MEM_0_POLL_MEMORY);
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
   OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true);
   fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);

   seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
   fd_wfi(batch, ring);

   fd6_event_write(batch, ring, 0x31, false);

   OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
   OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));

   fd_batch_unlock_submit(batch);
   fd_batch_reference(&batch, NULL);
}

void
fd6_emit_init_screen(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);
   screen->emit_ib = fd6_emit_ib;
   screen->mem_to_mem = fd6_mem_to_mem;
}

void
fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);
   ctx->framebuffer_barrier = fd6_framebuffer_barrier;
}