1/* 2 * Copyright (C) 2018 Alyssa Rosenzweig 3 * Copyright (C) 2020 Collabora Ltd. 4 * Copyright © 2017 Intel Corporation 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 24 */ 25 26#include "util/macros.h" 27#include "util/u_prim.h" 28#include "util/u_vbuf.h" 29#include "util/u_helpers.h" 30#include "util/u_draw.h" 31#include "util/u_memory.h" 32#include "pipe/p_defines.h" 33#include "pipe/p_state.h" 34#include "gallium/auxiliary/util/u_blend.h" 35 36#include "panfrost-quirks.h" 37#include "genxml/gen_macros.h" 38 39#include "pan_pool.h" 40#include "pan_bo.h" 41#include "pan_blend.h" 42#include "pan_context.h" 43#include "pan_job.h" 44#include "pan_shader.h" 45#include "pan_texture.h" 46#include "pan_util.h" 47#include "pan_indirect_draw.h" 48#include "pan_indirect_dispatch.h" 49#include "pan_blitter.h" 50 51struct panfrost_rasterizer { 52 struct pipe_rasterizer_state base; 53 54 /* Partially packed RSD words */ 55 struct mali_multisample_misc_packed multisample; 56 struct mali_stencil_mask_misc_packed stencil_misc; 57}; 58 59struct panfrost_zsa_state { 60 struct pipe_depth_stencil_alpha_state base; 61 62 /* Is any depth, stencil, or alpha testing enabled? */ 63 bool enabled; 64 65 /* Mask of PIPE_CLEAR_{DEPTH,STENCIL} written */ 66 unsigned draws; 67 68 /* Prepacked words from the RSD */ 69 struct mali_multisample_misc_packed rsd_depth; 70 struct mali_stencil_mask_misc_packed rsd_stencil; 71 struct mali_stencil_packed stencil_front, stencil_back; 72}; 73 74struct panfrost_sampler_state { 75 struct pipe_sampler_state base; 76 struct mali_sampler_packed hw; 77}; 78 79/* Misnomer: Sampler view corresponds to textures, not samplers */ 80 81struct panfrost_sampler_view { 82 struct pipe_sampler_view base; 83 struct panfrost_pool_ref state; 84 struct mali_texture_packed bifrost_descriptor; 85 mali_ptr texture_bo; 86 uint64_t modifier; 87}; 88 89/* Statically assert that PIPE_* enums match the hardware enums. 90 * (As long as they match, we don't need to translate them.) 91 */ 92UNUSED static void 93pan_pipe_asserts() 94{ 95#define PIPE_ASSERT(x) STATIC_ASSERT((int)x) 96 97 /* Compare functions are natural in both Gallium and Mali */ 98 PIPE_ASSERT(PIPE_FUNC_NEVER == MALI_FUNC_NEVER); 99 PIPE_ASSERT(PIPE_FUNC_LESS == MALI_FUNC_LESS); 100 PIPE_ASSERT(PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL); 101 PIPE_ASSERT(PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL); 102 PIPE_ASSERT(PIPE_FUNC_GREATER == MALI_FUNC_GREATER); 103 PIPE_ASSERT(PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL); 104 PIPE_ASSERT(PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL); 105 PIPE_ASSERT(PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS); 106} 107 108static inline enum mali_sample_pattern 109panfrost_sample_pattern(unsigned samples) 110{ 111 switch (samples) { 112 case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED; 113 case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID; 114 case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID; 115 case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID; 116 default: unreachable("Unsupported sample count"); 117 } 118} 119 120static unsigned 121translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest) 122{ 123 /* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use 124 * CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for 125 * nearest filtering, so use CLAMP_TO_EDGE in that case. */ 126 127 switch (w) { 128 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT; 129 case PIPE_TEX_WRAP_CLAMP: 130 return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE : 131#if PAN_ARCH <= 5 132 MALI_WRAP_MODE_CLAMP; 133#else 134 MALI_WRAP_MODE_CLAMP_TO_BORDER; 135#endif 136 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE; 137 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER; 138 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT; 139 case PIPE_TEX_WRAP_MIRROR_CLAMP: 140 return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE : 141#if PAN_ARCH <= 5 142 MALI_WRAP_MODE_MIRRORED_CLAMP; 143#else 144 MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; 145#endif 146 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE; 147 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; 148 default: unreachable("Invalid wrap"); 149 } 150} 151 152/* The hardware compares in the wrong order order, so we have to flip before 153 * encoding. Yes, really. */ 154 155static enum mali_func 156panfrost_sampler_compare_func(const struct pipe_sampler_state *cso) 157{ 158 return !cso->compare_mode ? MALI_FUNC_NEVER : 159 panfrost_flip_compare_func((enum mali_func) cso->compare_func); 160} 161 162static enum mali_mipmap_mode 163pan_pipe_to_mipmode(enum pipe_tex_mipfilter f) 164{ 165 switch (f) { 166 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST; 167 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR; 168#if PAN_ARCH >= 6 169 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE; 170#else 171 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST; 172#endif 173 default: unreachable("Invalid"); 174 } 175} 176 177 178static void * 179panfrost_create_sampler_state( 180 struct pipe_context *pctx, 181 const struct pipe_sampler_state *cso) 182{ 183 struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state); 184 so->base = *cso; 185 186 bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST; 187 188 pan_pack(&so->hw, SAMPLER, cfg) { 189 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST; 190 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST; 191 192 cfg.normalized_coordinates = cso->normalized_coords; 193 cfg.lod_bias = FIXED_16(cso->lod_bias, true); 194 cfg.minimum_lod = FIXED_16(cso->min_lod, false); 195 cfg.maximum_lod = FIXED_16(cso->max_lod, false); 196 197 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest); 198 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest); 199 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest); 200 201 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter); 202 cfg.compare_function = panfrost_sampler_compare_func(cso); 203 cfg.seamless_cube_map = cso->seamless_cube_map; 204 205 cfg.border_color_r = cso->border_color.ui[0]; 206 cfg.border_color_g = cso->border_color.ui[1]; 207 cfg.border_color_b = cso->border_color.ui[2]; 208 cfg.border_color_a = cso->border_color.ui[3]; 209 210#if PAN_ARCH >= 6 211 if (cso->max_anisotropy > 1) { 212 cfg.maximum_anisotropy = cso->max_anisotropy; 213 cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC; 214 } 215#else 216 /* Emulate disabled mipmapping by clamping the LOD as tight as 217 * possible (from 0 to epsilon = 1/256) */ 218 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) 219 cfg.maximum_lod = cfg.minimum_lod + 1; 220#endif 221 } 222 223 return so; 224} 225 226static bool 227panfrost_fs_required( 228 struct panfrost_shader_state *fs, 229 struct panfrost_blend_state *blend, 230 struct pipe_framebuffer_state *state, 231 const struct panfrost_zsa_state *zsa) 232{ 233 /* If we generally have side effects. This inclues use of discard, 234 * which can affect the results of an occlusion query. */ 235 if (fs->info.fs.sidefx) 236 return true; 237 238 /* Using an empty FS requires early-z to be enabled, but alpha test 239 * needs it disabled */ 240 if ((enum mali_func) zsa->base.alpha_func != MALI_FUNC_ALWAYS) 241 return true; 242 243 /* If colour is written we need to execute */ 244 for (unsigned i = 0; i < state->nr_cbufs; ++i) { 245 if (state->cbufs[i] && !blend->info[i].no_colour) 246 return true; 247 } 248 249 /* If depth is written and not implied we need to execute. 250 * TODO: Predicate on Z/S writes being enabled */ 251 return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil); 252} 253 254#if PAN_ARCH >= 5 255UNUSED static uint16_t 256pack_blend_constant(enum pipe_format format, float cons) 257{ 258 const struct util_format_description *format_desc = 259 util_format_description(format); 260 261 unsigned chan_size = 0; 262 263 for (unsigned i = 0; i < format_desc->nr_channels; i++) 264 chan_size = MAX2(format_desc->channel[0].size, chan_size); 265 266 uint16_t unorm = (cons * ((1 << chan_size) - 1)); 267 return unorm << (16 - chan_size); 268} 269 270static void 271panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders) 272{ 273 unsigned rt_count = batch->key.nr_cbufs; 274 struct panfrost_context *ctx = batch->ctx; 275 const struct panfrost_blend_state *so = ctx->blend; 276 bool dithered = so->base.dither; 277 278 /* Always have at least one render target for depth-only passes */ 279 for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) { 280 struct mali_blend_packed *packed = rts + (i * pan_size(BLEND)); 281 282 /* Disable blending for unbacked render targets */ 283 if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) { 284 pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) { 285 cfg.enable = false; 286#if PAN_ARCH >= 6 287 cfg.internal.mode = MALI_BLEND_MODE_OFF; 288#endif 289 } 290 291 continue; 292 } 293 294 struct pan_blend_info info = so->info[i]; 295 enum pipe_format format = batch->key.cbufs[i]->format; 296 float cons = pan_blend_get_constant(info.constant_mask, 297 ctx->blend_color.color); 298 299 /* Word 0: Flags and constant */ 300 pan_pack(packed, BLEND, cfg) { 301 cfg.srgb = util_format_is_srgb(format); 302 cfg.load_destination = info.load_dest; 303 cfg.round_to_fb_precision = !dithered; 304 cfg.alpha_to_one = ctx->blend->base.alpha_to_one; 305#if PAN_ARCH >= 6 306 cfg.constant = pack_blend_constant(format, cons); 307#else 308 cfg.blend_shader = (blend_shaders[i] != 0); 309 310 if (blend_shaders[i]) 311 cfg.shader_pc = blend_shaders[i]; 312 else 313 cfg.constant = cons; 314#endif 315 } 316 317 if (!blend_shaders[i]) { 318 /* Word 1: Blend Equation */ 319 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); 320 packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i]; 321 } 322 323#if PAN_ARCH >= 6 324 const struct panfrost_device *dev = pan_device(ctx->base.screen); 325 struct panfrost_shader_state *fs = 326 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); 327 328 /* Words 2 and 3: Internal blend */ 329 if (blend_shaders[i]) { 330 /* The blend shader's address needs to be at 331 * the same top 32 bit as the fragment shader. 332 * TODO: Ensure that's always the case. 333 */ 334 assert(!fs->bin.bo || 335 (blend_shaders[i] & (0xffffffffull << 32)) == 336 (fs->bin.gpu & (0xffffffffull << 32))); 337 338 unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; 339 assert(!(ret_offset & 0x7)); 340 341 pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { 342 cfg.mode = MALI_BLEND_MODE_SHADER; 343 cfg.shader.pc = (u32) blend_shaders[i]; 344 cfg.shader.return_value = ret_offset ? 345 fs->bin.gpu + ret_offset : 0; 346 } 347 } else { 348 pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { 349 cfg.mode = info.opaque ? 350 MALI_BLEND_MODE_OPAQUE : 351 MALI_BLEND_MODE_FIXED_FUNCTION; 352 353 /* If we want the conversion to work properly, 354 * num_comps must be set to 4 355 */ 356 cfg.fixed_function.num_comps = 4; 357 cfg.fixed_function.conversion.memory_format = 358 panfrost_format_to_bifrost_blend(dev, format, dithered); 359 cfg.fixed_function.conversion.register_format = 360 fs->info.bifrost.blend[i].format; 361 cfg.fixed_function.rt = i; 362 } 363 } 364#endif 365 } 366 367 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { 368 if (!so->info[i].no_colour && batch->key.cbufs[i]) { 369 batch->draws |= (PIPE_CLEAR_COLOR0 << i); 370 batch->resolve |= (PIPE_CLEAR_COLOR0 << i); 371 } 372 } 373} 374#endif 375 376/* Construct a partial RSD corresponding to no executed fragment shader, and 377 * merge with the existing partial RSD. */ 378 379static void 380pan_merge_empty_fs(struct mali_renderer_state_packed *rsd) 381{ 382 struct mali_renderer_state_packed empty_rsd; 383 384 pan_pack(&empty_rsd, RENDERER_STATE, cfg) { 385#if PAN_ARCH >= 6 386 cfg.properties.shader_modifies_coverage = true; 387 cfg.properties.allow_forward_pixel_to_kill = true; 388 cfg.properties.allow_forward_pixel_to_be_killed = true; 389 cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; 390#else 391 cfg.shader.shader = 0x1; 392 cfg.properties.work_register_count = 1; 393 cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; 394 cfg.properties.force_early_z = true; 395#endif 396 } 397 398 pan_merge((*rsd), empty_rsd, RENDERER_STATE); 399} 400 401static void 402panfrost_prepare_fs_state(struct panfrost_context *ctx, 403 mali_ptr *blend_shaders, 404 struct mali_renderer_state_packed *rsd) 405{ 406 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; 407 const struct panfrost_zsa_state *zsa = ctx->depth_stencil; 408 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); 409 struct panfrost_blend_state *so = ctx->blend; 410 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; 411 bool msaa = rast->multisample; 412 413 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; 414 415 bool has_blend_shader = false; 416 417 for (unsigned c = 0; c < rt_count; ++c) 418 has_blend_shader |= (blend_shaders[c] != 0); 419 420 pan_pack(rsd, RENDERER_STATE, cfg) { 421 if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) { 422#if PAN_ARCH >= 6 423 /* Track if any colour buffer is reused across draws, either 424 * from reading it directly, or from failing to write it */ 425 unsigned rt_mask = ctx->fb_rt_mask; 426 uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0); 427 bool blend_reads_dest = (so->load_dest_mask & rt_mask); 428 429 cfg.properties.allow_forward_pixel_to_kill = 430 fs->info.fs.can_fpk && 431 !(rt_mask & ~rt_written) && 432 !alpha_to_coverage && 433 !blend_reads_dest; 434#else 435 cfg.properties.force_early_z = 436 fs->info.fs.can_early_z && !alpha_to_coverage && 437 ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS); 438 439 /* TODO: Reduce this limit? */ 440 if (has_blend_shader) 441 cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8); 442 else 443 cfg.properties.work_register_count = fs->info.work_reg_count; 444 445 /* Hardware quirks around early-zs forcing without a 446 * depth buffer. Note this breaks occlusion queries. */ 447 bool has_oq = ctx->occlusion_query && ctx->active_queries; 448 bool force_ez_with_discard = !zsa->enabled && !has_oq; 449 450 cfg.properties.shader_reads_tilebuffer = 451 force_ez_with_discard && fs->info.fs.can_discard; 452 cfg.properties.shader_contains_discard = 453 !force_ez_with_discard && fs->info.fs.can_discard; 454#endif 455 } 456 457#if PAN_ARCH == 4 458 if (rt_count > 0) { 459 cfg.multisample_misc.load_destination = so->info[0].load_dest; 460 cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0); 461 cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour; 462 cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format); 463 cfg.stencil_mask_misc.dither_disable = !so->base.dither; 464 cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one; 465 466 if (blend_shaders[0]) { 467 cfg.blend_shader = blend_shaders[0]; 468 } else { 469 cfg.blend_constant = pan_blend_get_constant( 470 so->info[0].constant_mask, 471 ctx->blend_color.color); 472 } 473 } else { 474 /* If there is no colour buffer, leaving fields default is 475 * fine, except for blending which is nonnullable */ 476 cfg.blend_equation.color_mask = 0xf; 477 cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; 478 cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; 479 cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; 480 cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; 481 cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; 482 cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; 483 } 484#elif PAN_ARCH == 5 485 /* Workaround */ 486 cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count); 487#endif 488 489 cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF; 490 491 cfg.multisample_misc.evaluate_per_sample = 492 msaa && (ctx->min_samples > 1); 493 494#if PAN_ARCH >= 6 495 /* MSAA blend shaders need to pass their sample ID to 496 * LD_TILE/ST_TILE, so we must preload it. Additionally, we 497 * need per-sample shading for the blend shader, accomplished 498 * by forcing per-sample shading for the whole program. */ 499 500 if (msaa && has_blend_shader) { 501 cfg.multisample_misc.evaluate_per_sample = true; 502 cfg.preload.fragment.sample_mask_id = true; 503 } 504#endif 505 506 cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage; 507 cfg.depth_units = rast->offset_units * 2.0f; 508 cfg.depth_factor = rast->offset_scale; 509 510 bool back_enab = zsa->base.stencil[1].enabled; 511 cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0]; 512 cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0]; 513 514#if PAN_ARCH <= 5 515 /* v6+ fits register preload here, no alpha testing */ 516 cfg.alpha_reference = zsa->base.alpha_ref_value; 517#endif 518 } 519} 520 521static void 522panfrost_emit_frag_shader(struct panfrost_context *ctx, 523 struct mali_renderer_state_packed *fragmeta, 524 mali_ptr *blend_shaders) 525{ 526 const struct panfrost_zsa_state *zsa = ctx->depth_stencil; 527 const struct panfrost_rasterizer *rast = ctx->rasterizer; 528 struct panfrost_shader_state *fs = 529 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); 530 531 /* We need to merge several several partial renderer state descriptors, 532 * so stage to temporary storage rather than reading back write-combine 533 * memory, which will trash performance. */ 534 struct mali_renderer_state_packed rsd; 535 panfrost_prepare_fs_state(ctx, blend_shaders, &rsd); 536 537#if PAN_ARCH == 4 538 if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) { 539 /* Word 14: SFBD Blend Equation */ 540 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); 541 rsd.opaque[14] = ctx->blend->equation[0]; 542 } 543#endif 544 545 /* Merge with CSO state and upload */ 546 if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) { 547 struct mali_renderer_state_packed *partial_rsd = 548 (struct mali_renderer_state_packed *)&fs->partial_rsd; 549 STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd)); 550 pan_merge(rsd, *partial_rsd, RENDERER_STATE); 551 } else { 552 pan_merge_empty_fs(&rsd); 553 } 554 555 /* Word 8, 9 Misc state */ 556 rsd.opaque[8] |= zsa->rsd_depth.opaque[0] 557 | rast->multisample.opaque[0]; 558 559 rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] 560 | rast->stencil_misc.opaque[0]; 561 562 /* Word 10, 11 Stencil Front and Back */ 563 rsd.opaque[10] |= zsa->stencil_front.opaque[0]; 564 rsd.opaque[11] |= zsa->stencil_back.opaque[0]; 565 566 memcpy(fragmeta, &rsd, sizeof(rsd)); 567} 568 569static mali_ptr 570panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage) 571{ 572 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage); 573 574 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX); 575 panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX); 576 577 return ss->state.gpu; 578} 579 580static mali_ptr 581panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) 582{ 583 struct panfrost_context *ctx = batch->ctx; 584 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); 585 586 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT); 587 588 struct panfrost_ptr xfer; 589 590#if PAN_ARCH == 4 591 xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE); 592#else 593 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); 594 595 xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base, 596 PAN_DESC(RENDERER_STATE), 597 PAN_DESC_ARRAY(rt_count, BLEND)); 598#endif 599 600 mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 }; 601 unsigned shader_offset = 0; 602 struct panfrost_bo *shader_bo = NULL; 603 604 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c) { 605 if (ctx->pipe_framebuffer.cbufs[c]) { 606 blend_shaders[c] = panfrost_get_blend(batch, 607 c, &shader_bo, &shader_offset); 608 } 609 } 610 611 panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders); 612 613#if PAN_ARCH >= 5 614 panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders); 615#else 616 batch->draws |= PIPE_CLEAR_COLOR0; 617 batch->resolve |= PIPE_CLEAR_COLOR0; 618#endif 619 620 if (ctx->depth_stencil->base.depth_enabled) 621 batch->read |= PIPE_CLEAR_DEPTH; 622 623 if (ctx->depth_stencil->base.stencil[0].enabled) 624 batch->read |= PIPE_CLEAR_STENCIL; 625 626 return xfer.gpu; 627} 628 629static mali_ptr 630panfrost_emit_viewport(struct panfrost_batch *batch) 631{ 632 struct panfrost_context *ctx = batch->ctx; 633 const struct pipe_viewport_state *vp = &ctx->pipe_viewport; 634 const struct pipe_scissor_state *ss = &ctx->scissor; 635 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; 636 637 /* Derive min/max from translate/scale. Note since |x| >= 0 by 638 * definition, we have that -|x| <= |x| hence translate - |scale| <= 639 * translate + |scale|, so the ordering is correct here. */ 640 float vp_minx = vp->translate[0] - fabsf(vp->scale[0]); 641 float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]); 642 float vp_miny = vp->translate[1] - fabsf(vp->scale[1]); 643 float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]); 644 float minz = (vp->translate[2] - fabsf(vp->scale[2])); 645 float maxz = (vp->translate[2] + fabsf(vp->scale[2])); 646 647 /* Scissor to the intersection of viewport and to the scissor, clamped 648 * to the framebuffer */ 649 650 unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0)); 651 unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0)); 652 unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0)); 653 unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0)); 654 655 if (ss && rast->scissor) { 656 minx = MAX2(ss->minx, minx); 657 miny = MAX2(ss->miny, miny); 658 maxx = MIN2(ss->maxx, maxx); 659 maxy = MIN2(ss->maxy, maxy); 660 } 661 662 /* Set the range to [1, 1) so max values don't wrap round */ 663 if (maxx == 0 || maxy == 0) 664 maxx = maxy = minx = miny = 1; 665 666 struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT); 667 668 pan_pack(T.cpu, VIEWPORT, cfg) { 669 /* [minx, maxx) and [miny, maxy) are exclusive ranges, but 670 * these are inclusive */ 671 cfg.scissor_minimum_x = minx; 672 cfg.scissor_minimum_y = miny; 673 cfg.scissor_maximum_x = maxx - 1; 674 cfg.scissor_maximum_y = maxy - 1; 675 676 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY; 677 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY; 678 } 679 680 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy); 681 batch->scissor_culls_everything = (minx >= maxx || miny >= maxy); 682 683 return T.gpu; 684} 685 686static mali_ptr 687panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch, 688 enum pipe_shader_type st, 689 struct panfrost_constant_buffer *buf, 690 unsigned index) 691{ 692 struct pipe_constant_buffer *cb = &buf->cb[index]; 693 struct panfrost_resource *rsrc = pan_resource(cb->buffer); 694 695 if (rsrc) { 696 panfrost_batch_read_rsrc(batch, rsrc, st); 697 698 /* Alignment gauranteed by 699 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */ 700 return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset; 701 } else if (cb->user_buffer) { 702 return pan_pool_upload_aligned(&batch->pool.base, 703 cb->user_buffer + 704 cb->buffer_offset, 705 cb->buffer_size, 16); 706 } else { 707 unreachable("No constant buffer"); 708 } 709} 710 711struct sysval_uniform { 712 union { 713 float f[4]; 714 int32_t i[4]; 715 uint32_t u[4]; 716 uint64_t du[2]; 717 }; 718}; 719 720static void 721panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch, 722 struct sysval_uniform *uniform) 723{ 724 struct panfrost_context *ctx = batch->ctx; 725 const struct pipe_viewport_state *vp = &ctx->pipe_viewport; 726 727 uniform->f[0] = vp->scale[0]; 728 uniform->f[1] = vp->scale[1]; 729 uniform->f[2] = vp->scale[2]; 730} 731 732static void 733panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch, 734 struct sysval_uniform *uniform) 735{ 736 struct panfrost_context *ctx = batch->ctx; 737 const struct pipe_viewport_state *vp = &ctx->pipe_viewport; 738 739 uniform->f[0] = vp->translate[0]; 740 uniform->f[1] = vp->translate[1]; 741 uniform->f[2] = vp->translate[2]; 742} 743 744static void panfrost_upload_txs_sysval(struct panfrost_batch *batch, 745 enum pipe_shader_type st, 746 unsigned int sysvalid, 747 struct sysval_uniform *uniform) 748{ 749 struct panfrost_context *ctx = batch->ctx; 750 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); 751 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); 752 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); 753 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base; 754 755 assert(dim); 756 757 if (tex->target == PIPE_BUFFER) { 758 assert(dim == 1); 759 uniform->i[0] = 760 tex->u.buf.size / util_format_get_blocksize(tex->format); 761 return; 762 } 763 764 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level); 765 766 if (dim > 1) 767 uniform->i[1] = u_minify(tex->texture->height0, 768 tex->u.tex.first_level); 769 770 if (dim > 2) 771 uniform->i[2] = u_minify(tex->texture->depth0, 772 tex->u.tex.first_level); 773 774 if (is_array) 775 uniform->i[dim] = tex->texture->array_size; 776} 777 778static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch, 779 enum pipe_shader_type st, 780 unsigned int sysvalid, 781 struct sysval_uniform *uniform) 782{ 783 struct panfrost_context *ctx = batch->ctx; 784 unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); 785 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); 786 unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); 787 788 assert(dim && dim < 4); 789 790 struct pipe_image_view *image = &ctx->images[st][idx]; 791 792 if (image->resource->target == PIPE_BUFFER) { 793 unsigned blocksize = util_format_get_blocksize(image->format); 794 uniform->i[0] = image->resource->width0 / blocksize; 795 return; 796 } 797 798 uniform->i[0] = u_minify(image->resource->width0, 799 image->u.tex.level); 800 801 if (dim > 1) 802 uniform->i[1] = u_minify(image->resource->height0, 803 image->u.tex.level); 804 805 if (dim > 2) 806 uniform->i[2] = u_minify(image->resource->depth0, 807 image->u.tex.level); 808 809 if (is_array) 810 uniform->i[dim] = image->resource->array_size; 811} 812 813static void 814panfrost_upload_ssbo_sysval(struct panfrost_batch *batch, 815 enum pipe_shader_type st, 816 unsigned ssbo_id, 817 struct sysval_uniform *uniform) 818{ 819 struct panfrost_context *ctx = batch->ctx; 820 821 assert(ctx->ssbo_mask[st] & (1 << ssbo_id)); 822 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id]; 823 824 /* Compute address */ 825 struct panfrost_resource *rsrc = pan_resource(sb.buffer); 826 struct panfrost_bo *bo = rsrc->image.data.bo; 827 828 panfrost_batch_write_rsrc(batch, rsrc, st); 829 830 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, 831 sb.buffer_offset, sb.buffer_size); 832 833 /* Upload address and size as sysval */ 834 uniform->du[0] = bo->ptr.gpu + sb.buffer_offset; 835 uniform->u[2] = sb.buffer_size; 836} 837 838static void 839panfrost_upload_sampler_sysval(struct panfrost_batch *batch, 840 enum pipe_shader_type st, 841 unsigned samp_idx, 842 struct sysval_uniform *uniform) 843{ 844 struct panfrost_context *ctx = batch->ctx; 845 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base; 846 847 uniform->f[0] = sampl->min_lod; 848 uniform->f[1] = sampl->max_lod; 849 uniform->f[2] = sampl->lod_bias; 850 851 /* Even without any errata, Midgard represents "no mipmapping" as 852 * fixing the LOD with the clamps; keep behaviour consistent. c.f. 853 * panfrost_create_sampler_state which also explains our choice of 854 * epsilon value (again to keep behaviour consistent) */ 855 856 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) 857 uniform->f[1] = uniform->f[0] + (1.0/256.0); 858} 859 860static void 861panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch, 862 struct sysval_uniform *uniform) 863{ 864 struct panfrost_context *ctx = batch->ctx; 865 866 uniform->u[0] = ctx->compute_grid->grid[0]; 867 uniform->u[1] = ctx->compute_grid->grid[1]; 868 uniform->u[2] = ctx->compute_grid->grid[2]; 869} 870 871static void 872panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch, 873 struct sysval_uniform *uniform) 874{ 875 struct panfrost_context *ctx = batch->ctx; 876 877 uniform->u[0] = ctx->compute_grid->block[0]; 878 uniform->u[1] = ctx->compute_grid->block[1]; 879 uniform->u[2] = ctx->compute_grid->block[2]; 880} 881 882static void 883panfrost_upload_work_dim_sysval(struct panfrost_batch *batch, 884 struct sysval_uniform *uniform) 885{ 886 struct panfrost_context *ctx = batch->ctx; 887 888 uniform->u[0] = ctx->compute_grid->work_dim; 889} 890 891/* Sample positions are pushed in a Bifrost specific format on Bifrost. On 892 * Midgard, we emulate the Bifrost path with some extra arithmetic in the 893 * shader, to keep the code as unified as possible. */ 894 895static void 896panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch, 897 struct sysval_uniform *uniform) 898{ 899 struct panfrost_context *ctx = batch->ctx; 900 struct panfrost_device *dev = pan_device(ctx->base.screen); 901 902 unsigned samples = util_framebuffer_get_num_samples(&batch->key); 903 uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples)); 904} 905 906static void 907panfrost_upload_multisampled_sysval(struct panfrost_batch *batch, 908 struct sysval_uniform *uniform) 909{ 910 unsigned samples = util_framebuffer_get_num_samples(&batch->key); 911 uniform->u[0] = samples > 1; 912} 913 914#if PAN_ARCH >= 6 915static void 916panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch, 917 unsigned size_and_rt, struct sysval_uniform *uniform) 918{ 919 struct panfrost_context *ctx = batch->ctx; 920 struct panfrost_device *dev = pan_device(ctx->base.screen); 921 unsigned rt = size_and_rt & 0xF; 922 unsigned size = size_and_rt >> 4; 923 924 if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) { 925 enum pipe_format format = batch->key.cbufs[rt]->format; 926 uniform->u[0] = 927 GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32; 928 } else { 929 pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg) 930 cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw; 931 } 932} 933#endif 934 935static void 936panfrost_upload_sysvals(struct panfrost_batch *batch, 937 const struct panfrost_ptr *ptr, 938 struct panfrost_shader_state *ss, 939 enum pipe_shader_type st) 940{ 941 struct sysval_uniform *uniforms = ptr->cpu; 942 943 for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { 944 int sysval = ss->info.sysvals.sysvals[i]; 945 946 switch (PAN_SYSVAL_TYPE(sysval)) { 947 case PAN_SYSVAL_VIEWPORT_SCALE: 948 panfrost_upload_viewport_scale_sysval(batch, 949 &uniforms[i]); 950 break; 951 case PAN_SYSVAL_VIEWPORT_OFFSET: 952 panfrost_upload_viewport_offset_sysval(batch, 953 &uniforms[i]); 954 break; 955 case PAN_SYSVAL_TEXTURE_SIZE: 956 panfrost_upload_txs_sysval(batch, st, 957 PAN_SYSVAL_ID(sysval), 958 &uniforms[i]); 959 break; 960 case PAN_SYSVAL_SSBO: 961 panfrost_upload_ssbo_sysval(batch, st, 962 PAN_SYSVAL_ID(sysval), 963 &uniforms[i]); 964 break; 965 case PAN_SYSVAL_NUM_WORK_GROUPS: 966 for (unsigned j = 0; j < 3; j++) { 967 batch->num_wg_sysval[j] = 968 ptr->gpu + (i * sizeof(*uniforms)) + (j * 4); 969 } 970 panfrost_upload_num_work_groups_sysval(batch, 971 &uniforms[i]); 972 break; 973 case PAN_SYSVAL_LOCAL_GROUP_SIZE: 974 panfrost_upload_local_group_size_sysval(batch, 975 &uniforms[i]); 976 break; 977 case PAN_SYSVAL_WORK_DIM: 978 panfrost_upload_work_dim_sysval(batch, 979 &uniforms[i]); 980 break; 981 case PAN_SYSVAL_SAMPLER: 982 panfrost_upload_sampler_sysval(batch, st, 983 PAN_SYSVAL_ID(sysval), 984 &uniforms[i]); 985 break; 986 case PAN_SYSVAL_IMAGE_SIZE: 987 panfrost_upload_image_size_sysval(batch, st, 988 PAN_SYSVAL_ID(sysval), 989 &uniforms[i]); 990 break; 991 case PAN_SYSVAL_SAMPLE_POSITIONS: 992 panfrost_upload_sample_positions_sysval(batch, 993 &uniforms[i]); 994 break; 995 case PAN_SYSVAL_MULTISAMPLED: 996 panfrost_upload_multisampled_sysval(batch, 997 &uniforms[i]); 998 break; 999#if PAN_ARCH >= 6 1000 case PAN_SYSVAL_RT_CONVERSION: 1001 panfrost_upload_rt_conversion_sysval(batch, 1002 PAN_SYSVAL_ID(sysval), &uniforms[i]); 1003 break; 1004#endif 1005 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: 1006 batch->ctx->first_vertex_sysval_ptr = 1007 ptr->gpu + (i * sizeof(*uniforms)); 1008 batch->ctx->base_vertex_sysval_ptr = 1009 batch->ctx->first_vertex_sysval_ptr + 4; 1010 batch->ctx->base_instance_sysval_ptr = 1011 batch->ctx->first_vertex_sysval_ptr + 8; 1012 1013 uniforms[i].u[0] = batch->ctx->offset_start; 1014 uniforms[i].u[1] = batch->ctx->base_vertex; 1015 uniforms[i].u[2] = batch->ctx->base_instance; 1016 break; 1017 case PAN_SYSVAL_DRAWID: 1018 uniforms[i].u[0] = batch->ctx->drawid; 1019 break; 1020 default: 1021 assert(0); 1022 } 1023 } 1024} 1025 1026static const void * 1027panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx, 1028 struct panfrost_constant_buffer *buf, 1029 unsigned index) 1030{ 1031 struct pipe_constant_buffer *cb = &buf->cb[index]; 1032 struct panfrost_resource *rsrc = pan_resource(cb->buffer); 1033 1034 if (rsrc) { 1035 panfrost_bo_mmap(rsrc->image.data.bo); 1036 panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping"); 1037 panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false); 1038 1039 return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset; 1040 } else if (cb->user_buffer) { 1041 return cb->user_buffer + cb->buffer_offset; 1042 } else 1043 unreachable("No constant buffer"); 1044} 1045 1046static mali_ptr 1047panfrost_emit_const_buf(struct panfrost_batch *batch, 1048 enum pipe_shader_type stage, 1049 mali_ptr *push_constants) 1050{ 1051 struct panfrost_context *ctx = batch->ctx; 1052 struct panfrost_shader_variants *all = ctx->shader[stage]; 1053 1054 if (!all) 1055 return 0; 1056 1057 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage]; 1058 struct panfrost_shader_state *ss = &all->variants[all->active_variant]; 1059 1060 /* Allocate room for the sysval and the uniforms */ 1061 size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; 1062 struct panfrost_ptr transfer = 1063 pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16); 1064 1065 /* Upload sysvals requested by the shader */ 1066 panfrost_upload_sysvals(batch, &transfer, ss, stage); 1067 1068 /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ 1069 struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage); 1070 unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0); 1071 unsigned sysval_ubo = sys_size ? ubo_count : ~0; 1072 1073 struct panfrost_ptr ubos = 1074 pan_pool_alloc_desc_array(&batch->pool.base, 1075 ubo_count + 1, 1076 UNIFORM_BUFFER); 1077 1078 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu; 1079 1080 /* Upload sysval as a final UBO */ 1081 1082 if (sys_size) { 1083 pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) { 1084 cfg.entries = DIV_ROUND_UP(sys_size, 16); 1085 cfg.pointer = transfer.gpu; 1086 } 1087 } 1088 1089 /* The rest are honest-to-goodness UBOs */ 1090 1091 u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) { 1092 size_t usz = buf->cb[ubo].buffer_size; 1093 1094 if (usz == 0) { 1095 ubo_ptr[ubo] = 0; 1096 continue; 1097 } 1098 1099 /* Issue (57) for the ARB_uniform_buffer_object spec says that 1100 * the buffer can be larger than the uniform data inside it, 1101 * so clamp ubo size to what hardware supports. */ 1102 1103 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) { 1104 cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12); 1105 cfg.pointer = panfrost_map_constant_buffer_gpu(batch, 1106 stage, buf, ubo); 1107 } 1108 } 1109 1110 if (ss->info.push.count == 0) 1111 return ubos.gpu; 1112 1113 /* Copy push constants required by the shader */ 1114 struct panfrost_ptr push_transfer = 1115 pan_pool_alloc_aligned(&batch->pool.base, 1116 ss->info.push.count * 4, 16); 1117 1118 uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; 1119 *push_constants = push_transfer.gpu; 1120 1121 for (unsigned i = 0; i < ss->info.push.count; ++i) { 1122 struct panfrost_ubo_word src = ss->info.push.words[i]; 1123 1124 if (src.ubo == sysval_ubo) { 1125 unsigned sysval_idx = src.offset / 16; 1126 unsigned sysval_comp = (src.offset % 16) / 4; 1127 unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]); 1128 mali_ptr ptr = push_transfer.gpu + (4 * i); 1129 1130 switch (sysval_type) { 1131 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: 1132 switch (sysval_comp) { 1133 case 0: 1134 batch->ctx->first_vertex_sysval_ptr = ptr; 1135 break; 1136 case 1: 1137 batch->ctx->base_vertex_sysval_ptr = ptr; 1138 break; 1139 case 2: 1140 batch->ctx->base_instance_sysval_ptr = ptr; 1141 break; 1142 case 3: 1143 /* Spurious (Midgard doesn't pack) */ 1144 break; 1145 default: 1146 unreachable("Invalid vertex/instance offset component\n"); 1147 } 1148 break; 1149 1150 case PAN_SYSVAL_NUM_WORK_GROUPS: 1151 batch->num_wg_sysval[sysval_comp] = ptr; 1152 break; 1153 1154 default: 1155 break; 1156 } 1157 } 1158 /* Map the UBO, this should be cheap. However this is reading 1159 * from write-combine memory which is _very_ slow. It might pay 1160 * off to upload sysvals to a staging buffer on the CPU on the 1161 * assumption sysvals will get pushed (TODO) */ 1162 1163 const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : 1164 panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); 1165 1166 /* TODO: Is there any benefit to combining ranges */ 1167 memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); 1168 } 1169 1170 return ubos.gpu; 1171} 1172 1173static mali_ptr 1174panfrost_emit_shared_memory(struct panfrost_batch *batch, 1175 const struct pipe_grid_info *info) 1176{ 1177 struct panfrost_context *ctx = batch->ctx; 1178 struct panfrost_device *dev = pan_device(ctx->base.screen); 1179 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE]; 1180 struct panfrost_shader_state *ss = &all->variants[all->active_variant]; 1181 struct panfrost_ptr t = 1182 pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); 1183 1184 pan_pack(t.cpu, LOCAL_STORAGE, ls) { 1185 unsigned wls_single_size = 1186 util_next_power_of_two(MAX2(ss->info.wls_size, 128)); 1187 1188 if (ss->info.wls_size) { 1189 ls.wls_instances = 1190 util_next_power_of_two(info->grid[0]) * 1191 util_next_power_of_two(info->grid[1]) * 1192 util_next_power_of_two(info->grid[2]); 1193 1194 ls.wls_size_scale = util_logbase2(wls_single_size) + 1; 1195 1196 unsigned wls_size = wls_single_size * ls.wls_instances * dev->core_count; 1197 1198 ls.wls_base_pointer = 1199 (panfrost_batch_get_shared_memory(batch, 1200 wls_size, 1201 1))->ptr.gpu; 1202 } else { 1203 ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM; 1204 } 1205 1206 if (ss->info.tls_size) { 1207 unsigned shift = 1208 panfrost_get_stack_shift(ss->info.tls_size); 1209 struct panfrost_bo *bo = 1210 panfrost_batch_get_scratchpad(batch, 1211 ss->info.tls_size, 1212 dev->thread_tls_alloc, 1213 dev->core_count); 1214 1215 ls.tls_size = shift; 1216 ls.tls_base_pointer = bo->ptr.gpu; 1217 } 1218 }; 1219 1220 return t.gpu; 1221} 1222 1223#if PAN_ARCH <= 5 1224static mali_ptr 1225panfrost_get_tex_desc(struct panfrost_batch *batch, 1226 enum pipe_shader_type st, 1227 struct panfrost_sampler_view *view) 1228{ 1229 if (!view) 1230 return (mali_ptr) 0; 1231 1232 struct pipe_sampler_view *pview = &view->base; 1233 struct panfrost_resource *rsrc = pan_resource(pview->texture); 1234 1235 panfrost_batch_read_rsrc(batch, rsrc, st); 1236 panfrost_batch_add_bo(batch, view->state.bo, st); 1237 1238 return view->state.gpu; 1239} 1240#endif 1241 1242static void 1243panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, 1244 struct pipe_context *pctx, 1245 struct pipe_resource *texture) 1246{ 1247 struct panfrost_device *device = pan_device(pctx->screen); 1248 struct panfrost_context *ctx = pan_context(pctx); 1249 struct panfrost_resource *prsrc = (struct panfrost_resource *)texture; 1250 enum pipe_format format = so->base.format; 1251 assert(prsrc->image.data.bo); 1252 1253 /* Format to access the stencil/depth portion of a Z32_S8 texture */ 1254 if (format == PIPE_FORMAT_X32_S8X24_UINT) { 1255 assert(prsrc->separate_stencil); 1256 texture = &prsrc->separate_stencil->base; 1257 prsrc = (struct panfrost_resource *)texture; 1258 format = texture->format; 1259 } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { 1260 format = PIPE_FORMAT_Z32_FLOAT; 1261 } 1262 1263 const struct util_format_description *desc = util_format_description(format); 1264 1265 bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM); 1266 1267 if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) { 1268 if (desc->is_snorm) 1269 format = PIPE_FORMAT_R8G8B8A8_SNORM; 1270 else 1271 format = PIPE_FORMAT_R8G8B8A8_UNORM; 1272 desc = util_format_description(format); 1273 } 1274 1275 so->texture_bo = prsrc->image.data.bo->ptr.gpu; 1276 so->modifier = prsrc->image.layout.modifier; 1277 1278 /* MSAA only supported for 2D textures */ 1279 1280 assert(texture->nr_samples <= 1 || 1281 so->base.target == PIPE_TEXTURE_2D || 1282 so->base.target == PIPE_TEXTURE_2D_ARRAY); 1283 1284 enum mali_texture_dimension type = 1285 panfrost_translate_texture_dimension(so->base.target); 1286 1287 bool is_buffer = (so->base.target == PIPE_BUFFER); 1288 1289 unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level; 1290 unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level; 1291 unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer; 1292 unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer; 1293 unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0; 1294 unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) / 1295 util_format_get_blocksize(format); 1296 1297 if (so->base.target == PIPE_TEXTURE_3D) { 1298 first_layer /= prsrc->image.layout.depth; 1299 last_layer /= prsrc->image.layout.depth; 1300 assert(!first_layer && !last_layer); 1301 } 1302 1303 struct pan_image_view iview = { 1304 .format = format, 1305 .dim = type, 1306 .first_level = first_level, 1307 .last_level = last_level, 1308 .first_layer = first_layer, 1309 .last_layer = last_layer, 1310 .swizzle = { 1311 so->base.swizzle_r, 1312 so->base.swizzle_g, 1313 so->base.swizzle_b, 1314 so->base.swizzle_a, 1315 }, 1316 .image = &prsrc->image, 1317 1318 .buf.offset = buf_offset, 1319 .buf.size = buf_size, 1320 }; 1321 1322 unsigned size = 1323 (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) + 1324 GENX(panfrost_estimate_texture_payload_size)(&iview); 1325 1326 struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64); 1327 so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu); 1328 1329 void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu; 1330 1331 if (PAN_ARCH <= 5) { 1332 payload.cpu += pan_size(TEXTURE); 1333 payload.gpu += pan_size(TEXTURE); 1334 } 1335 1336 GENX(panfrost_new_texture)(device, &iview, tex, &payload); 1337} 1338 1339static void 1340panfrost_update_sampler_view(struct panfrost_sampler_view *view, 1341 struct pipe_context *pctx) 1342{ 1343 struct panfrost_resource *rsrc = pan_resource(view->base.texture); 1344 if (view->texture_bo != rsrc->image.data.bo->ptr.gpu || 1345 view->modifier != rsrc->image.layout.modifier) { 1346 panfrost_bo_unreference(view->state.bo); 1347 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base); 1348 } 1349} 1350 1351static mali_ptr 1352panfrost_emit_texture_descriptors(struct panfrost_batch *batch, 1353 enum pipe_shader_type stage) 1354{ 1355 struct panfrost_context *ctx = batch->ctx; 1356 1357 if (!ctx->sampler_view_count[stage]) 1358 return 0; 1359 1360#if PAN_ARCH >= 6 1361 struct panfrost_ptr T = 1362 pan_pool_alloc_desc_array(&batch->pool.base, 1363 ctx->sampler_view_count[stage], 1364 TEXTURE); 1365 struct mali_texture_packed *out = 1366 (struct mali_texture_packed *) T.cpu; 1367 1368 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { 1369 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; 1370 struct pipe_sampler_view *pview = &view->base; 1371 struct panfrost_resource *rsrc = pan_resource(pview->texture); 1372 1373 panfrost_update_sampler_view(view, &ctx->base); 1374 out[i] = view->bifrost_descriptor; 1375 1376 panfrost_batch_read_rsrc(batch, rsrc, stage); 1377 panfrost_batch_add_bo(batch, view->state.bo, stage); 1378 } 1379 1380 return T.gpu; 1381#else 1382 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS]; 1383 1384 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { 1385 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; 1386 1387 panfrost_update_sampler_view(view, &ctx->base); 1388 1389 trampolines[i] = panfrost_get_tex_desc(batch, stage, view); 1390 } 1391 1392 return pan_pool_upload_aligned(&batch->pool.base, trampolines, 1393 sizeof(uint64_t) * 1394 ctx->sampler_view_count[stage], 1395 sizeof(uint64_t)); 1396#endif 1397} 1398 1399static mali_ptr 1400panfrost_emit_sampler_descriptors(struct panfrost_batch *batch, 1401 enum pipe_shader_type stage) 1402{ 1403 struct panfrost_context *ctx = batch->ctx; 1404 1405 if (!ctx->sampler_count[stage]) 1406 return 0; 1407 1408 struct panfrost_ptr T = 1409 pan_pool_alloc_desc_array(&batch->pool.base, 1410 ctx->sampler_count[stage], 1411 SAMPLER); 1412 struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu; 1413 1414 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) 1415 out[i] = ctx->samplers[stage][i]->hw; 1416 1417 return T.gpu; 1418} 1419 1420/* Packs all image attribute descs and attribute buffer descs. 1421 * `first_image_buf_index` must be the index of the first image attribute buffer descriptor. 1422 */ 1423static void 1424emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader, 1425 struct mali_attribute_packed *attribs, unsigned first_buf) 1426{ 1427 struct panfrost_device *dev = pan_device(ctx->base.screen); 1428 unsigned last_bit = util_last_bit(ctx->image_mask[shader]); 1429 1430 for (unsigned i = 0; i < last_bit; ++i) { 1431 enum pipe_format format = ctx->images[shader][i].format; 1432 1433 pan_pack(attribs + i, ATTRIBUTE, cfg) { 1434 /* Continuation record means 2 buffers per image */ 1435 cfg.buffer_index = first_buf + (i * 2); 1436 cfg.offset_enable = (PAN_ARCH <= 5); 1437 cfg.format = dev->formats[format].hw; 1438 } 1439 } 1440} 1441 1442static enum mali_attribute_type 1443pan_modifier_to_attr_type(uint64_t modifier) 1444{ 1445 switch (modifier) { 1446 case DRM_FORMAT_MOD_LINEAR: 1447 return MALI_ATTRIBUTE_TYPE_3D_LINEAR; 1448 case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: 1449 return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED; 1450 default: 1451 unreachable("Invalid modifier for attribute record"); 1452 } 1453} 1454 1455static void 1456emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader, 1457 struct mali_attribute_buffer_packed *bufs, 1458 unsigned first_image_buf_index) 1459{ 1460 struct panfrost_context *ctx = batch->ctx; 1461 unsigned last_bit = util_last_bit(ctx->image_mask[shader]); 1462 1463 for (unsigned i = 0; i < last_bit; ++i) { 1464 struct pipe_image_view *image = &ctx->images[shader][i]; 1465 1466 if (!(ctx->image_mask[shader] & (1 << i)) || 1467 !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) { 1468 /* Unused image bindings */ 1469 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg); 1470 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg); 1471 continue; 1472 } 1473 1474 struct panfrost_resource *rsrc = pan_resource(image->resource); 1475 1476 /* TODO: MSAA */ 1477 assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported"); 1478 1479 bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D; 1480 bool is_buffer = rsrc->base.target == PIPE_BUFFER; 1481 1482 unsigned offset = is_buffer ? image->u.buf.offset : 1483 panfrost_texture_offset(&rsrc->image.layout, 1484 image->u.tex.level, 1485 is_3d ? 0 : image->u.tex.first_layer, 1486 is_3d ? image->u.tex.first_layer : 0); 1487 1488 if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) { 1489 panfrost_batch_write_rsrc(batch, rsrc, shader); 1490 1491 unsigned level = is_buffer ? 0 : image->u.tex.level; 1492 BITSET_SET(rsrc->valid.data, level); 1493 1494 if (is_buffer) { 1495 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, 1496 0, rsrc->base.width0); 1497 } 1498 } else { 1499 panfrost_batch_read_rsrc(batch, rsrc, shader); 1500 } 1501 1502 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) { 1503 cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier); 1504 cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset; 1505 cfg.stride = util_format_get_blocksize(image->format); 1506 cfg.size = rsrc->image.data.bo->size - offset; 1507 } 1508 1509 if (is_buffer) { 1510 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { 1511 cfg.s_dimension = rsrc->base.width0 / 1512 util_format_get_blocksize(image->format); 1513 cfg.t_dimension = cfg.r_dimension = 1; 1514 } 1515 1516 continue; 1517 } 1518 1519 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { 1520 unsigned level = image->u.tex.level; 1521 1522 cfg.s_dimension = u_minify(rsrc->base.width0, level); 1523 cfg.t_dimension = u_minify(rsrc->base.height0, level); 1524 cfg.r_dimension = is_3d ? 1525 u_minify(rsrc->base.depth0, level) : 1526 image->u.tex.last_layer - image->u.tex.first_layer + 1; 1527 1528 cfg.row_stride = 1529 rsrc->image.layout.slices[level].row_stride; 1530 1531 if (rsrc->base.target != PIPE_TEXTURE_2D) { 1532 cfg.slice_stride = 1533 panfrost_get_layer_stride(&rsrc->image.layout, 1534 level); 1535 } 1536 } 1537 } 1538} 1539 1540static mali_ptr 1541panfrost_emit_image_attribs(struct panfrost_batch *batch, 1542 mali_ptr *buffers, 1543 enum pipe_shader_type type) 1544{ 1545 struct panfrost_context *ctx = batch->ctx; 1546 struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type); 1547 1548 if (!shader->info.attribute_count) { 1549 *buffers = 0; 1550 return 0; 1551 } 1552 1553 /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */ 1554 unsigned attr_count = shader->info.attribute_count; 1555 unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0); 1556 1557 struct panfrost_ptr bufs = 1558 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER); 1559 1560 struct panfrost_ptr attribs = 1561 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE); 1562 1563 emit_image_attribs(ctx, type, attribs.cpu, 0); 1564 emit_image_bufs(batch, type, bufs.cpu, 0); 1565 1566 /* We need an empty attrib buf to stop the prefetching on Bifrost */ 1567#if PAN_ARCH >= 6 1568 pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)), 1569 ATTRIBUTE_BUFFER, cfg); 1570#endif 1571 1572 *buffers = bufs.gpu; 1573 return attribs.gpu; 1574} 1575 1576static mali_ptr 1577panfrost_emit_vertex_data(struct panfrost_batch *batch, 1578 mali_ptr *buffers) 1579{ 1580 struct panfrost_context *ctx = batch->ctx; 1581 struct panfrost_vertex_state *so = ctx->vertex; 1582 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); 1583 bool instanced = ctx->indirect_draw || ctx->instance_count > 1; 1584 uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX]; 1585 unsigned nr_images = util_last_bit(image_mask); 1586 1587 /* Worst case: everything is NPOT, which is only possible if instancing 1588 * is enabled. Otherwise single record is gauranteed. 1589 * Also, we allocate more memory than what's needed here if either instancing 1590 * is enabled or images are present, this can be improved. */ 1591 unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1; 1592 unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) + 1593 (PAN_ARCH >= 6 ? 1 : 0); 1594 1595#if PAN_ARCH <= 5 1596 /* Midgard needs vertexid/instanceid handled specially */ 1597 bool special_vbufs = vs->info.attribute_count >= PAN_VERTEX_ID; 1598 1599 if (special_vbufs) 1600 nr_bufs += 2; 1601#endif 1602 1603 if (!nr_bufs) { 1604 *buffers = 0; 1605 return 0; 1606 } 1607 1608 struct panfrost_ptr S = 1609 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, 1610 ATTRIBUTE_BUFFER); 1611 struct panfrost_ptr T = 1612 pan_pool_alloc_desc_array(&batch->pool.base, 1613 vs->info.attribute_count, 1614 ATTRIBUTE); 1615 1616 struct mali_attribute_buffer_packed *bufs = 1617 (struct mali_attribute_buffer_packed *) S.cpu; 1618 1619 struct mali_attribute_packed *out = 1620 (struct mali_attribute_packed *) T.cpu; 1621 1622 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 }; 1623 unsigned k = 0; 1624 1625 for (unsigned i = 0; i < so->nr_bufs; ++i) { 1626 unsigned vbi = so->buffers[i].vbi; 1627 unsigned divisor = so->buffers[i].divisor; 1628 attrib_to_buffer[i] = k; 1629 1630 if (!(ctx->vb_mask & (1 << vbi))) 1631 continue; 1632 1633 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; 1634 struct panfrost_resource *rsrc; 1635 1636 rsrc = pan_resource(buf->buffer.resource); 1637 if (!rsrc) 1638 continue; 1639 1640 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); 1641 1642 /* Mask off lower bits, see offset fixup below */ 1643 mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset; 1644 mali_ptr addr = raw_addr & ~63; 1645 1646 /* Since we advanced the base pointer, we shrink the buffer 1647 * size, but add the offset we subtracted */ 1648 unsigned size = rsrc->base.width0 + (raw_addr - addr) 1649 - buf->buffer_offset; 1650 1651 /* When there is a divisor, the hardware-level divisor is 1652 * the product of the instance divisor and the padded count */ 1653 unsigned stride = buf->stride; 1654 1655 if (ctx->indirect_draw) { 1656 /* We allocated 2 records for each attribute buffer */ 1657 assert((k & 1) == 0); 1658 1659 /* With indirect draws we can't guess the vertex_count. 1660 * Pre-set the address, stride and size fields, the 1661 * compute shader do the rest. 1662 */ 1663 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { 1664 cfg.type = MALI_ATTRIBUTE_TYPE_1D; 1665 cfg.pointer = addr; 1666 cfg.stride = stride; 1667 cfg.size = size; 1668 } 1669 1670 /* We store the unmodified divisor in the continuation 1671 * slot so the compute shader can retrieve it. 1672 */ 1673 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { 1674 cfg.divisor = divisor; 1675 } 1676 1677 k += 2; 1678 continue; 1679 } 1680 1681 unsigned hw_divisor = ctx->padded_count * divisor; 1682 1683 if (ctx->instance_count <= 1) { 1684 /* Per-instance would be every attribute equal */ 1685 if (divisor) 1686 stride = 0; 1687 1688 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { 1689 cfg.pointer = addr; 1690 cfg.stride = stride; 1691 cfg.size = size; 1692 } 1693 } else if (!divisor) { 1694 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { 1695 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS; 1696 cfg.pointer = addr; 1697 cfg.stride = stride; 1698 cfg.size = size; 1699 cfg.divisor = ctx->padded_count; 1700 } 1701 } else if (util_is_power_of_two_or_zero(hw_divisor)) { 1702 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { 1703 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; 1704 cfg.pointer = addr; 1705 cfg.stride = stride; 1706 cfg.size = size; 1707 cfg.divisor_r = __builtin_ctz(hw_divisor); 1708 } 1709 1710 } else { 1711 unsigned shift = 0, extra_flags = 0; 1712 1713 unsigned magic_divisor = 1714 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); 1715 1716 /* Records with continuations must be aligned */ 1717 k = ALIGN_POT(k, 2); 1718 attrib_to_buffer[i] = k; 1719 1720 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { 1721 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; 1722 cfg.pointer = addr; 1723 cfg.stride = stride; 1724 cfg.size = size; 1725 1726 cfg.divisor_r = shift; 1727 cfg.divisor_e = extra_flags; 1728 } 1729 1730 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { 1731 cfg.divisor_numerator = magic_divisor; 1732 cfg.divisor = divisor; 1733 } 1734 1735 ++k; 1736 } 1737 1738 ++k; 1739 } 1740 1741#if PAN_ARCH <= 5 1742 /* Add special gl_VertexID/gl_InstanceID buffers */ 1743 if (special_vbufs) { 1744 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); 1745 1746 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) { 1747 cfg.buffer_index = k++; 1748 cfg.format = so->formats[PAN_VERTEX_ID]; 1749 } 1750 1751 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); 1752 1753 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) { 1754 cfg.buffer_index = k++; 1755 cfg.format = so->formats[PAN_INSTANCE_ID]; 1756 } 1757 } 1758#endif 1759 1760 k = ALIGN_POT(k, 2); 1761 emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k); 1762 emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k); 1763 k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2); 1764 1765#if PAN_ARCH >= 6 1766 /* We need an empty attrib buf to stop the prefetching on Bifrost */ 1767 pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg); 1768#endif 1769 1770 /* Attribute addresses require 64-byte alignment, so let: 1771 * 1772 * base' = base & ~63 = base - (base & 63) 1773 * offset' = offset + (base & 63) 1774 * 1775 * Since base' + offset' = base + offset, these are equivalent 1776 * addressing modes and now base is 64 aligned. 1777 */ 1778 1779 for (unsigned i = 0; i < so->num_elements; ++i) { 1780 unsigned vbi = so->pipe[i].vertex_buffer_index; 1781 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; 1782 1783 /* BOs are aligned; just fixup for buffer_offset */ 1784 signed src_offset = so->pipe[i].src_offset; 1785 src_offset += (buf->buffer_offset & 63); 1786 1787 /* Base instance offset */ 1788 if (ctx->base_instance && so->pipe[i].instance_divisor) { 1789 src_offset += (ctx->base_instance * buf->stride) / 1790 so->pipe[i].instance_divisor; 1791 } 1792 1793 /* Also, somewhat obscurely per-instance data needs to be 1794 * offset in response to a delayed start in an indexed draw */ 1795 1796 if (so->pipe[i].instance_divisor && ctx->instance_count > 1) 1797 src_offset -= buf->stride * ctx->offset_start; 1798 1799 pan_pack(out + i, ATTRIBUTE, cfg) { 1800 cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]]; 1801 cfg.format = so->formats[i]; 1802 cfg.offset = src_offset; 1803 } 1804 } 1805 1806 *buffers = S.gpu; 1807 return T.gpu; 1808} 1809 1810static mali_ptr 1811panfrost_emit_varyings(struct panfrost_batch *batch, 1812 struct mali_attribute_buffer_packed *slot, 1813 unsigned stride, unsigned count) 1814{ 1815 unsigned size = stride * count; 1816 mali_ptr ptr = 1817 batch->ctx->indirect_draw ? 0 : 1818 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu; 1819 1820 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { 1821 cfg.stride = stride; 1822 cfg.size = size; 1823 cfg.pointer = ptr; 1824 } 1825 1826 return ptr; 1827} 1828 1829static unsigned 1830panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target) 1831{ 1832 return target->buffer_offset + (pan_so_target(target)->offset * stride); 1833} 1834 1835static void 1836panfrost_emit_streamout(struct panfrost_batch *batch, 1837 struct mali_attribute_buffer_packed *slot, 1838 unsigned stride, unsigned count, 1839 struct pipe_stream_output_target *target) 1840{ 1841 unsigned max_size = target->buffer_size; 1842 unsigned expected_size = stride * count; 1843 1844 /* Grab the BO and bind it to the batch */ 1845 struct panfrost_resource *rsrc = pan_resource(target->buffer); 1846 struct panfrost_bo *bo = rsrc->image.data.bo; 1847 1848 panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); 1849 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT); 1850 1851 unsigned offset = panfrost_xfb_offset(stride, target); 1852 1853 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { 1854 cfg.pointer = bo->ptr.gpu + (offset & ~63); 1855 cfg.stride = stride; 1856 cfg.size = MIN2(max_size, expected_size) + (offset & 63); 1857 1858 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, 1859 offset, cfg.size); 1860 } 1861} 1862 1863/* Helpers for manipulating stream out information so we can pack varyings 1864 * accordingly. Compute the src_offset for a given captured varying */ 1865 1866static struct pipe_stream_output * 1867pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc) 1868{ 1869 for (unsigned i = 0; i < info->num_outputs; ++i) { 1870 if (info->output[i].register_index == loc) 1871 return &info->output[i]; 1872 } 1873 1874 unreachable("Varying not captured"); 1875} 1876 1877/* Given a varying, figure out which index it corresponds to */ 1878 1879static inline unsigned 1880pan_varying_index(unsigned present, enum pan_special_varying v) 1881{ 1882 return util_bitcount(present & BITFIELD_MASK(v)); 1883} 1884 1885/* Get the base offset for XFB buffers, which by convention come after 1886 * everything else. Wrapper function for semantic reasons; by construction this 1887 * is just popcount. */ 1888 1889static inline unsigned 1890pan_xfb_base(unsigned present) 1891{ 1892 return util_bitcount(present); 1893} 1894 1895/* Determines which varying buffers are required */ 1896 1897static inline unsigned 1898pan_varying_present(const struct panfrost_device *dev, 1899 struct pan_shader_info *producer, 1900 struct pan_shader_info *consumer, 1901 uint16_t point_coord_mask) 1902{ 1903 /* At the moment we always emit general and position buffers. Not 1904 * strictly necessary but usually harmless */ 1905 1906 unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION); 1907 1908 /* Enable special buffers by the shader info */ 1909 1910 if (producer->vs.writes_point_size) 1911 present |= BITFIELD_BIT(PAN_VARY_PSIZ); 1912 1913#if PAN_ARCH <= 5 1914 /* On Midgard, these exist as real varyings. Later architectures use 1915 * LD_VAR_SPECIAL reads instead. */ 1916 1917 if (consumer->fs.reads_point_coord) 1918 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); 1919 1920 if (consumer->fs.reads_face) 1921 present |= BITFIELD_BIT(PAN_VARY_FACE); 1922 1923 if (consumer->fs.reads_frag_coord) 1924 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD); 1925 1926 /* Also, if we have a point sprite, we need a point coord buffer */ 1927 1928 for (unsigned i = 0; i < consumer->varyings.input_count; i++) { 1929 gl_varying_slot loc = consumer->varyings.input[i].location; 1930 1931 if (util_varying_is_point_coord(loc, point_coord_mask)) 1932 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); 1933 } 1934#endif 1935 1936 return present; 1937} 1938 1939/* Emitters for varying records */ 1940 1941static void 1942pan_emit_vary(const struct panfrost_device *dev, 1943 struct mali_attribute_packed *out, 1944 unsigned buffer_index, 1945 mali_pixel_format format, unsigned offset) 1946{ 1947 pan_pack(out, ATTRIBUTE, cfg) { 1948 cfg.buffer_index = buffer_index; 1949 cfg.offset_enable = (PAN_ARCH <= 5); 1950 cfg.format = format; 1951 cfg.offset = offset; 1952 } 1953} 1954 1955/* Special records */ 1956 1957static const struct { 1958 unsigned components; 1959 enum mali_format format; 1960} pan_varying_formats[PAN_VARY_MAX] = { 1961 [PAN_VARY_POSITION] = { 4, MALI_SNAP_4 }, 1962 [PAN_VARY_PSIZ] = { 1, MALI_R16F }, 1963 [PAN_VARY_PNTCOORD] = { 1, MALI_R16F }, 1964 [PAN_VARY_FACE] = { 1, MALI_R32I }, 1965 [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F }, 1966}; 1967 1968static mali_pixel_format 1969pan_special_format(const struct panfrost_device *dev, 1970 enum pan_special_varying buf) 1971{ 1972 assert(buf < PAN_VARY_MAX); 1973 mali_pixel_format format = (pan_varying_formats[buf].format << 12); 1974 1975#if PAN_ARCH <= 6 1976 unsigned nr = pan_varying_formats[buf].components; 1977 format |= panfrost_get_default_swizzle(nr); 1978#endif 1979 1980 return format; 1981} 1982 1983static void 1984pan_emit_vary_special(const struct panfrost_device *dev, 1985 struct mali_attribute_packed *out, 1986 unsigned present, enum pan_special_varying buf) 1987{ 1988 pan_emit_vary(dev, out, pan_varying_index(present, buf), 1989 pan_special_format(dev, buf), 0); 1990} 1991 1992/* Negative indicates a varying is not found */ 1993 1994static signed 1995pan_find_vary(const struct pan_shader_varying *vary, 1996 unsigned vary_count, unsigned loc) 1997{ 1998 for (unsigned i = 0; i < vary_count; ++i) { 1999 if (vary[i].location == loc) 2000 return i; 2001 } 2002 2003 return -1; 2004} 2005 2006/* Assign varying locations for the general buffer. Returns the calculated 2007 * per-vertex stride, and outputs offsets into the passed array. Negative 2008 * offset indicates a varying is not used. */ 2009 2010static unsigned 2011pan_assign_varyings(const struct panfrost_device *dev, 2012 struct pan_shader_info *producer, 2013 struct pan_shader_info *consumer, 2014 signed *offsets) 2015{ 2016 unsigned producer_count = producer->varyings.output_count; 2017 unsigned consumer_count = consumer->varyings.input_count; 2018 2019 const struct pan_shader_varying *producer_vars = producer->varyings.output; 2020 const struct pan_shader_varying *consumer_vars = consumer->varyings.input; 2021 2022 unsigned stride = 0; 2023 2024 for (unsigned i = 0; i < producer_count; ++i) { 2025 signed loc = pan_find_vary(consumer_vars, consumer_count, 2026 producer_vars[i].location); 2027 2028 if (loc >= 0) { 2029 offsets[i] = stride; 2030 2031 enum pipe_format format = consumer_vars[loc].format; 2032 stride += util_format_get_blocksize(format); 2033 } else { 2034 offsets[i] = -1; 2035 } 2036 } 2037 2038 return stride; 2039} 2040 2041/* Emitter for a single varying (attribute) descriptor */ 2042 2043static void 2044panfrost_emit_varying(const struct panfrost_device *dev, 2045 struct mali_attribute_packed *out, 2046 const struct pan_shader_varying varying, 2047 enum pipe_format pipe_format, 2048 unsigned present, 2049 uint16_t point_sprite_mask, 2050 struct pipe_stream_output_info *xfb, 2051 uint64_t xfb_loc_mask, 2052 unsigned max_xfb, 2053 unsigned *xfb_offsets, 2054 signed offset, 2055 enum pan_special_varying pos_varying) 2056{ 2057 /* Note: varying.format != pipe_format in some obscure cases due to a 2058 * limitation of the NIR linker. This should be fixed in the future to 2059 * eliminate the additional lookups. See: 2060 * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex 2061 */ 2062 gl_varying_slot loc = varying.location; 2063 mali_pixel_format format = dev->formats[pipe_format].hw; 2064 2065 struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ? 2066 pan_get_so(xfb, loc) : NULL; 2067 2068 if (util_varying_is_point_coord(loc, point_sprite_mask)) { 2069 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD); 2070 } else if (o && o->output_buffer < max_xfb) { 2071 unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63; 2072 2073 pan_emit_vary(dev, out, 2074 pan_xfb_base(present) + o->output_buffer, 2075 format, (o->dst_offset * 4) + fixup_offset); 2076 } else if (loc == VARYING_SLOT_POS) { 2077 pan_emit_vary_special(dev, out, present, pos_varying); 2078 } else if (loc == VARYING_SLOT_PSIZ) { 2079 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ); 2080 } else if (loc == VARYING_SLOT_FACE) { 2081 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE); 2082 } else if (offset < 0) { 2083 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0); 2084 } else { 2085 STATIC_ASSERT(PAN_VARY_GENERAL == 0); 2086 pan_emit_vary(dev, out, 0, format, offset); 2087 } 2088} 2089 2090/* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time, 2091 * rather than draw time (under good conditions). */ 2092 2093static void 2094panfrost_emit_varying_descs( 2095 struct panfrost_pool *pool, 2096 struct panfrost_shader_state *producer, 2097 struct panfrost_shader_state *consumer, 2098 struct panfrost_streamout *xfb, 2099 uint16_t point_coord_mask, 2100 struct pan_linkage *out) 2101{ 2102 struct panfrost_device *dev = pool->base.dev; 2103 struct pipe_stream_output_info *xfb_info = &producer->stream_output; 2104 unsigned producer_count = producer->info.varyings.output_count; 2105 unsigned consumer_count = consumer->info.varyings.input_count; 2106 2107 /* Offsets within the general varying buffer, indexed by location */ 2108 signed offsets[PAN_MAX_VARYINGS]; 2109 assert(producer_count <= ARRAY_SIZE(offsets)); 2110 assert(consumer_count <= ARRAY_SIZE(offsets)); 2111 2112 /* Allocate enough descriptors for both shader stages */ 2113 struct panfrost_ptr T = 2114 pan_pool_alloc_desc_array(&pool->base, 2115 producer_count + consumer_count, 2116 ATTRIBUTE); 2117 2118 /* Take a reference if we're being put on the CSO */ 2119 if (!pool->owned) { 2120 out->bo = pool->transient_bo; 2121 panfrost_bo_reference(out->bo); 2122 } 2123 2124 struct mali_attribute_packed *descs = T.cpu; 2125 out->producer = producer_count ? T.gpu : 0; 2126 out->consumer = consumer_count ? T.gpu + 2127 (pan_size(ATTRIBUTE) * producer_count) : 0; 2128 2129 /* Lay out the varyings. Must use producer to lay out, in order to 2130 * respect transform feedback precisions. */ 2131 out->present = pan_varying_present(dev, &producer->info, 2132 &consumer->info, point_coord_mask); 2133 2134 out->stride = pan_assign_varyings(dev, &producer->info, 2135 &consumer->info, offsets); 2136 2137 unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS]; 2138 2139 for (unsigned i = 0; i < xfb->num_targets; ++i) { 2140 xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4, 2141 xfb->targets[i]); 2142 } 2143 2144 for (unsigned i = 0; i < producer_count; ++i) { 2145 signed j = pan_find_vary(consumer->info.varyings.input, 2146 consumer->info.varyings.input_count, 2147 producer->info.varyings.output[i].location); 2148 2149 enum pipe_format format = (j >= 0) ? 2150 consumer->info.varyings.input[j].format : 2151 producer->info.varyings.output[i].format; 2152 2153 panfrost_emit_varying(dev, descs + i, 2154 producer->info.varyings.output[i], format, 2155 out->present, 0, &producer->stream_output, 2156 producer->so_mask, xfb->num_targets, 2157 xfb_offsets, offsets[i], PAN_VARY_POSITION); 2158 } 2159 2160 for (unsigned i = 0; i < consumer_count; ++i) { 2161 signed j = pan_find_vary(producer->info.varyings.output, 2162 producer->info.varyings.output_count, 2163 consumer->info.varyings.input[i].location); 2164 2165 signed offset = (j >= 0) ? offsets[j] : -1; 2166 2167 panfrost_emit_varying(dev, descs + producer_count + i, 2168 consumer->info.varyings.input[i], 2169 consumer->info.varyings.input[i].format, 2170 out->present, point_coord_mask, 2171 &producer->stream_output, producer->so_mask, 2172 xfb->num_targets, xfb_offsets, offset, 2173 PAN_VARY_FRAGCOORD); 2174 } 2175} 2176 2177#if PAN_ARCH <= 5 2178static void 2179pan_emit_special_input(struct mali_attribute_buffer_packed *out, 2180 unsigned present, 2181 enum pan_special_varying v, 2182 unsigned special) 2183{ 2184 if (present & BITFIELD_BIT(v)) { 2185 unsigned idx = pan_varying_index(present, v); 2186 2187 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) { 2188 cfg.special = special; 2189 cfg.type = 0; 2190 } 2191 } 2192} 2193#endif 2194 2195static void 2196panfrost_emit_varying_descriptor(struct panfrost_batch *batch, 2197 unsigned vertex_count, 2198 mali_ptr *vs_attribs, 2199 mali_ptr *fs_attribs, 2200 mali_ptr *buffers, 2201 unsigned *buffer_count, 2202 mali_ptr *position, 2203 mali_ptr *psiz, 2204 bool point_coord_replace) 2205{ 2206 /* Load the shaders */ 2207 struct panfrost_context *ctx = batch->ctx; 2208 struct panfrost_shader_state *vs, *fs; 2209 2210 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); 2211 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); 2212 2213 uint16_t point_coord_mask = 0; 2214 2215#if PAN_ARCH <= 5 2216 /* Point sprites are lowered on Bifrost and newer */ 2217 if (point_coord_replace) 2218 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable; 2219#endif 2220 2221 /* In good conditions, we only need to link varyings once */ 2222 bool prelink = 2223 (point_coord_mask == 0) && 2224 (ctx->streamout.num_targets == 0) && 2225 !vs->info.separable && 2226 !fs->info.separable; 2227 2228 /* Try to reduce copies */ 2229 struct pan_linkage _linkage; 2230 struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage; 2231 2232 /* Emit ATTRIBUTE descriptors if needed */ 2233 if (!prelink || vs->linkage.bo == NULL) { 2234 struct panfrost_pool *pool = 2235 prelink ? &ctx->descs : &batch->pool; 2236 2237 panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage); 2238 } 2239 2240 struct pipe_stream_output_info *so = &vs->stream_output; 2241 unsigned present = linkage->present, stride = linkage->stride; 2242 unsigned xfb_base = pan_xfb_base(present); 2243 struct panfrost_ptr T = 2244 pan_pool_alloc_desc_array(&batch->pool.base, 2245 xfb_base + 2246 ctx->streamout.num_targets + 1, 2247 ATTRIBUTE_BUFFER); 2248 struct mali_attribute_buffer_packed *varyings = 2249 (struct mali_attribute_buffer_packed *) T.cpu; 2250 2251 if (buffer_count) 2252 *buffer_count = xfb_base + ctx->streamout.num_targets; 2253 2254#if PAN_ARCH >= 6 2255 /* Suppress prefetch on Bifrost */ 2256 memset(varyings + (xfb_base * ctx->streamout.num_targets), 0, sizeof(*varyings)); 2257#endif 2258 2259 /* Emit the stream out buffers. We need enough room for all the 2260 * vertices we emit across all instances */ 2261 2262 unsigned out_count = ctx->instance_count * 2263 u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count); 2264 2265 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { 2266 panfrost_emit_streamout(batch, &varyings[xfb_base + i], 2267 so->stride[i] * 4, 2268 out_count, 2269 ctx->streamout.targets[i]); 2270 } 2271 2272 if (stride) { 2273 panfrost_emit_varyings(batch, 2274 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], 2275 stride, vertex_count); 2276 } 2277 2278 /* fp32 vec4 gl_Position */ 2279 *position = panfrost_emit_varyings(batch, 2280 &varyings[pan_varying_index(present, PAN_VARY_POSITION)], 2281 sizeof(float) * 4, vertex_count); 2282 2283 if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) { 2284 *psiz = panfrost_emit_varyings(batch, 2285 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2286 2, vertex_count); 2287 } 2288 2289#if PAN_ARCH <= 5 2290 pan_emit_special_input(varyings, present, 2291 PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD); 2292 pan_emit_special_input(varyings, present, PAN_VARY_FACE, 2293 MALI_ATTRIBUTE_SPECIAL_FRONT_FACING); 2294 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, 2295 MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); 2296#endif 2297 2298 *buffers = T.gpu; 2299 *vs_attribs = linkage->producer; 2300 *fs_attribs = linkage->consumer; 2301} 2302 2303static void 2304panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch, 2305 const struct panfrost_ptr *vertex_job, 2306 const struct panfrost_ptr *tiler_job) 2307{ 2308 struct panfrost_context *ctx = batch->ctx; 2309 2310 /* If rasterizer discard is enable, only submit the vertex. XXX - set 2311 * job_barrier in case buffers get ping-ponged and we need to enforce 2312 * ordering, this has a perf hit! See 2313 * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */ 2314 2315 unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard, 2316 MALI_JOB_TYPE_VERTEX, true, false, 2317 ctx->indirect_draw ? 2318 batch->indirect_draw_job_id : 0, 2319 0, vertex_job, false); 2320 2321 if (ctx->rasterizer->base.rasterizer_discard || batch->scissor_culls_everything) 2322 return; 2323 2324 panfrost_add_job(&batch->pool.base, &batch->scoreboard, 2325 MALI_JOB_TYPE_TILER, false, false, 2326 vertex, 0, tiler_job, false); 2327} 2328 2329static void 2330emit_tls(struct panfrost_batch *batch) 2331{ 2332 struct panfrost_device *dev = pan_device(batch->ctx->base.screen); 2333 2334 /* Emitted with the FB descriptor on Midgard. */ 2335 if (PAN_ARCH <= 5 && batch->framebuffer.gpu) 2336 return; 2337 2338 struct panfrost_bo *tls_bo = 2339 batch->stack_size ? 2340 panfrost_batch_get_scratchpad(batch, 2341 batch->stack_size, 2342 dev->thread_tls_alloc, 2343 dev->core_count): 2344 NULL; 2345 struct pan_tls_info tls = { 2346 .tls = { 2347 .ptr = tls_bo ? tls_bo->ptr.gpu : 0, 2348 .size = batch->stack_size, 2349 }, 2350 }; 2351 2352 assert(batch->tls.cpu); 2353 GENX(pan_emit_tls)(&tls, batch->tls.cpu); 2354} 2355 2356static void 2357emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb) 2358{ 2359 struct panfrost_device *dev = pan_device(batch->ctx->base.screen); 2360 struct panfrost_bo *tls_bo = 2361 batch->stack_size ? 2362 panfrost_batch_get_scratchpad(batch, 2363 batch->stack_size, 2364 dev->thread_tls_alloc, 2365 dev->core_count): 2366 NULL; 2367 struct pan_tls_info tls = { 2368 .tls = { 2369 .ptr = tls_bo ? tls_bo->ptr.gpu : 0, 2370 .size = batch->stack_size, 2371 }, 2372 }; 2373 2374 batch->framebuffer.gpu |= 2375 GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx, 2376 batch->framebuffer.cpu); 2377} 2378 2379/* Mark a surface as written */ 2380 2381static void 2382panfrost_initialize_surface(struct panfrost_batch *batch, 2383 struct pipe_surface *surf) 2384{ 2385 if (surf) { 2386 struct panfrost_resource *rsrc = pan_resource(surf->texture); 2387 BITSET_SET(rsrc->valid.data, surf->u.tex.level); 2388 } 2389} 2390 2391/* Generate a fragment job. This should be called once per frame. (According to 2392 * presentations, this is supposed to correspond to eglSwapBuffers) */ 2393 2394static mali_ptr 2395emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) 2396{ 2397 /* Mark the affected buffers as initialized, since we're writing to it. 2398 * Also, add the surfaces we're writing to to the batch */ 2399 2400 struct pipe_framebuffer_state *fb = &batch->key; 2401 2402 for (unsigned i = 0; i < fb->nr_cbufs; ++i) 2403 panfrost_initialize_surface(batch, fb->cbufs[i]); 2404 2405 panfrost_initialize_surface(batch, fb->zsbuf); 2406 2407 /* The passed tile coords can be out of range in some cases, so we need 2408 * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT. 2409 * Theoretically we also need to clamp the coordinates positive, but we 2410 * avoid that edge case as all four values are unsigned. Also, 2411 * theoretically we could clamp the minima, but if that has to happen 2412 * the asserts would fail anyway (since the maxima would get clamped 2413 * and then be smaller than the minima). An edge case of sorts occurs 2414 * when no scissors are added to draw, so by default min=~0 and max=0. 2415 * But that can't happen if any actual drawing occurs (beyond a 2416 * wallpaper reload), so this is again irrelevant in practice. */ 2417 2418 batch->maxx = MIN2(batch->maxx, fb->width); 2419 batch->maxy = MIN2(batch->maxy, fb->height); 2420 2421 /* Rendering region must be at least 1x1; otherwise, there is nothing 2422 * to do and the whole job chain should have been discarded. */ 2423 2424 assert(batch->maxx > batch->minx); 2425 assert(batch->maxy > batch->miny); 2426 2427 struct panfrost_ptr transfer = 2428 pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB); 2429 2430 GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, 2431 transfer.cpu); 2432 2433 return transfer.gpu; 2434} 2435 2436#define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; 2437 2438static uint8_t 2439pan_draw_mode(enum pipe_prim_type mode) 2440{ 2441 switch (mode) { 2442 DEFINE_CASE(POINTS); 2443 DEFINE_CASE(LINES); 2444 DEFINE_CASE(LINE_LOOP); 2445 DEFINE_CASE(LINE_STRIP); 2446 DEFINE_CASE(TRIANGLES); 2447 DEFINE_CASE(TRIANGLE_STRIP); 2448 DEFINE_CASE(TRIANGLE_FAN); 2449 DEFINE_CASE(QUADS); 2450 DEFINE_CASE(POLYGON); 2451#if PAN_ARCH <= 6 2452 DEFINE_CASE(QUAD_STRIP); 2453#endif 2454 2455 default: 2456 unreachable("Invalid draw mode"); 2457 } 2458} 2459 2460#undef DEFINE_CASE 2461 2462/* Count generated primitives (when there is no geom/tess shaders) for 2463 * transform feedback */ 2464 2465static void 2466panfrost_statistics_record( 2467 struct panfrost_context *ctx, 2468 const struct pipe_draw_info *info, 2469 const struct pipe_draw_start_count_bias *draw) 2470{ 2471 if (!ctx->active_queries) 2472 return; 2473 2474 uint32_t prims = u_prims_for_vertices(info->mode, draw->count); 2475 ctx->prims_generated += prims; 2476 2477 if (!ctx->streamout.num_targets) 2478 return; 2479 2480 ctx->tf_prims_generated += prims; 2481} 2482 2483static void 2484panfrost_update_streamout_offsets(struct panfrost_context *ctx) 2485{ 2486 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { 2487 unsigned count; 2488 2489 count = u_stream_outputs_for_vertices(ctx->active_prim, 2490 ctx->vertex_count); 2491 pan_so_target(ctx->streamout.targets[i])->offset += count; 2492 } 2493} 2494 2495static inline void 2496pan_emit_draw_descs(struct panfrost_batch *batch, 2497 struct MALI_DRAW *d, enum pipe_shader_type st) 2498{ 2499 d->offset_start = batch->ctx->offset_start; 2500 d->instance_size = batch->ctx->instance_count > 1 ? 2501 batch->ctx->padded_count : 1; 2502 2503 d->uniform_buffers = batch->uniform_buffers[st]; 2504 d->push_uniforms = batch->push_uniforms[st]; 2505 d->textures = batch->textures[st]; 2506 d->samplers = batch->samplers[st]; 2507} 2508 2509static inline enum mali_index_type 2510panfrost_translate_index_size(unsigned size) 2511{ 2512 STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0); 2513 STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1); 2514 STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2); 2515 2516 return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size; 2517} 2518 2519static void 2520panfrost_draw_emit_vertex(struct panfrost_batch *batch, 2521 const struct pipe_draw_info *info, 2522 void *invocation_template, 2523 mali_ptr vs_vary, mali_ptr varyings, 2524 mali_ptr attribs, mali_ptr attrib_bufs, 2525 void *job) 2526{ 2527 void *section = 2528 pan_section_ptr(job, COMPUTE_JOB, INVOCATION); 2529 memcpy(section, invocation_template, pan_size(INVOCATION)); 2530 2531 pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) { 2532 cfg.job_task_split = 5; 2533 } 2534 2535 pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) { 2536 cfg.draw_descriptor_is_64b = true; 2537 cfg.state = batch->rsd[PIPE_SHADER_VERTEX]; 2538 cfg.attributes = attribs; 2539 cfg.attribute_buffers = attrib_bufs; 2540 cfg.varyings = vs_vary; 2541 cfg.varying_buffers = vs_vary ? varyings : 0; 2542 cfg.thread_storage = batch->tls.gpu; 2543 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX); 2544 } 2545} 2546 2547static void 2548panfrost_emit_primitive_size(struct panfrost_context *ctx, 2549 bool points, mali_ptr size_array, 2550 void *prim_size) 2551{ 2552 struct panfrost_rasterizer *rast = ctx->rasterizer; 2553 2554 pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { 2555 if (panfrost_writes_point_size(ctx)) { 2556 cfg.size_array = size_array; 2557 } else { 2558 cfg.constant = points ? 2559 rast->base.point_size : 2560 rast->base.line_width; 2561 } 2562 } 2563} 2564 2565static bool 2566panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info) 2567{ 2568 unsigned implicit_index = (1 << (info->index_size * 8)) - 1; 2569 bool implicit = info->restart_index == implicit_index; 2570 return info->primitive_restart && implicit; 2571} 2572 2573static inline void 2574panfrost_update_state_tex(struct panfrost_batch *batch, 2575 enum pipe_shader_type st) 2576{ 2577 struct panfrost_context *ctx = batch->ctx; 2578 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st); 2579 2580 unsigned dirty_3d = ctx->dirty; 2581 unsigned dirty = ctx->dirty_shader[st]; 2582 2583 if (dirty & PAN_DIRTY_STAGE_TEXTURE) { 2584 batch->textures[st] = 2585 panfrost_emit_texture_descriptors(batch, st); 2586 } 2587 2588 if (dirty & PAN_DIRTY_STAGE_SAMPLER) { 2589 batch->samplers[st] = 2590 panfrost_emit_sampler_descriptors(batch, st); 2591 } 2592 2593 if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) { 2594 batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st, 2595 &batch->push_uniforms[st]); 2596 } 2597} 2598 2599static inline void 2600panfrost_update_state_3d(struct panfrost_batch *batch) 2601{ 2602 unsigned dirty = batch->ctx->dirty; 2603 2604 if (dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR)) 2605 batch->viewport = panfrost_emit_viewport(batch); 2606 2607 if (dirty & PAN_DIRTY_TLS_SIZE) 2608 panfrost_batch_adjust_stack_size(batch); 2609} 2610 2611static void 2612panfrost_update_state_vs(struct panfrost_batch *batch) 2613{ 2614 enum pipe_shader_type st = PIPE_SHADER_VERTEX; 2615 unsigned dirty = batch->ctx->dirty_shader[st]; 2616 2617 if (dirty & PAN_DIRTY_STAGE_RENDERER) 2618 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st); 2619 2620 panfrost_update_state_tex(batch, st); 2621} 2622 2623static void 2624panfrost_update_state_fs(struct panfrost_batch *batch) 2625{ 2626 enum pipe_shader_type st = PIPE_SHADER_FRAGMENT; 2627 unsigned dirty = batch->ctx->dirty_shader[st]; 2628 2629 if (dirty & PAN_DIRTY_STAGE_RENDERER) 2630 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch); 2631 2632 if (dirty & PAN_DIRTY_STAGE_IMAGE) { 2633 batch->attribs[st] = panfrost_emit_image_attribs(batch, 2634 &batch->attrib_bufs[st], st); 2635 } 2636 2637 panfrost_update_state_tex(batch, st); 2638} 2639 2640#if PAN_ARCH >= 6 2641static mali_ptr 2642panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) 2643{ 2644 struct panfrost_device *dev = pan_device(batch->ctx->base.screen); 2645 2646 if (!vertex_count) 2647 return 0; 2648 2649 if (batch->tiler_ctx.bifrost) 2650 return batch->tiler_ctx.bifrost; 2651 2652 struct panfrost_ptr t = 2653 pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); 2654 2655 GENX(pan_emit_tiler_heap)(dev, t.cpu); 2656 2657 mali_ptr heap = t.gpu; 2658 2659 t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); 2660 GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, 2661 util_framebuffer_get_num_samples(&batch->key), 2662 heap, t.cpu); 2663 2664 batch->tiler_ctx.bifrost = t.gpu; 2665 return batch->tiler_ctx.bifrost; 2666} 2667#endif 2668 2669static void 2670panfrost_draw_emit_tiler(struct panfrost_batch *batch, 2671 const struct pipe_draw_info *info, 2672 const struct pipe_draw_start_count_bias *draw, 2673 void *invocation_template, 2674 mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings, 2675 mali_ptr pos, mali_ptr psiz, void *job) 2676{ 2677 struct panfrost_context *ctx = batch->ctx; 2678 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; 2679 2680 void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); 2681 memcpy(section, invocation_template, pan_size(INVOCATION)); 2682 2683 section = pan_section_ptr(job, TILER_JOB, PRIMITIVE); 2684 pan_pack(section, PRIMITIVE, cfg) { 2685 cfg.draw_mode = pan_draw_mode(info->mode); 2686 if (panfrost_writes_point_size(ctx)) 2687 cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; 2688 2689 /* For line primitives, PRIMITIVE.first_provoking_vertex must 2690 * be set to true and the provoking vertex is selected with 2691 * DRAW.flat_shading_vertex. 2692 */ 2693 if (info->mode == PIPE_PRIM_LINES || 2694 info->mode == PIPE_PRIM_LINE_LOOP || 2695 info->mode == PIPE_PRIM_LINE_STRIP) 2696 cfg.first_provoking_vertex = true; 2697 else 2698 cfg.first_provoking_vertex = rast->flatshade_first; 2699 2700 if (panfrost_is_implicit_prim_restart(info)) { 2701 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT; 2702 } else if (info->primitive_restart) { 2703 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT; 2704 cfg.primitive_restart_index = info->restart_index; 2705 } 2706 2707 cfg.job_task_split = 6; 2708 2709 cfg.index_count = ctx->indirect_draw ? 1 : draw->count; 2710 cfg.index_type = panfrost_translate_index_size(info->index_size); 2711 2712 if (cfg.index_type) { 2713 cfg.indices = indices; 2714 cfg.base_vertex_offset = draw->index_bias - ctx->offset_start; 2715 } 2716 } 2717 2718 enum pipe_prim_type prim = u_reduced_prim(info->mode); 2719 bool polygon = (prim == PIPE_PRIM_TRIANGLES); 2720 void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); 2721 2722#if PAN_ARCH >= 6 2723 pan_section_pack(job, TILER_JOB, TILER, cfg) { 2724 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); 2725 } 2726 2727 pan_section_pack(job, TILER_JOB, PADDING, cfg); 2728#endif 2729 2730 section = pan_section_ptr(job, TILER_JOB, DRAW); 2731 pan_pack(section, DRAW, cfg) { 2732 cfg.four_components_per_vertex = true; 2733 cfg.draw_descriptor_is_64b = true; 2734 cfg.front_face_ccw = rast->front_ccw; 2735 2736 /* 2737 * From the Gallium documentation, 2738 * pipe_rasterizer_state::cull_face "indicates which faces of 2739 * polygons to cull". Points and lines are not considered 2740 * polygons and should be drawn even if all faces are culled. 2741 * The hardware does not take primitive type into account when 2742 * culling, so we need to do that check ourselves. 2743 */ 2744 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT); 2745 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK); 2746 cfg.position = pos; 2747 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT]; 2748 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT]; 2749 cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT]; 2750 cfg.viewport = batch->viewport; 2751 cfg.varyings = fs_vary; 2752 cfg.varying_buffers = fs_vary ? varyings : 0; 2753 cfg.thread_storage = batch->tls.gpu; 2754 2755 /* For all primitives but lines DRAW.flat_shading_vertex must 2756 * be set to 0 and the provoking vertex is selected with the 2757 * PRIMITIVE.first_provoking_vertex field. 2758 */ 2759 if (prim == PIPE_PRIM_LINES) { 2760 /* The logic is inverted across arches. */ 2761 cfg.flat_shading_vertex = rast->flatshade_first 2762 ^ (PAN_ARCH <= 5); 2763 } 2764 2765 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT); 2766 2767 if (ctx->occlusion_query && ctx->active_queries) { 2768 if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER) 2769 cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER; 2770 else 2771 cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE; 2772 2773 struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc); 2774 cfg.occlusion = rsrc->image.data.bo->ptr.gpu; 2775 panfrost_batch_write_rsrc(ctx->batch, rsrc, 2776 PIPE_SHADER_FRAGMENT); 2777 } 2778 } 2779 2780 panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); 2781} 2782 2783static void 2784panfrost_direct_draw(struct panfrost_batch *batch, 2785 const struct pipe_draw_info *info, 2786 unsigned drawid_offset, 2787 const struct pipe_draw_start_count_bias *draw) 2788{ 2789 if (!draw->count || !info->instance_count) 2790 return; 2791 2792 struct panfrost_context *ctx = batch->ctx; 2793 2794 /* Take into account a negative bias */ 2795 ctx->indirect_draw = false; 2796 ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0); 2797 ctx->instance_count = info->instance_count; 2798 ctx->base_vertex = info->index_size ? draw->index_bias : 0; 2799 ctx->base_instance = info->start_instance; 2800 ctx->active_prim = info->mode; 2801 ctx->drawid = drawid_offset; 2802 2803 struct panfrost_ptr tiler = 2804 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); 2805 struct panfrost_ptr vertex = 2806 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); 2807 2808 unsigned vertex_count = ctx->vertex_count; 2809 2810 unsigned min_index = 0, max_index = 0; 2811 mali_ptr indices = 0; 2812 2813 if (info->index_size) { 2814 indices = panfrost_get_index_buffer_bounded(batch, info, draw, 2815 &min_index, 2816 &max_index); 2817 2818 /* Use the corresponding values */ 2819 vertex_count = max_index - min_index + 1; 2820 ctx->offset_start = min_index + draw->index_bias; 2821 } else { 2822 ctx->offset_start = draw->start; 2823 } 2824 2825 if (info->instance_count > 1) 2826 ctx->padded_count = panfrost_padded_vertex_count(vertex_count); 2827 else 2828 ctx->padded_count = vertex_count; 2829 2830 panfrost_statistics_record(ctx, info, draw); 2831 2832 struct mali_invocation_packed invocation; 2833 if (info->instance_count > 1) { 2834 panfrost_pack_work_groups_compute(&invocation, 2835 1, vertex_count, info->instance_count, 2836 1, 1, 1, true, false); 2837 } else { 2838 pan_pack(&invocation, INVOCATION, cfg) { 2839 cfg.invocations = MALI_POSITIVE(vertex_count); 2840 cfg.size_y_shift = 0; 2841 cfg.size_z_shift = 0; 2842 cfg.workgroups_x_shift = 0; 2843 cfg.workgroups_y_shift = 0; 2844 cfg.workgroups_z_shift = 32; 2845 cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT; 2846 } 2847 } 2848 2849 /* Emit all sort of descriptors. */ 2850 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; 2851 2852 panfrost_emit_varying_descriptor(batch, 2853 ctx->padded_count * 2854 ctx->instance_count, 2855 &vs_vary, &fs_vary, &varyings, 2856 NULL, &pos, &psiz, 2857 info->mode == PIPE_PRIM_POINTS); 2858 2859 mali_ptr attribs, attrib_bufs; 2860 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); 2861 2862 panfrost_update_state_3d(batch); 2863 panfrost_update_state_vs(batch); 2864 panfrost_update_state_fs(batch); 2865 panfrost_clean_state_3d(ctx); 2866 2867 /* Fire off the draw itself */ 2868 panfrost_draw_emit_vertex(batch, info, &invocation, 2869 vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); 2870 panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, 2871 fs_vary, varyings, pos, psiz, tiler.cpu); 2872 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); 2873 2874 /* Increment transform feedback offsets */ 2875 panfrost_update_streamout_offsets(ctx); 2876} 2877 2878static void 2879panfrost_indirect_draw(struct panfrost_batch *batch, 2880 const struct pipe_draw_info *info, 2881 unsigned drawid_offset, 2882 const struct pipe_draw_indirect_info *indirect, 2883 const struct pipe_draw_start_count_bias *draw) 2884{ 2885 /* Indirect draw count and multi-draw not supported. */ 2886 assert(indirect->draw_count == 1 && !indirect->indirect_draw_count); 2887 2888 struct panfrost_context *ctx = batch->ctx; 2889 struct panfrost_device *dev = pan_device(ctx->base.screen); 2890 2891 /* TODO: update statistics (see panfrost_statistics_record()) */ 2892 /* TODO: Increment transform feedback offsets */ 2893 assert(ctx->streamout.num_targets == 0); 2894 2895 ctx->active_prim = info->mode; 2896 ctx->drawid = drawid_offset; 2897 ctx->indirect_draw = true; 2898 2899 struct panfrost_ptr tiler = 2900 pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); 2901 struct panfrost_ptr vertex = 2902 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); 2903 2904 struct panfrost_shader_state *vs = 2905 panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); 2906 2907 struct panfrost_bo *index_buf = NULL; 2908 2909 if (info->index_size) { 2910 assert(!info->has_user_indices); 2911 struct panfrost_resource *rsrc = pan_resource(info->index.resource); 2912 index_buf = rsrc->image.data.bo; 2913 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); 2914 } 2915 2916 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; 2917 unsigned varying_buf_count; 2918 2919 /* We want to create templates, set all count fields to 0 to reflect 2920 * that. 2921 */ 2922 ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0; 2923 ctx->offset_start = 0; 2924 2925 /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the 2926 * vertex shader uses gl_VertexID or gl_BaseVertex. 2927 */ 2928 ctx->first_vertex_sysval_ptr = 0; 2929 ctx->base_vertex_sysval_ptr = 0; 2930 ctx->base_instance_sysval_ptr = 0; 2931 2932 panfrost_update_state_3d(batch); 2933 panfrost_update_state_vs(batch); 2934 panfrost_update_state_fs(batch); 2935 panfrost_clean_state_3d(ctx); 2936 2937 bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS); 2938 2939 panfrost_emit_varying_descriptor(batch, 0, 2940 &vs_vary, &fs_vary, &varyings, 2941 &varying_buf_count, &pos, &psiz, 2942 point_coord_replace); 2943 2944 mali_ptr attribs, attrib_bufs; 2945 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); 2946 2947 /* Zero-ed invocation, the compute job will update it. */ 2948 static struct mali_invocation_packed invocation; 2949 2950 /* Fire off the draw itself */ 2951 panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings, 2952 attribs, attrib_bufs, vertex.cpu); 2953 panfrost_draw_emit_tiler(batch, info, draw, &invocation, 2954 index_buf ? index_buf->ptr.gpu : 0, 2955 fs_vary, varyings, pos, psiz, tiler.cpu); 2956 2957 /* Add the varying heap BO to the batch if we're allocating varyings. */ 2958 if (varyings) { 2959 panfrost_batch_add_bo(batch, 2960 dev->indirect_draw_shaders.varying_heap, 2961 PIPE_SHADER_VERTEX); 2962 } 2963 2964 assert(indirect->buffer); 2965 2966 struct panfrost_resource *draw_buf = pan_resource(indirect->buffer); 2967 2968 /* Don't count images: those attributes don't need to be patched. */ 2969 unsigned attrib_count = 2970 vs->info.attribute_count - 2971 util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]); 2972 2973 panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX); 2974 2975 struct pan_indirect_draw_info draw_info = { 2976 .last_indirect_draw = batch->indirect_draw_job_id, 2977 .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset, 2978 .index_buf = index_buf ? index_buf->ptr.gpu : 0, 2979 .first_vertex_sysval = ctx->first_vertex_sysval_ptr, 2980 .base_vertex_sysval = ctx->base_vertex_sysval_ptr, 2981 .base_instance_sysval = ctx->base_instance_sysval_ptr, 2982 .vertex_job = vertex.gpu, 2983 .tiler_job = tiler.gpu, 2984 .attrib_bufs = attrib_bufs, 2985 .attribs = attribs, 2986 .attrib_count = attrib_count, 2987 .varying_bufs = varyings, 2988 .index_size = info->index_size, 2989 }; 2990 2991 if (panfrost_writes_point_size(ctx)) 2992 draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE; 2993 2994 if (vs->info.vs.writes_point_size) 2995 draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ; 2996 2997 2998 if (info->primitive_restart) { 2999 draw_info.restart_index = info->restart_index; 3000 draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART; 3001 } 3002 3003 batch->indirect_draw_job_id = 3004 GENX(panfrost_emit_indirect_draw)(&batch->pool.base, 3005 &batch->scoreboard, 3006 &draw_info, 3007 &batch->indirect_draw_ctx); 3008 3009 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); 3010} 3011 3012static void 3013panfrost_draw_vbo(struct pipe_context *pipe, 3014 const struct pipe_draw_info *info, 3015 unsigned drawid_offset, 3016 const struct pipe_draw_indirect_info *indirect, 3017 const struct pipe_draw_start_count_bias *draws, 3018 unsigned num_draws) 3019{ 3020 struct panfrost_context *ctx = pan_context(pipe); 3021 struct panfrost_device *dev = pan_device(pipe->screen); 3022 3023 if (!panfrost_render_condition_check(ctx)) 3024 return; 3025 3026 /* Emulate indirect draws unless we're using the experimental path */ 3027 if (!(dev->debug & PAN_DBG_INDIRECT) && indirect && indirect->buffer) { 3028 assert(num_draws == 1); 3029 util_draw_indirect(pipe, info, indirect); 3030 return; 3031 } 3032 3033 /* Do some common setup */ 3034 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); 3035 3036 /* Don't add too many jobs to a single batch. Hardware has a hard limit 3037 * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to 3038 * avoid the risk of timeouts. This might not be a good idea. */ 3039 if (unlikely(batch->scoreboard.job_index > 10000)) 3040 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws"); 3041 3042 unsigned zs_draws = ctx->depth_stencil->draws; 3043 batch->draws |= zs_draws; 3044 batch->resolve |= zs_draws; 3045 3046 /* Mark everything dirty when debugging */ 3047 if (unlikely(dev->debug & PAN_DBG_DIRTY)) 3048 panfrost_dirty_state_all(ctx); 3049 3050 /* Conservatively assume draw parameters always change */ 3051 ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID; 3052 3053 if (indirect) { 3054 assert(num_draws == 1); 3055 3056 if (indirect->count_from_stream_output) { 3057 struct pipe_draw_start_count_bias tmp_draw = *draws; 3058 struct panfrost_streamout_target *so = 3059 pan_so_target(indirect->count_from_stream_output); 3060 3061 tmp_draw.start = 0; 3062 tmp_draw.count = so->offset; 3063 tmp_draw.index_bias = 0; 3064 panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw); 3065 return; 3066 } 3067 3068 panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]); 3069 return; 3070 } 3071 3072 struct pipe_draw_info tmp_info = *info; 3073 unsigned drawid = drawid_offset; 3074 3075 for (unsigned i = 0; i < num_draws; i++) { 3076 panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]); 3077 3078 if (tmp_info.increment_draw_id) { 3079 ctx->dirty |= PAN_DIRTY_DRAWID; 3080 drawid++; 3081 } 3082 } 3083 3084} 3085 3086/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we 3087 * construct the COMPUTE job and some of its payload. 3088 */ 3089 3090static void 3091panfrost_launch_grid(struct pipe_context *pipe, 3092 const struct pipe_grid_info *info) 3093{ 3094 struct panfrost_context *ctx = pan_context(pipe); 3095 3096 /* XXX - shouldn't be necessary with working memory barriers. Affected 3097 * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */ 3098 panfrost_flush_all_batches(ctx, "Launch grid pre-barrier"); 3099 3100 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); 3101 3102 struct panfrost_shader_state *cs = 3103 &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0]; 3104 3105 /* Indirect dispatch can't handle workgroup local storage since that 3106 * would require dynamic memory allocation. Bail in this case. */ 3107 if (info->indirect && !cs->info.wls_size) { 3108 struct pipe_transfer *transfer; 3109 uint32_t *params = pipe_buffer_map_range(pipe, info->indirect, 3110 info->indirect_offset, 3111 3 * sizeof(uint32_t), 3112 PIPE_MAP_READ, 3113 &transfer); 3114 3115 struct pipe_grid_info direct = *info; 3116 direct.indirect = NULL; 3117 direct.grid[0] = params[0]; 3118 direct.grid[1] = params[1]; 3119 direct.grid[2] = params[2]; 3120 pipe_buffer_unmap(pipe, transfer); 3121 3122 if (params[0] && params[1] && params[2]) 3123 panfrost_launch_grid(pipe, &direct); 3124 3125 return; 3126 } 3127 3128 ctx->compute_grid = info; 3129 3130 struct panfrost_ptr t = 3131 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); 3132 3133 /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so 3134 * reuse the graphics path for this by lowering to Gallium */ 3135 3136 struct pipe_constant_buffer ubuf = { 3137 .buffer = NULL, 3138 .buffer_offset = 0, 3139 .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem, 3140 .user_buffer = info->input 3141 }; 3142 3143 if (info->input) 3144 pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf); 3145 3146 /* Invoke according to the grid info */ 3147 3148 void *invocation = 3149 pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION); 3150 unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] }; 3151 3152 if (info->indirect) 3153 num_wg[0] = num_wg[1] = num_wg[2] = 1; 3154 3155 panfrost_pack_work_groups_compute(invocation, 3156 num_wg[0], num_wg[1], num_wg[2], 3157 info->block[0], info->block[1], 3158 info->block[2], 3159 false, info->indirect != NULL); 3160 3161 pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) { 3162 cfg.job_task_split = 3163 util_logbase2_ceil(info->block[0] + 1) + 3164 util_logbase2_ceil(info->block[1] + 1) + 3165 util_logbase2_ceil(info->block[2] + 1); 3166 } 3167 3168 pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) { 3169 cfg.draw_descriptor_is_64b = true; 3170 cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE); 3171 cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE); 3172 cfg.thread_storage = panfrost_emit_shared_memory(batch, info); 3173 cfg.uniform_buffers = panfrost_emit_const_buf(batch, 3174 PIPE_SHADER_COMPUTE, &cfg.push_uniforms); 3175 cfg.textures = panfrost_emit_texture_descriptors(batch, 3176 PIPE_SHADER_COMPUTE); 3177 cfg.samplers = panfrost_emit_sampler_descriptors(batch, 3178 PIPE_SHADER_COMPUTE); 3179 } 3180 3181 unsigned indirect_dep = 0; 3182 if (info->indirect) { 3183 struct pan_indirect_dispatch_info indirect = { 3184 .job = t.gpu, 3185 .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu + 3186 info->indirect_offset, 3187 .num_wg_sysval = { 3188 batch->num_wg_sysval[0], 3189 batch->num_wg_sysval[1], 3190 batch->num_wg_sysval[2], 3191 }, 3192 }; 3193 3194 indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base, 3195 &batch->scoreboard, 3196 &indirect); 3197 } 3198 3199 panfrost_add_job(&batch->pool.base, &batch->scoreboard, 3200 MALI_JOB_TYPE_COMPUTE, true, false, 3201 indirect_dep, 0, &t, false); 3202 panfrost_flush_all_batches(ctx, "Launch grid post-barrier"); 3203} 3204 3205static void * 3206panfrost_create_rasterizer_state( 3207 struct pipe_context *pctx, 3208 const struct pipe_rasterizer_state *cso) 3209{ 3210 struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer); 3211 3212 so->base = *cso; 3213 3214 /* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */ 3215 assert(cso->offset_clamp == 0.0); 3216 3217 pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) { 3218 cfg.multisample_enable = cso->multisample; 3219 cfg.fixed_function_near_discard = cso->depth_clip_near; 3220 cfg.fixed_function_far_discard = cso->depth_clip_far; 3221 cfg.shader_depth_range_fixed = true; 3222 } 3223 3224 pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) { 3225 cfg.depth_range_1 = cso->offset_tri; 3226 cfg.depth_range_2 = cso->offset_tri; 3227 cfg.single_sampled_lines = !cso->multisample; 3228 } 3229 3230 return so; 3231} 3232 3233/* Assigns a vertex buffer for a given (index, divisor) tuple */ 3234 3235static unsigned 3236pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers, 3237 unsigned *nr_bufs, 3238 unsigned vbi, 3239 unsigned divisor) 3240{ 3241 /* Look up the buffer */ 3242 for (unsigned i = 0; i < (*nr_bufs); ++i) { 3243 if (buffers[i].vbi == vbi && buffers[i].divisor == divisor) 3244 return i; 3245 } 3246 3247 /* Else, create a new buffer */ 3248 unsigned idx = (*nr_bufs)++; 3249 3250 buffers[idx] = (struct pan_vertex_buffer) { 3251 .vbi = vbi, 3252 .divisor = divisor 3253 }; 3254 3255 return idx; 3256} 3257 3258static void * 3259panfrost_create_vertex_elements_state( 3260 struct pipe_context *pctx, 3261 unsigned num_elements, 3262 const struct pipe_vertex_element *elements) 3263{ 3264 struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state); 3265 struct panfrost_device *dev = pan_device(pctx->screen); 3266 3267 so->num_elements = num_elements; 3268 memcpy(so->pipe, elements, sizeof(*elements) * num_elements); 3269 3270 /* Assign attribute buffers corresponding to the vertex buffers, keyed 3271 * for a particular divisor since that's how instancing works on Mali */ 3272 for (unsigned i = 0; i < num_elements; ++i) { 3273 so->element_buffer[i] = pan_assign_vertex_buffer( 3274 so->buffers, &so->nr_bufs, 3275 elements[i].vertex_buffer_index, 3276 elements[i].instance_divisor); 3277 } 3278 3279 for (int i = 0; i < num_elements; ++i) { 3280 enum pipe_format fmt = elements[i].src_format; 3281 const struct util_format_description *desc = util_format_description(fmt); 3282 so->formats[i] = dev->formats[desc->format].hw; 3283 assert(so->formats[i]); 3284 } 3285 3286 /* Let's also prepare vertex builtins */ 3287 so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; 3288 so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw; 3289 3290 return so; 3291} 3292 3293static inline unsigned 3294pan_pipe_to_stencil_op(enum pipe_stencil_op in) 3295{ 3296 switch (in) { 3297 case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP; 3298 case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO; 3299 case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE; 3300 case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT; 3301 case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT; 3302 case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP; 3303 case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP; 3304 case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT; 3305 default: unreachable("Invalid stencil op"); 3306 } 3307} 3308 3309static inline void 3310pan_pipe_to_stencil(const struct pipe_stencil_state *in, 3311 struct mali_stencil_packed *out) 3312{ 3313 pan_pack(out, STENCIL, s) { 3314 s.mask = in->valuemask; 3315 s.compare_function = (enum mali_func) in->func; 3316 s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op); 3317 s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op); 3318 s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op); 3319 } 3320} 3321 3322static void * 3323panfrost_create_depth_stencil_state(struct pipe_context *pipe, 3324 const struct pipe_depth_stencil_alpha_state *zsa) 3325{ 3326 struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state); 3327 so->base = *zsa; 3328 3329 /* Normalize (there's no separate enable) */ 3330 if (!zsa->alpha_enabled) 3331 so->base.alpha_func = MALI_FUNC_ALWAYS; 3332 3333 /* Prepack relevant parts of the Renderer State Descriptor. They will 3334 * be ORed in at draw-time */ 3335 pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) { 3336 cfg.depth_function = zsa->depth_enabled ? 3337 (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS; 3338 3339 cfg.depth_write_mask = zsa->depth_writemask; 3340 } 3341 3342 pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) { 3343 cfg.stencil_enable = zsa->stencil[0].enabled; 3344 3345 cfg.stencil_mask_front = zsa->stencil[0].writemask; 3346 cfg.stencil_mask_back = zsa->stencil[1].enabled ? 3347 zsa->stencil[1].writemask : zsa->stencil[0].writemask; 3348 3349#if PAN_ARCH <= 5 3350 cfg.alpha_test_compare_function = 3351 (enum mali_func) so->base.alpha_func; 3352#endif 3353 } 3354 3355 /* Stencil tests have their own words in the RSD */ 3356 pan_pipe_to_stencil(&zsa->stencil[0], &so->stencil_front); 3357 3358 if (zsa->stencil[1].enabled) 3359 pan_pipe_to_stencil(&zsa->stencil[1], &so->stencil_back); 3360 else 3361 so->stencil_back = so->stencil_front; 3362 3363 so->enabled = zsa->stencil[0].enabled || 3364 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS); 3365 3366 /* Write masks need tracking together */ 3367 if (zsa->depth_writemask) 3368 so->draws |= PIPE_CLEAR_DEPTH; 3369 3370 if (zsa->stencil[0].enabled) 3371 so->draws |= PIPE_CLEAR_STENCIL; 3372 3373 /* TODO: Bounds test should be easy */ 3374 assert(!zsa->depth_bounds_test); 3375 3376 return so; 3377} 3378 3379static struct pipe_sampler_view * 3380panfrost_create_sampler_view( 3381 struct pipe_context *pctx, 3382 struct pipe_resource *texture, 3383 const struct pipe_sampler_view *template) 3384{ 3385 struct panfrost_context *ctx = pan_context(pctx); 3386 struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view); 3387 3388 pan_legalize_afbc_format(ctx, pan_resource(texture), template->format); 3389 3390 pipe_reference(NULL, &texture->reference); 3391 3392 so->base = *template; 3393 so->base.texture = texture; 3394 so->base.reference.count = 1; 3395 so->base.context = pctx; 3396 3397 panfrost_create_sampler_view_bo(so, pctx, texture); 3398 3399 return (struct pipe_sampler_view *) so; 3400} 3401 3402/* A given Gallium blend state can be encoded to the hardware in numerous, 3403 * dramatically divergent ways due to the interactions of blending with 3404 * framebuffer formats. Conceptually, there are two modes: 3405 * 3406 * - Fixed-function blending (for suitable framebuffer formats, suitable blend 3407 * state, and suitable blend constant) 3408 * 3409 * - Blend shaders (for everything else) 3410 * 3411 * A given Gallium blend configuration will compile to exactly one 3412 * fixed-function blend state, if it compiles to any, although the constant 3413 * will vary across runs as that is tracked outside of the Gallium CSO. 3414 * 3415 * However, that same blend configuration will compile to many different blend 3416 * shaders, depending on the framebuffer formats active. The rationale is that 3417 * blend shaders override not just fixed-function blending but also 3418 * fixed-function format conversion, so blend shaders are keyed to a particular 3419 * framebuffer format. As an example, the tilebuffer format is identical for 3420 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require 3421 * blend shaders. 3422 * 3423 * All of this state is encapsulated in the panfrost_blend_state struct 3424 * (our subclass of pipe_blend_state). 3425 */ 3426 3427/* Create a blend CSO. Essentially, try to compile a fixed-function 3428 * expression and initialize blend shaders */ 3429 3430static void * 3431panfrost_create_blend_state(struct pipe_context *pipe, 3432 const struct pipe_blend_state *blend) 3433{ 3434 struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state); 3435 so->base = *blend; 3436 3437 so->pan.logicop_enable = blend->logicop_enable; 3438 so->pan.logicop_func = blend->logicop_func; 3439 so->pan.rt_count = blend->max_rt + 1; 3440 3441 for (unsigned c = 0; c < so->pan.rt_count; ++c) { 3442 unsigned g = blend->independent_blend_enable ? c : 0; 3443 const struct pipe_rt_blend_state pipe = blend->rt[g]; 3444 struct pan_blend_equation equation = {0}; 3445 3446 equation.color_mask = pipe.colormask; 3447 equation.blend_enable = pipe.blend_enable; 3448 3449 if (pipe.blend_enable) { 3450 equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func); 3451 equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor); 3452 equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor); 3453 equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor); 3454 equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor); 3455 equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func); 3456 equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor); 3457 equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor); 3458 equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor); 3459 equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor); 3460 } 3461 3462 /* Determine some common properties */ 3463 unsigned constant_mask = pan_blend_constant_mask(equation); 3464 const bool supports_2src = pan_blend_supports_2src(PAN_ARCH); 3465 so->info[c] = (struct pan_blend_info) { 3466 .no_colour = (equation.color_mask == 0), 3467 .opaque = pan_blend_is_opaque(equation), 3468 .constant_mask = constant_mask, 3469 3470 /* TODO: check the dest for the logicop */ 3471 .load_dest = blend->logicop_enable || 3472 pan_blend_reads_dest(equation), 3473 3474 /* Could this possibly be fixed-function? */ 3475 .fixed_function = !blend->logicop_enable && 3476 pan_blend_can_fixed_function(equation, 3477 supports_2src) && 3478 (!constant_mask || 3479 pan_blend_supports_constant(PAN_ARCH, c)) 3480 }; 3481 3482 so->pan.rts[c].equation = equation; 3483 3484 /* Bifrost needs to know if any render target loads its 3485 * destination in the hot draw path, so precompute this */ 3486 if (so->info[c].load_dest) 3487 so->load_dest_mask |= BITFIELD_BIT(c); 3488 3489 /* Converting equations to Mali style is expensive, do it at 3490 * CSO create time instead of draw-time */ 3491 if (so->info[c].fixed_function) { 3492 so->equation[c] = pan_pack_blend(equation); 3493 } 3494 } 3495 3496 return so; 3497} 3498 3499static void 3500prepare_rsd(struct panfrost_shader_state *state, 3501 struct panfrost_pool *pool, bool upload) 3502{ 3503 struct mali_renderer_state_packed *out = 3504 (struct mali_renderer_state_packed *)&state->partial_rsd; 3505 3506 if (upload) { 3507 struct panfrost_ptr ptr = 3508 pan_pool_alloc_desc(&pool->base, RENDERER_STATE); 3509 3510 state->state = panfrost_pool_take_ref(pool, ptr.gpu); 3511 out = ptr.cpu; 3512 } 3513 3514 pan_pack(out, RENDERER_STATE, cfg) { 3515 pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg); 3516 } 3517} 3518 3519static void 3520panfrost_get_sample_position(struct pipe_context *context, 3521 unsigned sample_count, 3522 unsigned sample_index, 3523 float *out_value) 3524{ 3525 panfrost_query_sample_position( 3526 panfrost_sample_pattern(sample_count), 3527 sample_index, 3528 out_value); 3529} 3530 3531static void 3532screen_destroy(struct pipe_screen *pscreen) 3533{ 3534 struct panfrost_device *dev = pan_device(pscreen); 3535 GENX(panfrost_cleanup_indirect_draw_shaders)(dev); 3536 GENX(pan_indirect_dispatch_cleanup)(dev); 3537 GENX(pan_blitter_cleanup)(dev); 3538} 3539 3540static void 3541preload(struct panfrost_batch *batch, struct pan_fb_info *fb) 3542{ 3543 GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu, 3544 PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL); 3545} 3546 3547static void 3548init_batch(struct panfrost_batch *batch) 3549{ 3550 /* Reserve the framebuffer and local storage descriptors */ 3551 batch->framebuffer = 3552#if PAN_ARCH == 4 3553 pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER); 3554#else 3555 pan_pool_alloc_desc_aggregate(&batch->pool.base, 3556 PAN_DESC(FRAMEBUFFER), 3557 PAN_DESC(ZS_CRC_EXTENSION), 3558 PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET)); 3559 3560 batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD; 3561#endif 3562 3563#if PAN_ARCH >= 6 3564 batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); 3565#else 3566 /* On Midgard, the TLS is embedded in the FB descriptor */ 3567 batch->tls = batch->framebuffer; 3568#endif 3569} 3570 3571static void 3572panfrost_sampler_view_destroy( 3573 struct pipe_context *pctx, 3574 struct pipe_sampler_view *pview) 3575{ 3576 struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview; 3577 3578 pipe_resource_reference(&pview->texture, NULL); 3579 panfrost_bo_unreference(view->state.bo); 3580 ralloc_free(view); 3581} 3582 3583static void 3584context_init(struct pipe_context *pipe) 3585{ 3586 pipe->draw_vbo = panfrost_draw_vbo; 3587 pipe->launch_grid = panfrost_launch_grid; 3588 3589 pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state; 3590 pipe->create_rasterizer_state = panfrost_create_rasterizer_state; 3591 pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state; 3592 pipe->create_sampler_view = panfrost_create_sampler_view; 3593 pipe->sampler_view_destroy = panfrost_sampler_view_destroy; 3594 pipe->create_sampler_state = panfrost_create_sampler_state; 3595 pipe->create_blend_state = panfrost_create_blend_state; 3596 3597 pipe->get_sample_position = panfrost_get_sample_position; 3598} 3599 3600#if PAN_ARCH <= 5 3601 3602/* Returns the polygon list's GPU address if available, or otherwise allocates 3603 * the polygon list. It's perfectly fast to use allocate/free BO directly, 3604 * since we'll hit the BO cache and this is one-per-batch anyway. */ 3605 3606static mali_ptr 3607batch_get_polygon_list(struct panfrost_batch *batch) 3608{ 3609 struct panfrost_device *dev = pan_device(batch->ctx->base.screen); 3610 3611 if (!batch->tiler_ctx.midgard.polygon_list) { 3612 bool has_draws = batch->scoreboard.first_tiler != NULL; 3613 unsigned size = 3614 panfrost_tiler_get_polygon_list_size(dev, 3615 batch->key.width, 3616 batch->key.height, 3617 has_draws); 3618 size = util_next_power_of_two(size); 3619 3620 /* Create the BO as invisible if we can. In the non-hierarchical tiler case, 3621 * we need to write the polygon list manually because there's not WRITE_VALUE 3622 * job in the chain (maybe we should add one...). */ 3623 bool init_polygon_list = !has_draws && (dev->quirks & MIDGARD_NO_HIER_TILING); 3624 batch->tiler_ctx.midgard.polygon_list = 3625 panfrost_batch_create_bo(batch, size, 3626 init_polygon_list ? 0 : PAN_BO_INVISIBLE, 3627 PIPE_SHADER_VERTEX, 3628 "Polygon list"); 3629 panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list, 3630 PIPE_SHADER_FRAGMENT); 3631 3632 if (init_polygon_list) { 3633 assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); 3634 uint32_t *polygon_list_body = 3635 batch->tiler_ctx.midgard.polygon_list->ptr.cpu + 3636 MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; 3637 3638 /* Magic for Mali T720 */ 3639 polygon_list_body[0] = 0xa0000000; 3640 } 3641 3642 batch->tiler_ctx.midgard.disable = !has_draws; 3643 } 3644 3645 return batch->tiler_ctx.midgard.polygon_list->ptr.gpu; 3646} 3647#endif 3648 3649static void 3650init_polygon_list(struct panfrost_batch *batch) 3651{ 3652#if PAN_ARCH <= 5 3653 mali_ptr polygon_list = batch_get_polygon_list(batch); 3654 panfrost_scoreboard_initialize_tiler(&batch->pool.base, 3655 &batch->scoreboard, 3656 polygon_list); 3657#endif 3658} 3659 3660void 3661GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) 3662{ 3663 struct panfrost_device *dev = &screen->dev; 3664 3665 screen->vtbl.prepare_rsd = prepare_rsd; 3666 screen->vtbl.emit_tls = emit_tls; 3667 screen->vtbl.emit_fbd = emit_fbd; 3668 screen->vtbl.emit_fragment_job = emit_fragment_job; 3669 screen->vtbl.screen_destroy = screen_destroy; 3670 screen->vtbl.preload = preload; 3671 screen->vtbl.context_init = context_init; 3672 screen->vtbl.init_batch = init_batch; 3673 screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked); 3674 screen->vtbl.init_polygon_list = init_polygon_list; 3675 screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); 3676 screen->vtbl.compile_shader = GENX(pan_shader_compile); 3677 3678 GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, 3679 &screen->blitter.desc_pool.base); 3680 GENX(pan_indirect_dispatch_init)(dev); 3681 GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base); 3682} 3683