v3d_compiler.h revision 7ec681f3
/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef V3D_COMPILER_H
#define V3D_COMPILER_H

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "util/macros.h"
#include "common/v3d_debug.h"
#include "common/v3d_device_info.h"
#include "common/v3d_limits.h"
#include "compiler/nir/nir.h"
#include "util/list.h"
#include "util/u_math.h"

#include "qpu/qpu_instr.h"
#include "pipe/p_state.h"

/**
 * Maximum number of outstanding TMU operations we can queue for execution.
 *
 * This is mostly limited by the size of the TMU fifos. The Input and Config
 * fifos can stall, but we prefer that to injecting TMU flushes manually
 * in the driver, so we can ignore these, but we can't overflow the Output
 * fifo, which has 16 / threads per-thread entries, meaning that the maximum
 * number of outstanding LDTMUs we can ever have is 8, for a 2-way threaded
 * shader. This means that at most we can have 8 outstanding TMU loads, if
 * each load is just one component.
 *
 * NOTE: we could actually have a larger value here because TMU stores don't
 * consume any entries in the Output fifo (so we could have any number of
 * outstanding stores) and the driver keeps track of used Output fifo entries
 * and will flush if we ever need more than 8, but since loads are much more
 * common than stores, it is probably not worth it.
 */
#define MAX_TMU_QUEUE_SIZE 8

/**
 * Maximum offset distance in bytes between two consecutive constant UBO loads
 * for the same UBO where we would favor updating the unifa address by emitting
 * dummy ldunifa instructions to avoid writing the unifa register.
 */
#define MAX_UNIFA_SKIP_DISTANCE 16

struct nir_builder;

struct v3d_fs_inputs {
        /**
         * Array of the meanings of the VPM inputs this shader needs.
         *
         * It doesn't include those that aren't part of the VPM, like
         * point/line coordinates.
         */
        struct v3d_varying_slot *input_slots;
        uint32_t num_inputs;
};

enum qfile {
        /** An unused source or destination register. */
        QFILE_NULL,

        /** A physical register, such as the W coordinate payload. */
        QFILE_REG,
        /** One of the registers for fixed function interactions. */
        QFILE_MAGIC,

        /**
         * A virtual register, that will be allocated to actual accumulator
         * or physical registers later.
         */
        QFILE_TEMP,

        /**
         * VPM reads use this with an index value to say what part of the VPM
         * is being read.
         */
        QFILE_VPM,

        /**
         * Stores an immediate value in the index field that will be used
         * directly by qpu_load_imm().
         */
        QFILE_LOAD_IMM,

        /**
         * Stores an immediate value in the index field that can be turned
         * into a small immediate field by qpu_encode_small_immediate().
         */
        QFILE_SMALL_IMM,
};

/**
 * A reference to a QPU register or a virtual temp register.
 */
struct qreg {
        enum qfile file;
        uint32_t index;
};

static inline struct qreg vir_reg(enum qfile file, uint32_t index)
{
        return (struct qreg){file, index};
}

static inline struct qreg vir_magic_reg(uint32_t index)
{
        return (struct qreg){QFILE_MAGIC, index};
}

static inline struct qreg vir_nop_reg(void)
{
        return (struct qreg){QFILE_NULL, 0};
}

/**
 * A reference to an actual register at the QPU level, for register
 * allocation.
 */
struct qpu_reg {
        bool magic;
        bool smimm;
        int index;
};

struct qinst {
        /** Entry in qblock->instructions */
        struct list_head link;

        /**
         * The instruction being wrapped. Its condition codes, pack flags,
         * signals, etc. will all be used, with just the register references
         * being replaced by the contents of qinst->dst and qinst->src[].
         */
        struct v3d_qpu_instr qpu;

        /* Pre-register-allocation references to src/dst registers */
        struct qreg dst;
        struct qreg src[3];
        bool is_last_thrsw;

        /* If the instruction reads a uniform (other than through src[i].file
         * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0
         * otherwise.
         */
        int uniform;
};

enum quniform_contents {
        /**
         * Indicates that a constant 32-bit value is copied from the program's
         * uniform contents.
         */
        QUNIFORM_CONSTANT,
        /**
         * Indicates that the program's uniform contents are used as an index
         * into the GL uniform storage.
         */
        QUNIFORM_UNIFORM,

        /** @{
         * Scaling factors from clip coordinates to relative to the viewport
         * center.
         *
         * This is used by the coordinate and vertex shaders to produce the
         * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
         * point offsets from the viewport center.
         */
        QUNIFORM_VIEWPORT_X_SCALE,
        QUNIFORM_VIEWPORT_Y_SCALE,
        /** @} */

        QUNIFORM_VIEWPORT_Z_OFFSET,
        QUNIFORM_VIEWPORT_Z_SCALE,

        QUNIFORM_USER_CLIP_PLANE,

        /**
         * A reference to a V3D 3.x texture config parameter 0 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * defines texture type, miplevels, and such. It will be found as a
         * parameter to the first QOP_TEX_[STRB] instruction in a sequence.
         */
        QUNIFORM_TEXTURE_CONFIG_P0_0,
        QUNIFORM_TEXTURE_CONFIG_P0_1,
        QUNIFORM_TEXTURE_CONFIG_P0_2,
        QUNIFORM_TEXTURE_CONFIG_P0_3,
        QUNIFORM_TEXTURE_CONFIG_P0_4,
        QUNIFORM_TEXTURE_CONFIG_P0_5,
        QUNIFORM_TEXTURE_CONFIG_P0_6,
        QUNIFORM_TEXTURE_CONFIG_P0_7,
        QUNIFORM_TEXTURE_CONFIG_P0_8,
        QUNIFORM_TEXTURE_CONFIG_P0_9,
        QUNIFORM_TEXTURE_CONFIG_P0_10,
        QUNIFORM_TEXTURE_CONFIG_P0_11,
        QUNIFORM_TEXTURE_CONFIG_P0_12,
        QUNIFORM_TEXTURE_CONFIG_P0_13,
        QUNIFORM_TEXTURE_CONFIG_P0_14,
        QUNIFORM_TEXTURE_CONFIG_P0_15,
        QUNIFORM_TEXTURE_CONFIG_P0_16,
        QUNIFORM_TEXTURE_CONFIG_P0_17,
        QUNIFORM_TEXTURE_CONFIG_P0_18,
        QUNIFORM_TEXTURE_CONFIG_P0_19,
        QUNIFORM_TEXTURE_CONFIG_P0_20,
        QUNIFORM_TEXTURE_CONFIG_P0_21,
        QUNIFORM_TEXTURE_CONFIG_P0_22,
        QUNIFORM_TEXTURE_CONFIG_P0_23,
        QUNIFORM_TEXTURE_CONFIG_P0_24,
        QUNIFORM_TEXTURE_CONFIG_P0_25,
        QUNIFORM_TEXTURE_CONFIG_P0_26,
        QUNIFORM_TEXTURE_CONFIG_P0_27,
        QUNIFORM_TEXTURE_CONFIG_P0_28,
        QUNIFORM_TEXTURE_CONFIG_P0_29,
        QUNIFORM_TEXTURE_CONFIG_P0_30,
        QUNIFORM_TEXTURE_CONFIG_P0_31,
        QUNIFORM_TEXTURE_CONFIG_P0_32,

        /**
         * A reference to a V3D 3.x texture config parameter 1 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * has the pointer to the indirect texture state. Our data[] field
         * will have a packed p1 value, but the address field will be just
         * which texture unit's texture should be referenced.
         */
        QUNIFORM_TEXTURE_CONFIG_P1,

        /* A V3D 4.x texture config parameter. The high 8 bits will be
         * which texture or sampler is being sampled, and the driver must
         * replace the address field with the appropriate address.
         */
        QUNIFORM_TMU_CONFIG_P0,
        QUNIFORM_TMU_CONFIG_P1,

        QUNIFORM_IMAGE_TMU_CONFIG_P0,

        QUNIFORM_TEXTURE_FIRST_LEVEL,

        QUNIFORM_TEXTURE_WIDTH,
        QUNIFORM_TEXTURE_HEIGHT,
        QUNIFORM_TEXTURE_DEPTH,
        QUNIFORM_TEXTURE_ARRAY_SIZE,
        QUNIFORM_TEXTURE_LEVELS,
        QUNIFORM_TEXTURE_SAMPLES,

        QUNIFORM_UBO_ADDR,

        QUNIFORM_TEXRECT_SCALE_X,
        QUNIFORM_TEXRECT_SCALE_Y,

        /* Returns the base offset of the SSBO given by the data value. */
        QUNIFORM_SSBO_OFFSET,

        /* Returns the size of the SSBO or UBO given by the data value. */
        QUNIFORM_GET_SSBO_SIZE,
        QUNIFORM_GET_UBO_SIZE,

        /* Sizes (in pixels) of a shader image given by the data value. */
        QUNIFORM_IMAGE_WIDTH,
        QUNIFORM_IMAGE_HEIGHT,
        QUNIFORM_IMAGE_DEPTH,
        QUNIFORM_IMAGE_ARRAY_SIZE,

        QUNIFORM_LINE_WIDTH,

        /* The line width sent to hardware. This includes the expanded width
         * when anti-aliasing is enabled.
         */
        QUNIFORM_AA_LINE_WIDTH,

        /* Number of workgroups passed to glDispatchCompute in the dimension
         * selected by the data value.
         */
        QUNIFORM_NUM_WORK_GROUPS,

        /* Base workgroup offset passed to vkCmdDispatchBase in the dimension
         * selected by the data value.
         */
        QUNIFORM_WORK_GROUP_BASE,

        /**
         * Returns the offset of the scratch buffer for register spilling.
         */
        QUNIFORM_SPILL_OFFSET,
        QUNIFORM_SPILL_SIZE_PER_THREAD,

        /**
         * Returns the offset of the shared memory for compute shaders.
         *
         * This will be accessed using TMU general memory operations, so the
         * L2T cache will effectively be the shared memory area.
         */
        QUNIFORM_SHARED_OFFSET,

        /**
         * Returns the number of layers in the framebuffer.
         *
         * This is used to cap gl_Layer in geometry shaders to avoid
         * out-of-bounds accesses into the tile state during binning.
         */
        QUNIFORM_FB_LAYERS,

        /**
         * Current value of gl_ViewIndex for Multiview rendering.
         */
        QUNIFORM_VIEW_INDEX,
};

static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
{
        assert(value < (1 << 24));
        return unit << 24 | value;
}

static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
{
        return data >> 24;
}

static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
{
        return data & 0xffffff;
}

struct v3d_varying_slot {
        uint8_t slot_and_component;
};

static inline struct v3d_varying_slot
v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
{
        assert(slot < 255 / 4);
        return (struct v3d_varying_slot){ (slot << 2) + component };
}

static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
{
        return slot.slot_and_component >> 2;
}

static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
{
        return slot.slot_and_component & 3;
}

enum v3d_execution_environment {
        V3D_ENVIRONMENT_OPENGL = 0,
        V3D_ENVIRONMENT_VULKAN,
};

struct v3d_key {
        void *shader_state;
        struct {
                uint8_t swizzle[4];
        } tex[V3D_MAX_TEXTURE_SAMPLERS];
        struct {
                uint8_t return_size;
                uint8_t return_channels;
        } sampler[V3D_MAX_TEXTURE_SAMPLERS];

        uint8_t num_tex_used;
        uint8_t num_samplers_used;
        uint8_t ucp_enables;
        bool is_last_geometry_stage;
        bool robust_buffer_access;

        enum v3d_execution_environment environment;
};

struct v3d_fs_key {
        struct v3d_key base;
        bool is_points;
        bool is_lines;
        bool line_smoothing;
        bool point_coord_upper_left;
        bool msaa;
        bool sample_coverage;
        bool sample_alpha_to_coverage;
        bool sample_alpha_to_one;
        /* Mask of which color render targets are present. */
        uint8_t cbufs;
        uint8_t swap_color_rb;
        /* Mask of which render targets need to be written as 32-bit floats */
        uint8_t f32_color_rb;
        /* Masks of which render targets need to be written as ints/uints.
         * Used by gallium to work around lost information in TGSI.
         */
        uint8_t int_color_rb;
        uint8_t uint_color_rb;

        /* Color format information per render target. Only set when logic
         * operations are enabled.
         */
        struct {
                enum pipe_format format;
                const uint8_t *swizzle;
        } color_fmt[V3D_MAX_DRAW_BUFFERS];

        uint8_t logicop_func;
        uint32_t point_sprite_mask;

        struct pipe_rt_blend_state blend;

        /* If the fragment shader reads gl_PrimitiveID then we have 2
         * scenarios:
         *
         * - If there is a geometry shader, then gl_PrimitiveID must be written
         *   by it and the fragment shader loads it as a regular explicit input
         *   varying. This is the only valid use case in GLES 3.1.
         *
         * - If there is not a geometry shader (allowed since GLES 3.2 and
         *   Vulkan 1.0), then gl_PrimitiveID must be implicitly written by
         *   hardware and is considered an implicit input varying in the
         *   fragment shader.
         */
        bool has_gs;
};

struct v3d_gs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
        uint8_t num_used_outputs;

        bool is_coord;
        bool per_vertex_point_size;
};

struct v3d_vs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS];
        uint8_t num_used_outputs;

        /* A bit-mask indicating if we need to swap the R/B channels for
         * vertex attributes. Since the hardware doesn't provide any
         * means to swizzle vertex attributes we need to do it in the shader.
         */
        uint32_t va_swap_rb_mask;

        bool is_coord;
        bool per_vertex_point_size;
        bool clamp_color;
};

/** A basic block of VIR instructions. */
struct qblock {
        struct list_head link;

        struct list_head instructions;

        struct set *predecessors;
        struct qblock *successors[2];

        int index;

        /* Instruction IPs for the first and last instruction of the block.
         * Set by qpu_schedule.c.
         */
        uint32_t start_qpu_ip;
        uint32_t end_qpu_ip;

        /* Instruction IP for the branch instruction of the block. Set by
         * qpu_schedule.c.
         */
        uint32_t branch_qpu_ip;

        /** Offset within the uniform stream at the start of the block. */
        uint32_t start_uniform;
        /** Offset within the uniform stream of the branch instruction */
        uint32_t branch_uniform;

        /**
         * Has the terminating branch of this block already been emitted
         * by a break or continue?
         */
        bool branch_emitted;

        /** @{ used by v3d_vir_live_variables.c */
        BITSET_WORD *def;
        BITSET_WORD *defin;
        BITSET_WORD *defout;
        BITSET_WORD *use;
        BITSET_WORD *live_in;
        BITSET_WORD *live_out;
        int start_ip, end_ip;
        /** @} */
};

/** Which util/list.h add mode we should use when inserting an instruction. */
enum vir_cursor_mode {
        vir_cursor_add,
        vir_cursor_addtail,
};

/**
 * Tracking structure for where new instructions should be inserted. Create
 * with one of the vir_after_inst()-style helper functions.
 *
 * This does not protect against removal of the block or instruction, so we
 * have an assert in instruction removal to try to catch it.
 */
struct vir_cursor {
        enum vir_cursor_mode mode;
        struct list_head *link;
};

static inline struct vir_cursor
vir_before_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_addtail, &inst->link };
}

static inline struct vir_cursor
vir_after_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_add, &inst->link };
}

static inline struct vir_cursor
vir_before_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_add, &block->instructions };
}

static inline struct vir_cursor
vir_after_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_addtail, &block->instructions };
}

enum v3d_compilation_result {
        V3D_COMPILATION_SUCCEEDED,
        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION,
        V3D_COMPILATION_FAILED,
};

/**
 * Compiler state saved across compiler invocations, for any expensive global
 * setup.
 */
struct v3d_compiler {
        const struct v3d_device_info *devinfo;
        struct ra_regs *regs;
        struct ra_class *reg_class_any[3];
        struct ra_class *reg_class_r5[3];
        struct ra_class *reg_class_phys[3];
        struct ra_class *reg_class_phys_or_acc[3];
};

/**
 * This holds partially interpolated inputs as provided by hardware
 * (The Vp = A*(x - x0) + B*(y - y0) term), as well as the C coefficient
 * required to compute the final interpolated value.
 */
struct v3d_interp_input {
        struct qreg vp;
        struct qreg C;
        unsigned mode; /* interpolation mode */
};

struct v3d_compile {
        const struct v3d_device_info *devinfo;
        nir_shader *s;
        nir_function_impl *impl;
        struct exec_list *cf_node_list;
        const struct v3d_compiler *compiler;

        void (*debug_output)(const char *msg,
                             void *debug_output_data);
        void *debug_output_data;

        /**
         * Mapping from nir_register * or nir_ssa_def * to array of struct
         * qreg for the values.
         */
        struct hash_table *def_ht;

        /* For each temp, the instruction generating its value. */
        struct qinst **defs;
        uint32_t defs_array_size;

        /* TMU pipelining tracking */
        struct {
                /* NIR registers that have been updated with a TMU operation
                 * that has not been flushed yet.
                 */
                struct set *outstanding_regs;

                uint32_t output_fifo_size;

                struct {
                        nir_dest *dest;
                        uint8_t num_components;
                        uint8_t component_mask;
                } flush[MAX_TMU_QUEUE_SIZE];
                uint32_t flush_count;
        } tmu;

        /**
         * Inputs to the shader, arranged by TGSI declaration order.
         *
         * Not all fragment shader QFILE_VARY reads are present in this array.
         */
        struct qreg *inputs;
        /**
         * Partially interpolated inputs to the shader.
         */
        struct v3d_interp_input *interp;
        struct qreg *outputs;
        bool msaa_per_sample_output;
        struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        uint32_t inputs_array_size;
        uint32_t outputs_array_size;
        uint32_t uniforms_array_size;

        /* Booleans for whether the corresponding QFILE_VARY[i] is
         * flat-shaded. This includes gl_FragColor flat-shading, which is
         * customized based on the shademodel_flat shader key.
         */
        uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        bool uses_center_w;
        bool writes_z;
        bool uses_implicit_point_line_varyings;

        /* True if a fragment shader reads gl_PrimitiveID */
        bool fs_uses_primitive_id;

        /* If the fragment shader does anything that requires to force
         * per-sample MSAA, such as reading gl_SampleID.
         */
        bool force_per_sample_msaa;

        /* Whether we are using the fallback scheduler. This will be set after
         * register allocation has failed once.
         */
        bool fallback_scheduler;

        /* Disable TMU pipelining. This may increase the chances of being able
         * to compile shaders with high register pressure that require to emit
         * TMU spills.
         */
        bool disable_tmu_pipelining;
        bool pipelined_any_tmu;

        /* Disable sorting of UBO loads with constant offset. This may
         * increase the chances of being able to compile shaders with high
         * register pressure.
         */
        bool disable_constant_ubo_load_sorting;
        bool sorted_any_ubo_loads;

        /* Emits ldunif for each new uniform, even if the uniform was already
         * emitted in the same block. Useful to compile shaders with high
         * register pressure or to disable the optimization during uniform
         * spills.
         */
        bool disable_ldunif_opt;

        /* Disables loop unrolling to reduce register pressure. */
        bool disable_loop_unrolling;
        bool unrolled_any_loops;

        /* Minimum number of threads we are willing to use to register allocate
         * a shader with the current compilation strategy. This only prevents
         * us from lowering the thread count to register allocate successfully,
         * which can be useful when we prefer doing other changes to the
         * compilation strategy before dropping thread count.
         */
        uint32_t min_threads_for_reg_alloc;

        /* Whether TMU spills are allowed. If this is disabled it may cause
         * register allocation to fail. We set this to favor other compilation
         * strategies that can reduce register pressure and hopefully reduce or
         * eliminate TMU spills in the shader.
         */
        bool tmu_spilling_allowed;

        /* The UBO index and block used with the last unifa load, as well as the
         * current unifa offset *after* emitting that load. This is used to skip
         * unifa writes (and their 3 delay slots) when the next UBO load reads
         * right after the previous one in the same block.
         */
        struct qblock *current_unifa_block;
        int32_t current_unifa_index;
        uint32_t current_unifa_offset;

        /* State for whether we're executing on each channel currently. 0 if
         * yes, otherwise a block number + 1 that the channel jumped to.
         */
        struct qreg execute;
        bool in_control_flow;

        struct qreg line_x, point_x, point_y, primitive_id;

        /**
         * Instance ID, which comes in before the vertex attribute payload if
         * the shader record requests it.
         */
        struct qreg iid;

        /**
         * Base Instance ID, which comes in before the vertex attribute payload
         * (after Instance ID) if the shader record requests it.
         */
        struct qreg biid;

        /**
         * Vertex ID, which comes in before the vertex attribute payload
         * (after Base Instance) if the shader record requests it.
         */
        struct qreg vid;

        /* Fragment shader payload regs. */
        struct qreg payload_w, payload_w_centroid, payload_z;

        struct qreg cs_payload[2];
        struct qreg cs_shared_offset;
        int local_invocation_index_bits;

        /* If the shader uses subgroup functionality */
        bool has_subgroups;

        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
        uint32_t vpm_output_size;

        /* Size in bytes of registers that have been spilled. This is how much
         * space needs to be available in the spill BO per thread per QPU.
         */
        uint32_t spill_size;
        /* Shader-db stats */
        uint32_t spills, fills, loops;
        /**
         * Register spilling's per-thread base address, shared between each
         * spill/fill's addressing calculations.
         */
        struct qreg spill_base;
        /* Bit vector of which temps may be spilled */
        BITSET_WORD *spillable;

        /**
         * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
         *
         * This includes those that aren't part of the VPM varyings, like
         * point/line coordinates.
         */
        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /**
         * An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
         * of the output is. Used to emit from the VS in the order that the
         * FS needs.
         */
        struct v3d_varying_slot *output_slots;

        struct pipe_shader_state *shader_state;
        struct v3d_key *key;
        struct v3d_fs_key *fs_key;
        struct v3d_gs_key *gs_key;
        struct v3d_vs_key *vs_key;

        /* Live ranges of temps. */
        int *temp_start, *temp_end;
        bool live_intervals_valid;

        uint32_t *uniform_data;
        enum quniform_contents *uniform_contents;
        uint32_t uniform_array_size;
        uint32_t num_uniforms;
        uint32_t output_position_index;
        nir_variable *output_color_var[4];
        uint32_t output_sample_mask_index;

        struct qreg undef;
        uint32_t num_temps;

        struct vir_cursor cursor;
        struct list_head blocks;
        int next_block_index;
        struct qblock *cur_block;
        struct qblock *loop_cont_block;
        struct qblock *loop_break_block;
        /**
         * Which temp, if any, do we currently have in the flags?
         * This is set when processing a comparison instruction, and
         * reset to -1 by anything else that touches the flags.
         */
        int32_t flags_temp;
        enum v3d_qpu_cond flags_cond;

        uint64_t *qpu_insts;
        uint32_t qpu_inst_count;
        uint32_t qpu_inst_size;
        uint32_t qpu_inst_stalled_count;
        uint32_t nop_count;

        /* For the FS, the number of varying inputs not counting the
         * point/line varyings payload
         */
        uint32_t num_inputs;

        uint32_t program_id;
        uint32_t variant_id;

        /* Set to compile program in 1x, 2x, or 4x threaded mode, where
         * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
         * limiting ourselves to the part of the physical reg space.
         *
         * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On
         * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
         * physical reg space in half.
         */
        uint8_t threads;
        struct qinst *last_thrsw;
        bool last_thrsw_at_top_level;

        bool emitted_tlb_load;
        bool lock_scoreboard_on_first_thrsw;

        /* Total number of spilled registers in the program */
        uint32_t spill_count;

        enum v3d_compilation_result compilation_result;

        bool tmu_dirty_rcl;
};

struct v3d_uniform_list {
        enum quniform_contents *contents;
        uint32_t *data;
        uint32_t count;
};

struct v3d_prog_data {
        struct v3d_uniform_list uniforms;

        uint32_t spill_size;

        uint8_t threads;

        /* For threads > 1, whether the program should be dispatched in the
         * after-final-THRSW state.
         */
        bool single_seg;

        bool tmu_dirty_rcl;

        bool has_control_barrier;
};

struct v3d_vs_prog_data {
        struct v3d_prog_data base;

        bool uses_iid, uses_biid, uses_vid;

        /* Number of components read from each vertex attribute. */
        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];

        /* Total number of components read, for the shader state record. */
        uint32_t vpm_input_size;

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Set if there should be separate VPM segments for input and output.
         * If unset, vpm_input_size will be 0.
         */
        bool separate_segments;

        /* Value to be programmed in VCM_CACHE_SIZE. */
        uint8_t vcm_cache_size;

        /* Maps the nir->data.location to its
         * nir->data.driver_location. In general we are using the
         * driver location as index (like vattr_sizes above), so this
         * map is useful when what we have is the location
         *
         * Returns -1 if the location is not used
         */
        int32_t driver_location_map[V3D_MAX_VS_INPUTS];
};

struct v3d_gs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveIDIn */
        bool uses_pid;

        /* Number of components read from each input varying. */
        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];

        /* Number of inputs */
        uint8_t num_inputs;
        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Maximum SIMD dispatch width to not exceed VPM output size limits
         * in the geometry shader. Notice that the final dispatch width has to
         * be decided at draw time and could be lower based on the VPM pressure
         * added by other shader stages.
         */
        uint8_t simd_width;

        /* Output primitive type */
        uint8_t out_prim_type;

        /* Number of GS invocations */
        uint8_t num_invocations;

        bool writes_psiz;
};

struct v3d_fs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveID */
        bool uses_pid;

        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /* Array of flat shade flags.
         *
         * Each entry is only 24 bits (high 8 bits 0), to match the hardware
         * packet layout.
         */
        uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint8_t num_inputs;
        bool writes_z;
        bool disable_ez;
        bool uses_center_w;
        bool uses_implicit_point_line_varyings;
        bool lock_scoreboard_on_first_thrsw;
        bool force_per_sample_msaa;
};

struct v3d_compute_prog_data {
        struct v3d_prog_data base;
        /* Size in bytes of the workgroup's shared space. */
        uint32_t shared_size;
        uint16_t local_size[3];
        /* If the shader uses subgroup functionality */
        bool has_subgroups;
};

struct vpm_config {
        uint32_t As;
        uint32_t Vc;
        uint32_t Gs;
        uint32_t Gd;
        uint32_t Gv;
        uint32_t Ve;
        uint32_t gs_width;
};

bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg);

static inline bool
vir_has_uniform(struct qinst *inst)
{
        return inst->uniform != ~0;
}

const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);

uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size);

uint32_t v3d_prog_data_size(gl_shader_stage stage);
void v3d_nir_to_vir(struct v3d_compile *c);

void vir_compile_destroy(struct v3d_compile *c);
const char *vir_get_stage_name(struct v3d_compile *c);
struct qblock *vir_new_block(struct v3d_compile *c);
void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
struct qblock *vir_entry_block(struct v3d_compile *c);
struct qblock *vir_exit_block(struct v3d_compile *c);
struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_branch_inst(struct v3d_compile *c,
                              enum v3d_qpu_branch_cond cond);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
uint32_t vir_get_uniform_index(struct v3d_compile *c,
                               enum quniform_contents contents,
                               uint32_t data);
struct qreg vir_uniform(struct v3d_compile *c,
                        enum quniform_contents contents,
                        uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
void v3d_setup_spill_base(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);

struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
enum v3d_qpu_cond vir_get_cond(struct qinst *inst);
void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf);
void vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf);
void vir_set_unpack(struct qinst *inst, int src,
                    enum v3d_qpu_input_unpack unpack);
void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack);

struct qreg vir_get_temp(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                    struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
                               uint32_t component_mask);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);

void vir_dump(struct v3d_compile *c);
void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
void vir_dump_uniform(enum quniform_contents contents, uint32_t data);

void vir_validate(struct v3d_compile *c);

void vir_optimize(struct v3d_compile *c);
bool vir_opt_algebraic(struct v3d_compile *c);
bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_line_smooth(nir_shader *shader);
void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
void v3d_nir_lower_scratch(nir_shader *s);
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_image_load_store(nir_shader *s);
void vir_lower_uniforms(struct v3d_compile *c);

void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
                                     nir_intrinsic_instr *instr);

void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
bool vir_init_reg_sets(struct v3d_compiler *compiler);

int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);

bool v3d_gl_format_is_return_32(GLenum format);

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);

static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
{
        return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 &&
                contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 +
                            V3D_MAX_TEXTURE_SAMPLERS));
}

static inline bool
vir_in_nonuniform_control_flow(struct v3d_compile *c)
{
        return c->execute.file != QFILE_NULL;
}

static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
        return vir_uniform(c, QUNIFORM_CONSTANT, ui);
}

static inline struct qreg
vir_uniform_f(struct
v3d_compile *c, float f) 1150{ 1151 return vir_uniform(c, QUNIFORM_CONSTANT, fui(f)); 1152} 1153 1154#define VIR_ALU0(name, vir_inst, op) \ 1155static inline struct qreg \ 1156vir_##name(struct v3d_compile *c) \ 1157{ \ 1158 return vir_emit_def(c, vir_inst(op, c->undef, \ 1159 c->undef, c->undef)); \ 1160} \ 1161static inline struct qinst * \ 1162vir_##name##_dest(struct v3d_compile *c, struct qreg dest) \ 1163{ \ 1164 return vir_emit_nondef(c, vir_inst(op, dest, \ 1165 c->undef, c->undef)); \ 1166} 1167 1168#define VIR_ALU1(name, vir_inst, op) \ 1169static inline struct qreg \ 1170vir_##name(struct v3d_compile *c, struct qreg a) \ 1171{ \ 1172 return vir_emit_def(c, vir_inst(op, c->undef, \ 1173 a, c->undef)); \ 1174} \ 1175static inline struct qinst * \ 1176vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ 1177 struct qreg a) \ 1178{ \ 1179 return vir_emit_nondef(c, vir_inst(op, dest, a, \ 1180 c->undef)); \ 1181} 1182 1183#define VIR_ALU2(name, vir_inst, op) \ 1184static inline struct qreg \ 1185vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ 1186{ \ 1187 return vir_emit_def(c, vir_inst(op, c->undef, a, b)); \ 1188} \ 1189static inline struct qinst * \ 1190vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ 1191 struct qreg a, struct qreg b) \ 1192{ \ 1193 return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \ 1194} 1195 1196#define VIR_NODST_0(name, vir_inst, op) \ 1197static inline struct qinst * \ 1198vir_##name(struct v3d_compile *c) \ 1199{ \ 1200 return vir_emit_nondef(c, vir_inst(op, c->undef, \ 1201 c->undef, c->undef)); \ 1202} 1203 1204#define VIR_NODST_1(name, vir_inst, op) \ 1205static inline struct qinst * \ 1206vir_##name(struct v3d_compile *c, struct qreg a) \ 1207{ \ 1208 return vir_emit_nondef(c, vir_inst(op, c->undef, \ 1209 a, c->undef)); \ 1210} 1211 1212#define VIR_NODST_2(name, vir_inst, op) \ 1213static inline struct qinst * \ 1214vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ 1215{ 
\ 1216 return vir_emit_nondef(c, vir_inst(op, c->undef, \ 1217 a, b)); \ 1218} 1219 1220#define VIR_SFU(name) \ 1221static inline struct qreg \ 1222vir_##name(struct v3d_compile *c, struct qreg a) \ 1223{ \ 1224 if (c->devinfo->ver >= 41) { \ 1225 return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ 1226 c->undef, \ 1227 a, c->undef)); \ 1228 } else { \ 1229 vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ 1230 return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ 1231 } \ 1232} \ 1233static inline struct qinst * \ 1234vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ 1235 struct qreg a) \ 1236{ \ 1237 if (c->devinfo->ver >= 41) { \ 1238 return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ 1239 dest, \ 1240 a, c->undef)); \ 1241 } else { \ 1242 vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ 1243 return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ 1244 } \ 1245} 1246 1247#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) 1248#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name) 1249#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name) 1250#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name) 1251#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name) 1252#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name) 1253#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name) 1254#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name) 1255#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name) 1256#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name) 1257#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name) 1258 1259VIR_A_ALU2(FADD) 1260VIR_A_ALU2(VFPACK) 1261VIR_A_ALU2(FSUB) 1262VIR_A_ALU2(FMIN) 1263VIR_A_ALU2(FMAX) 1264 1265VIR_A_ALU2(ADD) 1266VIR_A_ALU2(SUB) 
1267VIR_A_ALU2(SHL) 1268VIR_A_ALU2(SHR) 1269VIR_A_ALU2(ASR) 1270VIR_A_ALU2(ROR) 1271VIR_A_ALU2(MIN) 1272VIR_A_ALU2(MAX) 1273VIR_A_ALU2(UMIN) 1274VIR_A_ALU2(UMAX) 1275VIR_A_ALU2(AND) 1276VIR_A_ALU2(OR) 1277VIR_A_ALU2(XOR) 1278VIR_A_ALU2(VADD) 1279VIR_A_ALU2(VSUB) 1280VIR_A_NODST_2(STVPMV) 1281VIR_A_NODST_2(STVPMD) 1282VIR_A_ALU1(NOT) 1283VIR_A_ALU1(NEG) 1284VIR_A_ALU1(FLAPUSH) 1285VIR_A_ALU1(FLBPUSH) 1286VIR_A_ALU1(FLPOP) 1287VIR_A_ALU0(FLAFIRST) 1288VIR_A_ALU0(FLNAFIRST) 1289VIR_A_ALU1(SETMSF) 1290VIR_A_ALU1(SETREVF) 1291VIR_A_ALU0(TIDX) 1292VIR_A_ALU0(EIDX) 1293VIR_A_ALU1(LDVPMV_IN) 1294VIR_A_ALU1(LDVPMV_OUT) 1295VIR_A_ALU1(LDVPMD_IN) 1296VIR_A_ALU1(LDVPMD_OUT) 1297VIR_A_ALU2(LDVPMG_IN) 1298VIR_A_ALU2(LDVPMG_OUT) 1299VIR_A_ALU0(TMUWT) 1300 1301VIR_A_ALU0(IID) 1302VIR_A_ALU0(FXCD) 1303VIR_A_ALU0(XCD) 1304VIR_A_ALU0(FYCD) 1305VIR_A_ALU0(YCD) 1306VIR_A_ALU0(MSF) 1307VIR_A_ALU0(REVF) 1308VIR_A_ALU0(BARRIERID) 1309VIR_A_ALU0(SAMPID) 1310VIR_A_NODST_1(VPMSETUP) 1311VIR_A_NODST_0(VPMWT) 1312VIR_A_ALU2(FCMP) 1313VIR_A_ALU2(VFMAX) 1314 1315VIR_A_ALU1(FROUND) 1316VIR_A_ALU1(FTOIN) 1317VIR_A_ALU1(FTRUNC) 1318VIR_A_ALU1(FTOIZ) 1319VIR_A_ALU1(FFLOOR) 1320VIR_A_ALU1(FTOUZ) 1321VIR_A_ALU1(FCEIL) 1322VIR_A_ALU1(FTOC) 1323 1324VIR_A_ALU1(FDX) 1325VIR_A_ALU1(FDY) 1326 1327VIR_A_ALU1(ITOF) 1328VIR_A_ALU1(CLZ) 1329VIR_A_ALU1(UTOF) 1330 1331VIR_M_ALU2(UMUL24) 1332VIR_M_ALU2(FMUL) 1333VIR_M_ALU2(SMUL24) 1334VIR_M_NODST_2(MULTOP) 1335 1336VIR_M_ALU1(MOV) 1337VIR_M_ALU1(FMOV) 1338 1339VIR_SFU(RECIP) 1340VIR_SFU(RSQRT) 1341VIR_SFU(EXP) 1342VIR_SFU(LOG) 1343VIR_SFU(SIN) 1344VIR_SFU(RSQRT2) 1345 1346static inline struct qinst * 1347vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, 1348 struct qreg dest, struct qreg src) 1349{ 1350 struct qinst *mov = vir_MOV_dest(c, dest, src); 1351 vir_set_cond(mov, cond); 1352 return mov; 1353} 1354 1355static inline struct qreg 1356vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond, 1357 struct qreg src0, struct qreg src1) 1358{ 1359 struct 
qreg t = vir_get_temp(c); 1360 vir_MOV_dest(c, t, src1); 1361 vir_MOV_cond(c, cond, t, src0); 1362 return t; 1363} 1364 1365static inline struct qinst * 1366vir_NOP(struct v3d_compile *c) 1367{ 1368 return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP, 1369 c->undef, c->undef, c->undef)); 1370} 1371 1372static inline struct qreg 1373vir_LDTMU(struct v3d_compile *c) 1374{ 1375 if (c->devinfo->ver >= 41) { 1376 struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, 1377 c->undef, c->undef); 1378 ldtmu->qpu.sig.ldtmu = true; 1379 1380 return vir_emit_def(c, ldtmu); 1381 } else { 1382 vir_NOP(c)->qpu.sig.ldtmu = true; 1383 return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); 1384 } 1385} 1386 1387static inline struct qreg 1388vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1) 1389{ 1390 vir_MULTOP(c, src0, src1); 1391 return vir_UMUL24(c, src0, src1); 1392} 1393 1394static inline struct qreg 1395vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) 1396{ 1397 assert(c->devinfo->ver >= 41); /* XXX */ 1398 assert((config & 0xffffff00) == 0xffffff00); 1399 1400 struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, 1401 c->undef, c->undef); 1402 ldtlb->qpu.sig.ldtlbu = true; 1403 ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config); 1404 return vir_emit_def(c, ldtlb); 1405} 1406 1407static inline struct qreg 1408vir_TLB_COLOR_READ(struct v3d_compile *c) 1409{ 1410 assert(c->devinfo->ver >= 41); /* XXX */ 1411 1412 struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, 1413 c->undef, c->undef); 1414 ldtlb->qpu.sig.ldtlb = true; 1415 return vir_emit_def(c, ldtlb); 1416} 1417 1418static inline struct qinst * 1419vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) 1420{ 1421 /* The actual uniform_data value will be set at scheduling time */ 1422 return vir_emit_nondef(c, vir_branch_inst(c, cond)); 1423} 1424 1425#define vir_for_each_block(block, c) \ 1426 list_for_each_entry(struct qblock, block, 
&c->blocks, link) 1427 1428#define vir_for_each_block_rev(block, c) \ 1429 list_for_each_entry_rev(struct qblock, block, &c->blocks, link) 1430 1431/* Loop over the non-NULL members of the successors array. */ 1432#define vir_for_each_successor(succ, block) \ 1433 for (struct qblock *succ = block->successors[0]; \ 1434 succ != NULL; \ 1435 succ = (succ == block->successors[1] ? NULL : \ 1436 block->successors[1])) 1437 1438#define vir_for_each_inst(inst, block) \ 1439 list_for_each_entry(struct qinst, inst, &block->instructions, link) 1440 1441#define vir_for_each_inst_rev(inst, block) \ 1442 list_for_each_entry_rev(struct qinst, inst, &block->instructions, link) 1443 1444#define vir_for_each_inst_safe(inst, block) \ 1445 list_for_each_entry_safe(struct qinst, inst, &block->instructions, link) 1446 1447#define vir_for_each_inst_inorder(inst, c) \ 1448 vir_for_each_block(_block, c) \ 1449 vir_for_each_inst(inst, _block) 1450 1451#define vir_for_each_inst_inorder_safe(inst, c) \ 1452 vir_for_each_block(_block, c) \ 1453 vir_for_each_inst_safe(inst, _block) 1454 1455#endif /* V3D_COMPILER_H */ 1456