v3d_compiler.h revision 7ec681f3
1/*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#ifndef V3D_COMPILER_H
25#define V3D_COMPILER_H
26
27#include <assert.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <stdbool.h>
31#include <stdint.h>
32#include <string.h>
33
34#include "util/macros.h"
35#include "common/v3d_debug.h"
36#include "common/v3d_device_info.h"
37#include "common/v3d_limits.h"
38#include "compiler/nir/nir.h"
39#include "util/list.h"
40#include "util/u_math.h"
41
42#include "qpu/qpu_instr.h"
43#include "pipe/p_state.h"
44
45/**
46 * Maximum number of outstanding TMU operations we can queue for execution.
47 *
48 * This is mostly limited by the size of the TMU fifos. The Input and Config
49 * fifos can stall, but we prefer that to injecting TMU flushes manually in
50 * the driver, so we can ignore those; however, we must not overflow the
51 * Output fifo, which has 16 / threads per-thread entries. This means the
52 * maximum number of outstanding LDTMUs we can ever have is 8, for a 2-way
53 * threaded shader, i.e. at most 8 outstanding TMU loads if each load is
54 * just one component.
55 *
56 * NOTE: we could actually have a larger value here because TMU stores don't
57 * consume any entries in the Output fifo (so we could have any number of
58 * outstanding stores) and the driver keeps track of used Output fifo entries
59 * and will flush if we ever need more than 8, but since loads are much more
60 * common than stores, it is probably not worth it.
61 */
62#define MAX_TMU_QUEUE_SIZE 8
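
/*
 * A minimal sketch of how this budget is expected to be respected when
 * emitting TMU reads, using the helpers declared further down in this
 * header (the instruction/dest names are illustrative):
 *
 *    if (ntq_tmu_fifo_overflow(c, num_components))
 *            ntq_flush_tmu(c);
 *    ...emit the TMU sequence for the load...
 *    ntq_add_pending_tmu_flush(c, &instr->dest, component_mask);
 */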
63
64/**
65 * Maximum offset distance in bytes between two consecutive constant UBO loads
66 * for the same UBO where we would favor updating the unifa address by emitting
67 * dummy ldunifa instructions to avoid writing the unifa register.
68 */
69#define MAX_UNIFA_SKIP_DISTANCE 16
70
71struct nir_builder;
72
73struct v3d_fs_inputs {
74        /**
75         * Array of the meanings of the VPM inputs this shader needs.
76         *
77         * It doesn't include those that aren't part of the VPM, like
78         * point/line coordinates.
79         */
80        struct v3d_varying_slot *input_slots;
81        uint32_t num_inputs;
82};
83
84enum qfile {
85        /** An unused source or destination register. */
86        QFILE_NULL,
87
88        /** A physical register, such as the W coordinate payload. */
89        QFILE_REG,
90        /** One of the registers for fixed function interactions. */
91        QFILE_MAGIC,
92
93        /**
94         * A virtual register that will be allocated to an actual
95         * accumulator or physical register later.
96         */
97        QFILE_TEMP,
98
99        /**
100         * VPM reads use this with an index value to say what part of the VPM
101         * is being read.
102         */
103        QFILE_VPM,
104
105        /**
106         * Stores an immediate value in the index field that will be used
107         * directly by qpu_load_imm().
108         */
109        QFILE_LOAD_IMM,
110
111        /**
112         * Stores an immediate value in the index field that can be turned
113         * into a small immediate field by qpu_encode_small_immediate().
114         */
115        QFILE_SMALL_IMM,
116};
117
118/**
119 * A reference to a QPU register or a virtual temp register.
120 */
121struct qreg {
122        enum qfile file;
123        uint32_t index;
124};
125
126static inline struct qreg vir_reg(enum qfile file, uint32_t index)
127{
128        return (struct qreg){file, index};
129}
130
131static inline struct qreg vir_magic_reg(uint32_t index)
132{
133        return (struct qreg){QFILE_MAGIC, index};
134}
135
136static inline struct qreg vir_nop_reg(void)
137{
138        return (struct qreg){QFILE_NULL, 0};
139}
140
141/**
142 * A reference to an actual register at the QPU level, for register
143 * allocation.
144 */
145struct qpu_reg {
146        bool magic;
147        bool smimm;
148        int index;
149};
150
151struct qinst {
152        /** Entry in qblock->instructions */
153        struct list_head link;
154
155        /**
156         * The instruction being wrapped.  Its condition codes, pack flags,
157         * signals, etc. will all be used, with just the register references
158         * being replaced by the contents of qinst->dst and qinst->src[].
159         */
160        struct v3d_qpu_instr qpu;
161
162        /* Pre-register-allocation references to src/dst registers */
163        struct qreg dst;
164        struct qreg src[3];
165        bool is_last_thrsw;
166
167        /* If the instruction reads a uniform (other than through src[i].file
168         * == QFILE_UNIF), that uniform's index in c->uniform_contents.  ~0
169         * otherwise.
170         */
171        int uniform;
172};
173
174enum quniform_contents {
175        /**
176         * Indicates that a constant 32-bit value is copied from the program's
177         * uniform contents.
178         */
179        QUNIFORM_CONSTANT,
180        /**
181         * Indicates that the program's uniform contents are used as an index
182         * into the GL uniform storage.
183         */
184        QUNIFORM_UNIFORM,
185
186        /** @{
187         * Scaling factors from clip coordinates to coordinates relative
188         * to the viewport center.
189         *
190         * This is used by the coordinate and vertex shaders to produce the
191         * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
192         * point offsets from the viewport center.
193         */
194        QUNIFORM_VIEWPORT_X_SCALE,
195        QUNIFORM_VIEWPORT_Y_SCALE,
196        /** @} */
197
198        QUNIFORM_VIEWPORT_Z_OFFSET,
199        QUNIFORM_VIEWPORT_Z_SCALE,
200
201        QUNIFORM_USER_CLIP_PLANE,
202
203        /**
204         * A reference to a V3D 3.x texture config parameter 0 uniform.
205         *
206         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
207         * defines texture type, miplevels, and such.  It will be found as a
208         * parameter to the first QOP_TEX_[STRB] instruction in a sequence.
209         */
210        QUNIFORM_TEXTURE_CONFIG_P0_0,
211        QUNIFORM_TEXTURE_CONFIG_P0_1,
212        QUNIFORM_TEXTURE_CONFIG_P0_2,
213        QUNIFORM_TEXTURE_CONFIG_P0_3,
214        QUNIFORM_TEXTURE_CONFIG_P0_4,
215        QUNIFORM_TEXTURE_CONFIG_P0_5,
216        QUNIFORM_TEXTURE_CONFIG_P0_6,
217        QUNIFORM_TEXTURE_CONFIG_P0_7,
218        QUNIFORM_TEXTURE_CONFIG_P0_8,
219        QUNIFORM_TEXTURE_CONFIG_P0_9,
220        QUNIFORM_TEXTURE_CONFIG_P0_10,
221        QUNIFORM_TEXTURE_CONFIG_P0_11,
222        QUNIFORM_TEXTURE_CONFIG_P0_12,
223        QUNIFORM_TEXTURE_CONFIG_P0_13,
224        QUNIFORM_TEXTURE_CONFIG_P0_14,
225        QUNIFORM_TEXTURE_CONFIG_P0_15,
226        QUNIFORM_TEXTURE_CONFIG_P0_16,
227        QUNIFORM_TEXTURE_CONFIG_P0_17,
228        QUNIFORM_TEXTURE_CONFIG_P0_18,
229        QUNIFORM_TEXTURE_CONFIG_P0_19,
230        QUNIFORM_TEXTURE_CONFIG_P0_20,
231        QUNIFORM_TEXTURE_CONFIG_P0_21,
232        QUNIFORM_TEXTURE_CONFIG_P0_22,
233        QUNIFORM_TEXTURE_CONFIG_P0_23,
234        QUNIFORM_TEXTURE_CONFIG_P0_24,
235        QUNIFORM_TEXTURE_CONFIG_P0_25,
236        QUNIFORM_TEXTURE_CONFIG_P0_26,
237        QUNIFORM_TEXTURE_CONFIG_P0_27,
238        QUNIFORM_TEXTURE_CONFIG_P0_28,
239        QUNIFORM_TEXTURE_CONFIG_P0_29,
240        QUNIFORM_TEXTURE_CONFIG_P0_30,
241        QUNIFORM_TEXTURE_CONFIG_P0_31,
242        QUNIFORM_TEXTURE_CONFIG_P0_32,
243
244        /**
245         * A reference to a V3D 3.x texture config parameter 1 uniform.
246         *
247         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
248         * has the pointer to the indirect texture state.  Our data[] field
249         * will have a packed p1 value, but the address field will be just
250         * which texture unit's texture should be referenced.
251         */
252        QUNIFORM_TEXTURE_CONFIG_P1,
253
254        /* A V3D 4.x texture config parameter.  The high 8 bits will be
255         * which texture or sampler is being sampled, and the driver must
256         * replace the address field with the appropriate address.
257         */
258        QUNIFORM_TMU_CONFIG_P0,
259        QUNIFORM_TMU_CONFIG_P1,
260
261        QUNIFORM_IMAGE_TMU_CONFIG_P0,
262
263        QUNIFORM_TEXTURE_FIRST_LEVEL,
264
265        QUNIFORM_TEXTURE_WIDTH,
266        QUNIFORM_TEXTURE_HEIGHT,
267        QUNIFORM_TEXTURE_DEPTH,
268        QUNIFORM_TEXTURE_ARRAY_SIZE,
269        QUNIFORM_TEXTURE_LEVELS,
270        QUNIFORM_TEXTURE_SAMPLES,
271
272        QUNIFORM_UBO_ADDR,
273
274        QUNIFORM_TEXRECT_SCALE_X,
275        QUNIFORM_TEXRECT_SCALE_Y,
276
277        /* Returns the base offset of the SSBO given by the data value. */
278        QUNIFORM_SSBO_OFFSET,
279
280        /* Returns the size of the SSBO or UBO given by the data value. */
281        QUNIFORM_GET_SSBO_SIZE,
282        QUNIFORM_GET_UBO_SIZE,
283
284        /* Sizes (in pixels) of a shader image given by the data value. */
285        QUNIFORM_IMAGE_WIDTH,
286        QUNIFORM_IMAGE_HEIGHT,
287        QUNIFORM_IMAGE_DEPTH,
288        QUNIFORM_IMAGE_ARRAY_SIZE,
289
290        QUNIFORM_LINE_WIDTH,
291
292        /* The line width sent to hardware. This includes the expanded width
293         * when anti-aliasing is enabled.
294         */
295        QUNIFORM_AA_LINE_WIDTH,
296
297        /* Number of workgroups passed to glDispatchCompute in the dimension
298         * selected by the data value.
299         */
300        QUNIFORM_NUM_WORK_GROUPS,
301
302        /* Base workgroup offset passed to vkCmdDispatchBase in the dimension
303         * selected by the data value.
304         */
305        QUNIFORM_WORK_GROUP_BASE,
306
307        /**
308         * Returns the offset of the scratch buffer for register spilling.
309         */
310        QUNIFORM_SPILL_OFFSET,
311        QUNIFORM_SPILL_SIZE_PER_THREAD,
312
313        /**
314         * Returns the offset of the shared memory for compute shaders.
315         *
316         * This will be accessed using TMU general memory operations, so the
317         * L2T cache will effectively be the shared memory area.
318         */
319        QUNIFORM_SHARED_OFFSET,
320
321        /**
322         * Returns the number of layers in the framebuffer.
323         *
324         * This is used to cap gl_Layer in geometry shaders to avoid
325         * out-of-bounds accesses into the tile state during binning.
326         */
327        QUNIFORM_FB_LAYERS,
328
329        /**
330         * Current value of gl_ViewIndex for Multiview rendering.
331         */
332        QUNIFORM_VIEW_INDEX,
333};
334
335static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
336{
337        assert(value < (1 << 24));
338        return unit << 24 | value;
339}
340
341static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
342{
343        return data >> 24;
344}
345
346static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
347{
348        return data & 0xffffff;
349}
350
351struct v3d_varying_slot {
352        uint8_t slot_and_component;
353};
354
355static inline struct v3d_varying_slot
356v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
357{
358        assert(slot < 255 / 4);
359        return (struct v3d_varying_slot){ (slot << 2) + component };
360}
361
362static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
363{
364        return slot.slot_and_component >> 2;
365}
366
367static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
368{
369        return slot.slot_and_component & 3;
370}
371
372enum v3d_execution_environment {
373   V3D_ENVIRONMENT_OPENGL = 0,
374   V3D_ENVIRONMENT_VULKAN,
375};
376
377struct v3d_key {
378        void *shader_state;
379        struct {
380                uint8_t swizzle[4];
381        } tex[V3D_MAX_TEXTURE_SAMPLERS];
382        struct {
383                uint8_t return_size;
384                uint8_t return_channels;
385        } sampler[V3D_MAX_TEXTURE_SAMPLERS];
386
387        uint8_t num_tex_used;
388        uint8_t num_samplers_used;
389        uint8_t ucp_enables;
390        bool is_last_geometry_stage;
391        bool robust_buffer_access;
392
393        enum v3d_execution_environment environment;
394};
395
396struct v3d_fs_key {
397        struct v3d_key base;
398        bool is_points;
399        bool is_lines;
400        bool line_smoothing;
401        bool point_coord_upper_left;
402        bool msaa;
403        bool sample_coverage;
404        bool sample_alpha_to_coverage;
405        bool sample_alpha_to_one;
406        /* Mask of which color render targets are present. */
407        uint8_t cbufs;
408        uint8_t swap_color_rb;
409        /* Mask of which render targets need to be written as 32-bit floats */
410        uint8_t f32_color_rb;
411        /* Masks of which render targets need to be written as ints/uints.
412         * Used by gallium to work around lost information in TGSI.
413         */
414        uint8_t int_color_rb;
415        uint8_t uint_color_rb;
416
417        /* Color format information per render target. Only set when logic
418         * operations are enabled.
419         */
420        struct {
421                enum pipe_format format;
422                const uint8_t *swizzle;
423        } color_fmt[V3D_MAX_DRAW_BUFFERS];
424
425        uint8_t logicop_func;
426        uint32_t point_sprite_mask;
427
428        struct pipe_rt_blend_state blend;
429
430        /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
431         *
432         * - If there is a geometry shader, then gl_PrimitiveID must be written
433         *   by it and the fragment shader loads it as a regular explicit input
434         *   varying. This is the only valid use case in GLES 3.1.
435         *
436         * - If there is no geometry shader (allowed since GLES 3.2 and
437         *   Vulkan 1.0), then gl_PrimitiveID must be implicitly written by
438         *   hardware and is considered an implicit input varying in the
439         *   fragment shader.
440         */
441        bool has_gs;
442};
443
444struct v3d_gs_key {
445        struct v3d_key base;
446
447        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
448        uint8_t num_used_outputs;
449
450        bool is_coord;
451        bool per_vertex_point_size;
452};
453
454struct v3d_vs_key {
455        struct v3d_key base;
456
457        struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS];
458        uint8_t num_used_outputs;
459
460        /* A bit-mask indicating if we need to swap the R/B channels for
461         * vertex attributes. Since the hardware doesn't provide any
462         * means to swizzle vertex attributes we need to do it in the shader.
463         */
464        uint32_t va_swap_rb_mask;
465
466        bool is_coord;
467        bool per_vertex_point_size;
468        bool clamp_color;
469};
470
471/** A basic block of VIR instructions. */
472struct qblock {
473        struct list_head link;
474
475        struct list_head instructions;
476
477        struct set *predecessors;
478        struct qblock *successors[2];
479
480        int index;
481
482        /* Instruction IPs for the first and last instruction of the block.
483         * Set by qpu_schedule.c.
484         */
485        uint32_t start_qpu_ip;
486        uint32_t end_qpu_ip;
487
488        /* Instruction IP for the branch instruction of the block.  Set by
489         * qpu_schedule.c.
490         */
491        uint32_t branch_qpu_ip;
492
493        /** Offset within the uniform stream at the start of the block. */
494        uint32_t start_uniform;
495        /** Offset within the uniform stream of the branch instruction */
496        /** Offset within the uniform stream of the branch instruction. */
497
498        /**
499         * Has the terminating branch of this block already been emitted
500         * by a break or continue?
501         */
502        bool branch_emitted;
503
504        /** @{ used by v3d_vir_live_variables.c */
505        BITSET_WORD *def;
506        BITSET_WORD *defin;
507        BITSET_WORD *defout;
508        BITSET_WORD *use;
509        BITSET_WORD *live_in;
510        BITSET_WORD *live_out;
511        int start_ip, end_ip;
512        /** @} */
513};
514
515/** Which util/list.h add mode we should use when inserting an instruction. */
516enum vir_cursor_mode {
517        vir_cursor_add,
518        vir_cursor_addtail,
519};
520
521/**
522 * Tracking structure for where new instructions should be inserted.  Create
523 * with one of the vir_after_inst()-style helper functions.
524 *
525 * This does not protect against removal of the block or instruction, so we
526 * have an assert in instruction removal to try to catch it.
527 */
528struct vir_cursor {
529        enum vir_cursor_mode mode;
530        struct list_head *link;
531};
532
533static inline struct vir_cursor
534vir_before_inst(struct qinst *inst)
535{
536        return (struct vir_cursor){ vir_cursor_addtail, &inst->link };
537}
538
539static inline struct vir_cursor
540vir_after_inst(struct qinst *inst)
541{
542        return (struct vir_cursor){ vir_cursor_add, &inst->link };
543}
544
545static inline struct vir_cursor
546vir_before_block(struct qblock *block)
547{
548        return (struct vir_cursor){ vir_cursor_add, &block->instructions };
549}
550
551static inline struct vir_cursor
552vir_after_block(struct qblock *block)
553{
554        return (struct vir_cursor){ vir_cursor_addtail, &block->instructions };
555}
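
/*
 * A minimal usage sketch, assuming the usual emit flow where new
 * instructions are inserted at c->cursor (inst/dst/src below are
 * hypothetical): redirect the cursor, emit, then restore it.
 *
 *    struct vir_cursor saved = c->cursor;
 *    c->cursor = vir_before_inst(inst);
 *    vir_MOV_dest(c, dst, src);  // lands immediately before inst
 *    c->cursor = saved;
 */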
556
557enum v3d_compilation_result {
558        V3D_COMPILATION_SUCCEEDED,
559        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION,
560        V3D_COMPILATION_FAILED,
561};
562
563/**
564 * Compiler state saved across compiler invocations, for any expensive global
565 * setup.
566 */
567struct v3d_compiler {
568        const struct v3d_device_info *devinfo;
569        struct ra_regs *regs;
570        struct ra_class *reg_class_any[3];
571        struct ra_class *reg_class_r5[3];
572        struct ra_class *reg_class_phys[3];
573        struct ra_class *reg_class_phys_or_acc[3];
574};
575
576/**
577 * This holds partially interpolated inputs as provided by hardware
578 * (The Vp = A*(x - x0) + B*(y - y0) term), as well as the C coefficient
579 * required to compute the final interpolated value.
580 */
581struct v3d_interp_input {
582   struct qreg vp;
583   struct qreg C;
584   unsigned mode; /* interpolation mode */
585};
586
587struct v3d_compile {
588        const struct v3d_device_info *devinfo;
589        nir_shader *s;
590        nir_function_impl *impl;
591        struct exec_list *cf_node_list;
592        const struct v3d_compiler *compiler;
593
594        void (*debug_output)(const char *msg,
595                             void *debug_output_data);
596        void *debug_output_data;
597
598        /**
599         * Mapping from nir_register * or nir_ssa_def * to array of struct
600         * qreg for the values.
601         */
602        struct hash_table *def_ht;
603
604        /* For each temp, the instruction generating its value. */
605        struct qinst **defs;
606        uint32_t defs_array_size;
607
608        /* TMU pipelining tracking */
609        struct {
610                /* NIR registers that have been updated with a TMU operation
611                 * that has not been flushed yet.
612                 */
613                struct set *outstanding_regs;
614
615                uint32_t output_fifo_size;
616
617                struct {
618                        nir_dest *dest;
619                        uint8_t num_components;
620                        uint8_t component_mask;
621                } flush[MAX_TMU_QUEUE_SIZE];
622                uint32_t flush_count;
623        } tmu;
624
625        /**
626         * Inputs to the shader, arranged by TGSI declaration order.
627         *
628         * Not all fragment shader QFILE_VARY reads are present in this array.
629         */
630        struct qreg *inputs;
631        /**
632         * Partially interpolated inputs to the shader.
633         */
634        struct v3d_interp_input *interp;
635        struct qreg *outputs;
636        bool msaa_per_sample_output;
637        struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
638        struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
639        uint32_t inputs_array_size;
640        uint32_t outputs_array_size;
641        uint32_t uniforms_array_size;
642
643        /* Booleans for whether the corresponding QFILE_VARY[i] is
644         * flat-shaded.  This includes gl_FragColor flat-shading, which is
645         * customized based on the shademodel_flat shader key.
646         */
647        uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
648
649        uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
650
651        uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
652
653        bool uses_center_w;
654        bool writes_z;
655        bool uses_implicit_point_line_varyings;
656
657        /* True if a fragment shader reads gl_PrimitiveID */
658        bool fs_uses_primitive_id;
659
660        /* If the fragment shader does anything that requires forcing
661         * per-sample MSAA, such as reading gl_SampleID.
662         */
663        bool force_per_sample_msaa;
664
665        /* Whether we are using the fallback scheduler. This will be set after
666         * register allocation has failed once.
667         */
668        bool fallback_scheduler;
669
670        /* Disable TMU pipelining. This may increase the chances of being able
671         * to compile shaders with high register pressure that require emitting
672         * TMU spills.
673         */
674        bool disable_tmu_pipelining;
675        bool pipelined_any_tmu;
676
677        /* Disable sorting of UBO loads with constant offset. This may
678         * increase the chances of being able to compile shaders with high
679         * register pressure.
680         */
681        bool disable_constant_ubo_load_sorting;
682        bool sorted_any_ubo_loads;
683
684        /* Emits ldunif for each new uniform, even if the uniform was already
685         * emitted in the same block. Useful to compile shaders with high
686         * register pressure or to disable the optimization during uniform
687         * spills.
688         */
689        bool disable_ldunif_opt;
690
691        /* Disables loop unrolling to reduce register pressure. */
692        bool disable_loop_unrolling;
693        bool unrolled_any_loops;
694
695        /* Minimum number of threads we are willing to use to register allocate
696         * a shader with the current compilation strategy. This only prevents
697         * us from lowering the thread count to register allocate successfully,
698         * which can be useful when we prefer doing other changes to the
699         * compilation strategy before dropping thread count.
700         */
701        uint32_t min_threads_for_reg_alloc;
702
703        /* Whether TMU spills are allowed. If this is disabled it may cause
704         * register allocation to fail. We set this to favor other compilation
705         * strategies that can reduce register pressure and hopefully reduce or
706         * eliminate TMU spills in the shader.
707         */
708        bool tmu_spilling_allowed;
709
710        /* The UBO index and block used with the last unifa load, as well as the
711         * current unifa offset *after* emitting that load. This is used to skip
712         * unifa writes (and their 3 delay slots) when the next UBO load reads
713         * right after the previous one in the same block.
714         */
715        struct qblock *current_unifa_block;
716        int32_t current_unifa_index;
717        uint32_t current_unifa_offset;
718
719        /* State for whether we're executing on each channel currently.  0 if
720         * yes, otherwise a block number + 1 that the channel jumped to.
721         */
722        struct qreg execute;
723        bool in_control_flow;
724
725        struct qreg line_x, point_x, point_y, primitive_id;
726
727        /**
728         * Instance ID, which comes in before the vertex attribute payload if
729         * the shader record requests it.
730         */
731        struct qreg iid;
732
733        /**
734         * Base Instance ID, which comes in before the vertex attribute payload
735         * (after Instance ID) if the shader record requests it.
736         */
737        struct qreg biid;
738
739        /**
740         * Vertex ID, which comes in before the vertex attribute payload
741         * (after Base Instance) if the shader record requests it.
742         */
743        struct qreg vid;
744
745        /* Fragment shader payload regs. */
746        struct qreg payload_w, payload_w_centroid, payload_z;
747
748        struct qreg cs_payload[2];
749        struct qreg cs_shared_offset;
750        int local_invocation_index_bits;
751
752        /* If the shader uses subgroup functionality */
753        bool has_subgroups;
754
755        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
756        uint32_t vpm_output_size;
757
758        /* Size in bytes of registers that have been spilled. This is how much
759         * space needs to be available in the spill BO per thread per QPU.
760         */
761        uint32_t spill_size;
762        /* Shader-db stats */
763        uint32_t spills, fills, loops;
764        /**
765         * Register spilling's per-thread base address, shared between each
766         * spill/fill's addressing calculations.
767         */
768        struct qreg spill_base;
769        /* Bit vector of which temps may be spilled */
770        BITSET_WORD *spillable;
771
772        /**
773         * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
774         *
775         * This includes those that aren't part of the VPM varyings, like
776         * point/line coordinates.
777         */
778        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
779
780        /**
781         * An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
782         * of the output is.  Used to emit from the VS in the order that the
783         * FS needs.
784         */
785        struct v3d_varying_slot *output_slots;
786
787        struct pipe_shader_state *shader_state;
788        struct v3d_key *key;
789        struct v3d_fs_key *fs_key;
790        struct v3d_gs_key *gs_key;
791        struct v3d_vs_key *vs_key;
792
793        /* Live ranges of temps. */
794        int *temp_start, *temp_end;
795        bool live_intervals_valid;
796
797        uint32_t *uniform_data;
798        enum quniform_contents *uniform_contents;
799        uint32_t uniform_array_size;
800        uint32_t num_uniforms;
801        uint32_t output_position_index;
802        nir_variable *output_color_var[4];
803        uint32_t output_sample_mask_index;
804
805        struct qreg undef;
806        uint32_t num_temps;
807
808        struct vir_cursor cursor;
809        struct list_head blocks;
810        int next_block_index;
811        struct qblock *cur_block;
812        struct qblock *loop_cont_block;
813        struct qblock *loop_break_block;
814        /**
815         * Which temp, if any, do we currently have in the flags?
816         * This is set when processing a comparison instruction, and
817         * reset to -1 by anything else that touches the flags.
818         */
819        int32_t flags_temp;
820        enum v3d_qpu_cond flags_cond;
821
822        uint64_t *qpu_insts;
823        uint32_t qpu_inst_count;
824        uint32_t qpu_inst_size;
825        uint32_t qpu_inst_stalled_count;
826        uint32_t nop_count;
827
828        /* For the FS, the number of varying inputs not counting the
829         * point/line varyings payload.
830         */
831        uint32_t num_inputs;
832
833        uint32_t program_id;
834        uint32_t variant_id;
835
836        /* Set to compile the program in 1x, 2x, or 4x threaded mode, where
837         * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
838         * limiting ourselves to only part of the physical reg space.
839         *
840         * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x.  On
841         * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
842         * physical reg space in half.
843         */
844        uint8_t threads;
845        struct qinst *last_thrsw;
846        bool last_thrsw_at_top_level;
847
848        bool emitted_tlb_load;
849        bool lock_scoreboard_on_first_thrsw;
850
851        /* Total number of spilled registers in the program */
852        uint32_t spill_count;
853
854        enum v3d_compilation_result compilation_result;
855
856        bool tmu_dirty_rcl;
857};
858
859struct v3d_uniform_list {
860        enum quniform_contents *contents;
861        uint32_t *data;
862        uint32_t count;
863};
864
865struct v3d_prog_data {
866        struct v3d_uniform_list uniforms;
867
868        uint32_t spill_size;
869
870        uint8_t threads;
871
872        /* For threads > 1, whether the program should be dispatched in the
873         * after-final-THRSW state.
874         */
875        bool single_seg;
876
877        bool tmu_dirty_rcl;
878
879        bool has_control_barrier;
880};
881
882struct v3d_vs_prog_data {
883        struct v3d_prog_data base;
884
885        bool uses_iid, uses_biid, uses_vid;
886
887        /* Number of components read from each vertex attribute. */
888        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
889
890        /* Total number of components read, for the shader state record. */
891        uint32_t vpm_input_size;
892
893        /* Total number of components written, for the shader state record. */
894        uint32_t vpm_output_size;
895
896        /* Set if there should be separate VPM segments for input and output.
897         * If unset, vpm_input_size will be 0.
898         */
899        bool separate_segments;
900
901        /* Value to be programmed in VCM_CACHE_SIZE. */
902        uint8_t vcm_cache_size;
903
904        /* Maps a nir->data.location to its
905         * nir->data.driver_location. In general we use the driver
906         * location as the index (like vattr_sizes above), so this map
907         * is useful when all we have is the location.
908         *
909         * Contains -1 if the location is not used.
910         */
911        int32_t driver_location_map[V3D_MAX_VS_INPUTS];
912};
913
914struct v3d_gs_prog_data {
915        struct v3d_prog_data base;
916
917        /* Whether the program reads gl_PrimitiveIDIn */
918        bool uses_pid;
919
920        /* Number of components read from each input varying. */
921        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];
922
923        /* Number of inputs */
924        uint8_t num_inputs;
925        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];
926
927        /* Total number of components written, for the shader state record. */
928        uint32_t vpm_output_size;
929
930        /* Maximum SIMD dispatch width to not exceed VPM output size limits
931         * in the geometry shader. Notice that the final dispatch width has to
932         * be decided at draw time and could be lower based on the VPM pressure
933         * added by other shader stages.
934         */
935        uint8_t simd_width;
936
937        /* Output primitive type */
938        uint8_t out_prim_type;
939
940        /* Number of GS invocations */
941        uint8_t num_invocations;
942
943        bool writes_psiz;
944};
945
946struct v3d_fs_prog_data {
947        struct v3d_prog_data base;
948
949        /* Whether the program reads gl_PrimitiveID */
950        bool uses_pid;
951
952        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
953
954        /* Array of flat shade flags.
955         *
956         * Each entry is only 24 bits (high 8 bits 0), to match the hardware
957         * packet layout.
958         */
959        uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
960
961        uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
962
963        uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
964
965        uint8_t num_inputs;
966        bool writes_z;
967        bool disable_ez;
968        bool uses_center_w;
969        bool uses_implicit_point_line_varyings;
970        bool lock_scoreboard_on_first_thrsw;
971        bool force_per_sample_msaa;
972};
973
974struct v3d_compute_prog_data {
975        struct v3d_prog_data base;
976        /* Size in bytes of the workgroup's shared space. */
977        uint32_t shared_size;
978        uint16_t local_size[3];
979        /* If the shader uses subgroup functionality */
980        bool has_subgroups;
981};
982
983struct vpm_config {
984   uint32_t As;
985   uint32_t Vc;
986   uint32_t Gs;
987   uint32_t Gd;
988   uint32_t Gv;
989   uint32_t Ve;
990   uint32_t gs_width;
991};
992
993bool
994v3d_compute_vpm_config(struct v3d_device_info *devinfo,
995                       struct v3d_vs_prog_data *vs_bin,
996                       struct v3d_vs_prog_data *vs,
997                       struct v3d_gs_prog_data *gs_bin,
998                       struct v3d_gs_prog_data *gs,
999                       struct vpm_config *vpm_cfg_bin,
1000                       struct vpm_config *vpm_cfg);
1001
1002static inline bool
1003vir_has_uniform(struct qinst *inst)
1004{
1005        return inst->uniform != ~0;
1006}
1007
1008const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
1009void v3d_compiler_free(const struct v3d_compiler *compiler);
1010void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);
1011
1012uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1013                      struct v3d_key *key,
1014                      struct v3d_prog_data **prog_data,
1015                      nir_shader *s,
1016                      void (*debug_output)(const char *msg,
1017                                           void *debug_output_data),
1018                      void *debug_output_data,
1019                      int program_id, int variant_id,
1020                      uint32_t *final_assembly_size);
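
/*
 * A minimal driver-side sketch (error handling omitted; devinfo, key,
 * shader and IDs are assumed to be set up elsewhere): compile a NIR
 * shader and receive the QPU instructions plus their prog_data.
 *
 *    const struct v3d_compiler *compiler = v3d_compiler_init(devinfo);
 *    struct v3d_prog_data *prog_data = NULL;
 *    uint32_t size;
 *    uint64_t *qpu_insts = v3d_compile(compiler, &key->base, &prog_data, s,
 *                                      debug_output_cb, NULL,
 *                                      program_id, variant_id, &size);
 */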
1021
1022uint32_t v3d_prog_data_size(gl_shader_stage stage);
1023void v3d_nir_to_vir(struct v3d_compile *c);
1024
1025void vir_compile_destroy(struct v3d_compile *c);
1026const char *vir_get_stage_name(struct v3d_compile *c);
1027struct qblock *vir_new_block(struct v3d_compile *c);
1028void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
1029void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
1030struct qblock *vir_entry_block(struct v3d_compile *c);
1031struct qblock *vir_exit_block(struct v3d_compile *c);
1032struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
1033                           struct qreg src0, struct qreg src1);
1034struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
1035                           struct qreg src0, struct qreg src1);
1036struct qinst *vir_branch_inst(struct v3d_compile *c,
1037                              enum v3d_qpu_branch_cond cond);
1038void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
1039uint32_t vir_get_uniform_index(struct v3d_compile *c,
1040                               enum quniform_contents contents,
1041                               uint32_t data);
1042struct qreg vir_uniform(struct v3d_compile *c,
1043                        enum quniform_contents contents,
1044                        uint32_t data);
1045void vir_schedule_instructions(struct v3d_compile *c);
1046void v3d_setup_spill_base(struct v3d_compile *c);
1047struct v3d_qpu_instr v3d_qpu_nop(void);
1048
1049struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
1050struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
1051void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
1052enum v3d_qpu_cond vir_get_cond(struct qinst *inst);
1053void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf);
1054void vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf);
1055void vir_set_unpack(struct qinst *inst, int src,
1056                    enum v3d_qpu_input_unpack unpack);
1057void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack);
1058
1059struct qreg vir_get_temp(struct v3d_compile *c);
1060void vir_calculate_live_intervals(struct v3d_compile *c);
1061int vir_get_nsrc(struct qinst *inst);
1062bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
1063bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
1064bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
1065bool vir_is_raw_mov(struct qinst *inst);
1066bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
1067bool vir_is_add(struct qinst *inst);
1068bool vir_is_mul(struct qinst *inst);
1069bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
1070bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
1071struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
1072uint8_t vir_channels_written(struct qinst *inst);
1073struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
1074void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
1075                    struct qreg result);
1076bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
1077void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
1078                               uint32_t component_mask);
1079void ntq_flush_tmu(struct v3d_compile *c);
1080void vir_emit_thrsw(struct v3d_compile *c);
1081
1082void vir_dump(struct v3d_compile *c);
1083void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
1084void vir_dump_uniform(enum quniform_contents contents, uint32_t data);
1085
1086void vir_validate(struct v3d_compile *c);
1087
1088void vir_optimize(struct v3d_compile *c);
1089bool vir_opt_algebraic(struct v3d_compile *c);
1090bool vir_opt_constant_folding(struct v3d_compile *c);
1091bool vir_opt_copy_propagate(struct v3d_compile *c);
1092bool vir_opt_dead_code(struct v3d_compile *c);
1093bool vir_opt_peephole_sf(struct v3d_compile *c);
1094bool vir_opt_redundant_flags(struct v3d_compile *c);
1095bool vir_opt_small_immediates(struct v3d_compile *c);
1096bool vir_opt_vpm(struct v3d_compile *c);
1097bool vir_opt_constant_alu(struct v3d_compile *c);
1098void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
1099void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
1100void v3d_nir_lower_line_smooth(nir_shader *shader);
1101void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
1102void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
1103void v3d_nir_lower_scratch(nir_shader *s);
1104void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
1105void v3d_nir_lower_image_load_store(nir_shader *s);
1106void vir_lower_uniforms(struct v3d_compile *c);
1107
1108void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
1109void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
1110void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
1111void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
1112void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
1113                                     nir_intrinsic_instr *instr);
1114
1115void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
1116uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
1117void qpu_validate(struct v3d_compile *c);
1118struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
1119bool vir_init_reg_sets(struct v3d_compiler *compiler);
1120
1121int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);
1122
1123bool v3d_gl_format_is_return_32(GLenum format);
1124
1125uint32_t
1126v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
1127
1128static inline bool
1129quniform_contents_is_texture_p0(enum quniform_contents contents)
1130{
1131        return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 &&
1132                contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 +
1133                            V3D_MAX_TEXTURE_SAMPLERS));
1134}
1135
1136static inline bool
1137vir_in_nonuniform_control_flow(struct v3d_compile *c)
1138{
1139        return c->execute.file != QFILE_NULL;
1140}
1141
1142static inline struct qreg
1143vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
1144{
1145        return vir_uniform(c, QUNIFORM_CONSTANT, ui);
1146}
1147
1148static inline struct qreg
1149vir_uniform_f(struct v3d_compile *c, float f)
1150{
1151        return vir_uniform(c, QUNIFORM_CONSTANT, fui(f));
1152}
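
/*
 * A minimal usage sketch (src is hypothetical): constants come back as
 * regular qregs, so they can be fed straight into the ALU helpers defined
 * below.
 *
 *    struct qreg half = vir_uniform_f(c, 0.5f);
 *    struct qreg scaled = vir_FMUL(c, src, half);
 */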
1153
1154#define VIR_ALU0(name, vir_inst, op)                                     \
1155static inline struct qreg                                                \
1156vir_##name(struct v3d_compile *c)                                        \
1157{                                                                        \
1158        return vir_emit_def(c, vir_inst(op, c->undef,                    \
1159                                        c->undef, c->undef));            \
1160}                                                                        \
1161static inline struct qinst *                                             \
1162vir_##name##_dest(struct v3d_compile *c, struct qreg dest)               \
1163{                                                                        \
1164        return vir_emit_nondef(c, vir_inst(op, dest,                     \
1165                                           c->undef, c->undef));         \
1166}
1167
1168#define VIR_ALU1(name, vir_inst, op)                                     \
1169static inline struct qreg                                                \
1170vir_##name(struct v3d_compile *c, struct qreg a)                         \
1171{                                                                        \
1172        return vir_emit_def(c, vir_inst(op, c->undef,                    \
1173                                        a, c->undef));                   \
1174}                                                                        \
1175static inline struct qinst *                                             \
1176vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
1177                  struct qreg a)                                         \
1178{                                                                        \
1179        return vir_emit_nondef(c, vir_inst(op, dest, a,          \
1180                                           c->undef));                   \
1181}
1182
1183#define VIR_ALU2(name, vir_inst, op)                                       \
1184static inline struct qreg                                                \
1185vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)          \
1186{                                                                        \
1187        return vir_emit_def(c, vir_inst(op, c->undef, a, b));    \
1188}                                                                        \
1189static inline struct qinst *                                             \
1190vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
1191                  struct qreg a, struct qreg b)                          \
1192{                                                                        \
1193        return vir_emit_nondef(c, vir_inst(op, dest, a, b));     \
1194}
1195
1196#define VIR_NODST_0(name, vir_inst, op)                                 \
1197static inline struct qinst *                                            \
1198vir_##name(struct v3d_compile *c)                                       \
1199{                                                                       \
1200        return vir_emit_nondef(c, vir_inst(op, c->undef,                \
1201                                           c->undef, c->undef));        \
1202}
1203
1204#define VIR_NODST_1(name, vir_inst, op)                                               \
1205static inline struct qinst *                                            \
1206vir_##name(struct v3d_compile *c, struct qreg a)                        \
1207{                                                                       \
1208        return vir_emit_nondef(c, vir_inst(op, c->undef,        \
1209                                           a, c->undef));               \
1210}
1211
1212#define VIR_NODST_2(name, vir_inst, op)                                               \
1213static inline struct qinst *                                            \
1214vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)         \
1215{                                                                       \
1216        return vir_emit_nondef(c, vir_inst(op, c->undef,                \
1217                                           a, b));                      \
1218}
1219
1220#define VIR_SFU(name)                                                      \
1221static inline struct qreg                                                \
1222vir_##name(struct v3d_compile *c, struct qreg a)                         \
1223{                                                                        \
1224        if (c->devinfo->ver >= 41) {                                     \
1225                return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,    \
1226                                                    c->undef,            \
1227                                                    a, c->undef));       \
1228        } else {                                                         \
1229                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
1230                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
1231        }                                                                \
1232}                                                                        \
1233static inline struct qinst *                                             \
1234vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
1235                  struct qreg a)                                         \
1236{                                                                        \
1237        if (c->devinfo->ver >= 41) {                                     \
1238                return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
1239                                                       dest,             \
1240                                                       a, c->undef));    \
1241        } else {                                                         \
1242                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
1243                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
1244        }                                                                \
1245}
1246
1247#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
1248#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
1249#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
1250#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name)
1251#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name)
1252#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name)
1253#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name)
1254#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
1255#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
1256#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
1257#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name)
1258
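/*
 * For reference, a sketch of what these wrappers expand to: e.g.
 * VIR_A_ALU2(FADD) defines
 *
 *    struct qreg vir_FADD(struct v3d_compile *c,
 *                         struct qreg a, struct qreg b);
 *    struct qinst *vir_FADD_dest(struct v3d_compile *c, struct qreg dest,
 *                                struct qreg a, struct qreg b);
 *
 * where the former allocates a new temp for the result and the latter
 * writes to the given destination and returns the emitted instruction.
 */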
1259VIR_A_ALU2(FADD)
1260VIR_A_ALU2(VFPACK)
1261VIR_A_ALU2(FSUB)
1262VIR_A_ALU2(FMIN)
1263VIR_A_ALU2(FMAX)
1264
1265VIR_A_ALU2(ADD)
1266VIR_A_ALU2(SUB)
1267VIR_A_ALU2(SHL)
1268VIR_A_ALU2(SHR)
1269VIR_A_ALU2(ASR)
1270VIR_A_ALU2(ROR)
1271VIR_A_ALU2(MIN)
1272VIR_A_ALU2(MAX)
1273VIR_A_ALU2(UMIN)
1274VIR_A_ALU2(UMAX)
1275VIR_A_ALU2(AND)
1276VIR_A_ALU2(OR)
1277VIR_A_ALU2(XOR)
1278VIR_A_ALU2(VADD)
1279VIR_A_ALU2(VSUB)
1280VIR_A_NODST_2(STVPMV)
1281VIR_A_NODST_2(STVPMD)
1282VIR_A_ALU1(NOT)
1283VIR_A_ALU1(NEG)
1284VIR_A_ALU1(FLAPUSH)
1285VIR_A_ALU1(FLBPUSH)
1286VIR_A_ALU1(FLPOP)
1287VIR_A_ALU0(FLAFIRST)
1288VIR_A_ALU0(FLNAFIRST)
1289VIR_A_ALU1(SETMSF)
1290VIR_A_ALU1(SETREVF)
1291VIR_A_ALU0(TIDX)
1292VIR_A_ALU0(EIDX)
1293VIR_A_ALU1(LDVPMV_IN)
1294VIR_A_ALU1(LDVPMV_OUT)
1295VIR_A_ALU1(LDVPMD_IN)
1296VIR_A_ALU1(LDVPMD_OUT)
1297VIR_A_ALU2(LDVPMG_IN)
1298VIR_A_ALU2(LDVPMG_OUT)
1299VIR_A_ALU0(TMUWT)
1300
1301VIR_A_ALU0(IID)
1302VIR_A_ALU0(FXCD)
1303VIR_A_ALU0(XCD)
1304VIR_A_ALU0(FYCD)
1305VIR_A_ALU0(YCD)
1306VIR_A_ALU0(MSF)
1307VIR_A_ALU0(REVF)
1308VIR_A_ALU0(BARRIERID)
1309VIR_A_ALU0(SAMPID)
1310VIR_A_NODST_1(VPMSETUP)
1311VIR_A_NODST_0(VPMWT)
1312VIR_A_ALU2(FCMP)
1313VIR_A_ALU2(VFMAX)
1314
1315VIR_A_ALU1(FROUND)
1316VIR_A_ALU1(FTOIN)
1317VIR_A_ALU1(FTRUNC)
1318VIR_A_ALU1(FTOIZ)
1319VIR_A_ALU1(FFLOOR)
1320VIR_A_ALU1(FTOUZ)
1321VIR_A_ALU1(FCEIL)
1322VIR_A_ALU1(FTOC)
1323
1324VIR_A_ALU1(FDX)
1325VIR_A_ALU1(FDY)
1326
1327VIR_A_ALU1(ITOF)
1328VIR_A_ALU1(CLZ)
1329VIR_A_ALU1(UTOF)
1330
1331VIR_M_ALU2(UMUL24)
1332VIR_M_ALU2(FMUL)
1333VIR_M_ALU2(SMUL24)
1334VIR_M_NODST_2(MULTOP)
1335
1336VIR_M_ALU1(MOV)
1337VIR_M_ALU1(FMOV)
1338
1339VIR_SFU(RECIP)
1340VIR_SFU(RSQRT)
1341VIR_SFU(EXP)
1342VIR_SFU(LOG)
1343VIR_SFU(SIN)
1344VIR_SFU(RSQRT2)
1345
1346static inline struct qinst *
1347vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
1348             struct qreg dest, struct qreg src)
1349{
1350        struct qinst *mov = vir_MOV_dest(c, dest, src);
1351        vir_set_cond(mov, cond);
1352        return mov;
1353}
1354
1355static inline struct qreg
1356vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond,
1357        struct qreg src0, struct qreg src1)
1358{
1359        struct qreg t = vir_get_temp(c);
1360        vir_MOV_dest(c, t, src1);
1361        vir_MOV_cond(c, cond, t, src0);
1362        return t;
1363}
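
/*
 * A minimal sketch of the usual flags + select pattern (the pf/cond names
 * come from qpu_instr.h and a, b, x, y are hypothetical): push a flag from
 * a comparison, then pick between two sources.
 *
 *    vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), a, b), V3D_QPU_PF_PUSHZ);
 *    struct qreg r = vir_SEL(c, V3D_QPU_COND_IFA, x, y);
 */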
1364
1365static inline struct qinst *
1366vir_NOP(struct v3d_compile *c)
1367{
1368        return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP,
1369                                               c->undef, c->undef, c->undef));
1370}
1371
1372static inline struct qreg
1373vir_LDTMU(struct v3d_compile *c)
1374{
1375        if (c->devinfo->ver >= 41) {
1376                struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1377                                                   c->undef, c->undef);
1378                ldtmu->qpu.sig.ldtmu = true;
1379
1380                return vir_emit_def(c, ldtmu);
1381        } else {
1382                vir_NOP(c)->qpu.sig.ldtmu = true;
1383                return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
1384        }
1385}
1386
1387static inline struct qreg
1388vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
1389{
1390        vir_MULTOP(c, src0, src1);
1391        return vir_UMUL24(c, src0, src1);
1392}
1393
1394static inline struct qreg
1395vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
1396{
1397        assert(c->devinfo->ver >= 41); /* XXX */
1398        assert((config & 0xffffff00) == 0xffffff00);
1399
1400        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1401                                           c->undef, c->undef);
1402        ldtlb->qpu.sig.ldtlbu = true;
1403        ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
1404        return vir_emit_def(c, ldtlb);
1405}
1406
1407static inline struct qreg
1408vir_TLB_COLOR_READ(struct v3d_compile *c)
1409{
1410        assert(c->devinfo->ver >= 41); /* XXX */
1411
1412        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1413                                           c->undef, c->undef);
1414        ldtlb->qpu.sig.ldtlb = true;
1415        return vir_emit_def(c, ldtlb);
1416}
1417
1418static inline struct qinst *
1419vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
1420{
1421        /* The actual uniform_data value will be set at scheduling time */
1422        return vir_emit_nondef(c, vir_branch_inst(c, cond));
1423}
1424
1425#define vir_for_each_block(block, c)                                    \
1426        list_for_each_entry(struct qblock, block, &c->blocks, link)
1427
1428#define vir_for_each_block_rev(block, c)                                \
1429        list_for_each_entry_rev(struct qblock, block, &c->blocks, link)
1430
1431/* Loop over the non-NULL members of the successors array. */
1432#define vir_for_each_successor(succ, block)                             \
1433        for (struct qblock *succ = block->successors[0];                \
1434             succ != NULL;                                              \
1435             succ = (succ == block->successors[1] ? NULL :              \
1436                     block->successors[1]))
1437
1438#define vir_for_each_inst(inst, block)                                  \
1439        list_for_each_entry(struct qinst, inst, &block->instructions, link)
1440
1441#define vir_for_each_inst_rev(inst, block)                                  \
1442        list_for_each_entry_rev(struct qinst, inst, &block->instructions, link)
1443
1444#define vir_for_each_inst_safe(inst, block)                             \
1445        list_for_each_entry_safe(struct qinst, inst, &block->instructions, link)
1446
1447#define vir_for_each_inst_inorder(inst, c)                              \
1448        vir_for_each_block(_block, c)                                   \
1449                vir_for_each_inst(inst, _block)
1450
1451#define vir_for_each_inst_inorder_safe(inst, c)                         \
1452        vir_for_each_block(_block, c)                                   \
1453                vir_for_each_inst_safe(inst, _block)
1454
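/*
 * A minimal usage sketch: these macros expand to util/list.h loops, so they
 * read like regular for statements, e.g. walking every instruction in
 * emission order:
 *
 *    vir_for_each_inst_inorder(inst, c) {
 *            if (inst->qpu.sig.ldtmu)
 *                    handle_tmu_read(inst);   // hypothetical handler
 *    }
 */
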
1455#endif /* V3D_COMPILER_H */
1456