ir3_lower_subgroups.c revision 7ec681f3
1/*
2 * Copyright (C) 2021 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include "ir3.h"
25
/* Lower several macro-instructions needed for shader subgroup support that
 * must be turned into if statements. We do this after RA and post-RA
 * scheduling to give the scheduler a chance to rearrange them, because RA
 * may need to insert OPC_META_READ_FIRST to handle splitting live ranges, and
 * also because some (e.g. BALLOT and READ_FIRST) must produce a shared
 * register that cannot be spilled to a normal register until after the if,
 * which makes implementing spilling more complicated if they are already
 * lowered.
 */
35
36static void
37replace_pred(struct ir3_block *block, struct ir3_block *old_pred,
38             struct ir3_block *new_pred)
39{
40   for (unsigned i = 0; i < block->predecessors_count; i++) {
41      if (block->predecessors[i] == old_pred) {
42         block->predecessors[i] = new_pred;
43         return;
44      }
45   }
46}
47
48static void
49replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,
50                      struct ir3_block *new_pred)
51{
52   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
53      if (block->physical_predecessors[i] == old_pred) {
54         block->physical_predecessors[i] = new_pred;
55         return;
56      }
57   }
58}
59
60static void
61mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
62{
63   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
64   struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
65   mov_dst->wrmask = dst->wrmask;
66   struct ir3_register *src = ir3_src_create(
67      mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
68   src->uim_val = immed;
69   mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
70   mov->cat1.src_type = mov->cat1.dst_type;
71   mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
72}
73
74static struct ir3_block *
75split_block(struct ir3 *ir, struct ir3_block *before_block,
76            struct ir3_instruction *instr, struct ir3_block **then)
77{
78   struct ir3_block *then_block = ir3_block_create(ir);
79   struct ir3_block *after_block = ir3_block_create(ir);
80   list_add(&then_block->node, &before_block->node);
81   list_add(&after_block->node, &then_block->node);
82
83   for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
84      after_block->successors[i] = before_block->successors[i];
85      if (after_block->successors[i])
86         replace_pred(after_block->successors[i], before_block, after_block);
87   }
88
89   for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);
90        i++) {
91      after_block->physical_successors[i] =
92         before_block->physical_successors[i];
93      if (after_block->physical_successors[i]) {
94         replace_physical_pred(after_block->physical_successors[i],
95                               before_block, after_block);
96      }
97   }
98
99   before_block->successors[0] = then_block;
100   before_block->successors[1] = after_block;
101   before_block->physical_successors[0] = then_block;
102   before_block->physical_successors[1] = after_block;
103   ir3_block_add_predecessor(then_block, before_block);
104   ir3_block_add_predecessor(after_block, before_block);
105   ir3_block_add_physical_predecessor(then_block, before_block);
106   ir3_block_add_physical_predecessor(after_block, before_block);
107
108   then_block->successors[0] = after_block;
109   then_block->physical_successors[0] = after_block;
110   ir3_block_add_predecessor(after_block, then_block);
111   ir3_block_add_physical_predecessor(after_block, then_block);
112
113   foreach_instr_from_safe (rem_instr, &instr->node,
114                            &before_block->instr_list) {
115      list_del(&rem_instr->node);
116      list_addtail(&rem_instr->node, &after_block->instr_list);
117      rem_instr->block = after_block;
118   }
119
120   after_block->brtype = before_block->brtype;
121   after_block->condition = before_block->condition;
122
123   *then = then_block;
124   return after_block;
125}
126
127static bool
128lower_block(struct ir3 *ir, struct ir3_block **block)
129{
130   bool progress = false;
131
132   foreach_instr_safe (instr, &(*block)->instr_list) {
133      switch (instr->opc) {
134      case OPC_BALLOT_MACRO:
135      case OPC_ANY_MACRO:
136      case OPC_ALL_MACRO:
137      case OPC_ELECT_MACRO:
138      case OPC_READ_COND_MACRO:
139      case OPC_READ_FIRST_MACRO:
140      case OPC_SWZ_SHARED_MACRO:
141         break;
142      default:
143         continue;
144      }
145
146      struct ir3_block *before_block = *block;
147      struct ir3_block *then_block;
148      struct ir3_block *after_block =
149         split_block(ir, before_block, instr, &then_block);
150
151      /* For ballot, the destination must be initialized to 0 before we do
152       * the movmsk because the condition may be 0 and then the movmsk will
153       * be skipped. Because it's a shared register we have to wrap the
154       * initialization in a getone block.
155       */
156      if (instr->opc == OPC_BALLOT_MACRO) {
157         before_block->brtype = IR3_BRANCH_GETONE;
158         before_block->condition = NULL;
159         mov_immed(instr->dsts[0], then_block, 0);
160         before_block = after_block;
161         after_block = split_block(ir, before_block, instr, &then_block);
162      }
163
164      switch (instr->opc) {
165      case OPC_BALLOT_MACRO:
166      case OPC_READ_COND_MACRO:
167      case OPC_ANY_MACRO:
168      case OPC_ALL_MACRO:
169         before_block->condition = instr->srcs[0]->def->instr;
170         break;
171      default:
172         before_block->condition = NULL;
173         break;
174      }
175
176      switch (instr->opc) {
177      case OPC_BALLOT_MACRO:
178      case OPC_READ_COND_MACRO:
179         before_block->brtype = IR3_BRANCH_COND;
180         break;
181      case OPC_ANY_MACRO:
182         before_block->brtype = IR3_BRANCH_ANY;
183         break;
184      case OPC_ALL_MACRO:
185         before_block->brtype = IR3_BRANCH_ALL;
186         break;
187      case OPC_ELECT_MACRO:
188      case OPC_READ_FIRST_MACRO:
189      case OPC_SWZ_SHARED_MACRO:
190         before_block->brtype = IR3_BRANCH_GETONE;
191         break;
192      default:
193         unreachable("bad opcode");
194      }
195
196      switch (instr->opc) {
197      case OPC_ALL_MACRO:
198      case OPC_ANY_MACRO:
199      case OPC_ELECT_MACRO:
200         mov_immed(instr->dsts[0], then_block, 1);
201         mov_immed(instr->dsts[0], before_block, 0);
202         break;
203
204      case OPC_BALLOT_MACRO: {
205         unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
206         struct ir3_instruction *movmsk =
207            ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
208         ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
209         movmsk->repeat = comp_count - 1;
210         break;
211      }
212
213      case OPC_READ_COND_MACRO:
214      case OPC_READ_FIRST_MACRO: {
215         struct ir3_instruction *mov =
216            ir3_instr_create(then_block, OPC_MOV, 1, 1);
217         unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
218         ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
219         struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
220         *new_src = *instr->srcs[src];
221         mov->cat1.dst_type = TYPE_U32;
222         mov->cat1.src_type =
223            (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
224         break;
225      }
226
227      case OPC_SWZ_SHARED_MACRO: {
228         struct ir3_instruction *swz =
229            ir3_instr_create(then_block, OPC_SWZ, 2, 2);
230         ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
231         ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
232         ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
233         ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
234         swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
235         swz->repeat = 1;
236         break;
237      }
238
239      default:
240         unreachable("bad opcode");
241      }
242
243      *block = after_block;
244      list_delinit(&instr->node);
245      progress = true;
246   }
247
248   return progress;
249}
250
251bool
252ir3_lower_subgroups(struct ir3 *ir)
253{
254   bool progress = false;
255
256   foreach_block (block, &ir->block_list)
257      progress |= lower_block(ir, &block);
258
259   return progress;
260}
261