/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"

/**
 * \file nir_lower_subgroups.c
 *
 * Lowers subgroup operations to forms a given backend supports, as
 * described by nir_lower_subgroups_options.
 */

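/* Emits a copy of "intrin" that consumes only the low (component == 0) or
 * high (component == 1) 32-bit half of the 64-bit source, producing a
 * 32-bit scalar result.  Used to implement the lower_shuffle_to_32bit path
 * for 64-bit sources.
 */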
static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_ssa_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      nir_src_copy(&intr->src[1], &intrin->src[1]);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

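/* Lowers a scalar 64-bit subgroup op to two 32-bit ops on the unpacked
 * halves of the source, then packs the two 32-bit results back into a
 * single 64-bit value.
 */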
static nir_ssa_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa);
}

static nir_ssa_def *
ballot_type_to_uint(nir_builder *b, nir_ssa_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}

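/* Converts a ballot in the native (options-specified) representation into
 * the representation the intrinsic's destination expects.  For example,
 * with a native 4 x uint32 ballot implementing a 64-bit GLSL ballot, the
 * uvec4 is bitcast to 2 x uint64 and then truncated to one uint64
 * component.
 */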
static nir_ssa_def *
uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate.  This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot.  This comes up in Zink
    * for OpenGL on Vulkan.  It's the job of the driver calling this lowering
    * pass to ensure that it has restricted the subgroup size sufficiently
    * that we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_channels(b, value, (nir_component_mask_t)BITFIELD_MASK(num_components));

   return value;
}

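/* Splits a vector subgroup op into one single-component op per channel and
 * recombines the per-channel results with nir_vec().  Each channel may
 * additionally be split into 32-bit halves if requested.
 */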
static nir_ssa_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin,
                            bool lower_to_32bit)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->dest.ssa.num_components > 1);

   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0],
                                        intrin->num_components);
   nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         nir_src_copy(&chan_intrin->src[1], &intrin->src[1]);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) {
         reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin);
      } else {
         nir_builder_instr_insert(b, &chan_intrin->instr);
         reads[i] = &chan_intrin->dest.ssa;
      }
   }

   return nir_vec(b, reads, intrin->num_components);
}

static nir_ssa_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   nir_ssa_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      nir_builder_instr_insert(b, &chan_intrin->instr);

      if (result) {
         result = nir_iand(b, result, &chan_intrin->dest.ssa);
      } else {
         result = &chan_intrin->dest.ssa;
      }
   }

   return result;
}

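/* Lowers vote_ieq/vote_feq in terms of read_first_invocation and vote_all.
 * Roughly, for a vec2 source this builds:
 *
 *    vote_all(ieq(read_first_invocation(v.x), v.x) &
 *             ieq(read_first_invocation(v.y), v.y))
 *
 * All values are equal across the subgroup exactly when every invocation
 * matches the first active one.
 */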
static nir_ssa_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_ssa_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_ssa_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}

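/* Lowers a constant-mask shuffle_xor to AMD's masked swizzle, which selects
 * its source lane as ((invocation & and_mask) | or_mask) ^ xor_mask, with
 * the three 5-bit fields packed into the swizzle mask as and_mask in bits
 * 4:0, or_mask in bits 9:5, and xor_mask in bits 14:10.  Setting the mask
 * to (mask << 10) | 0x1f therefore yields invocation ^ mask, but since each
 * field is only 5 bits wide this only works for masks below 32.
 */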
static nir_ssa_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin,
                         const nir_lower_subgroups_options *options)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create(
      b->shader, nir_intrinsic_masked_swizzle_amd);
   swizzle->num_components = intrin->num_components;
   nir_src_copy(&swizzle->src[0], &intrin->src[0]);
   nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f);
   nir_ssa_dest_init(&swizzle->instr, &swizzle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   if (options->lower_to_scalar && swizzle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit);
   } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, swizzle);
   } else {
      nir_builder_instr_insert(b, &swizzle->instr);
      return &swizzle->dest.ssa;
   }
}

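/* Lowers the relative shuffle and quad operations to a generic
 * nir_intrinsic_shuffle whose invocation index is computed from
 * nir_load_subgroup_invocation().
 */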
static nir_ssa_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
              const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {
      nir_ssa_def *result =
         lower_shuffle_to_swizzle(b, intrin, options);
      if (result)
         return result;
   }

   nir_ssa_def *index = nir_load_subgroup_invocation(b);
   bool is_shuffle = false;
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      assert(intrin->src[1].is_ssa);
      index = nir_ixor(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_up:
      assert(intrin->src[1].is_ssa);
      index = nir_isub(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_down:
      assert(intrin->src[1].is_ssa);
      index = nir_iadd(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_quad_broadcast:
      assert(intrin->src[1].is_ssa);
      index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index into a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   default:
      unreachable("Invalid intrinsic");
   }

   nir_intrinsic_instr *shuffle =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle);
   shuffle->num_components = intrin->num_components;
   nir_src_copy(&shuffle->src[0], &intrin->src[0]);
   shuffle->src[1] = nir_src_for_ssa(index);
   nir_ssa_dest_init(&shuffle->instr, &shuffle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle;
   if (options->lower_to_scalar && shuffle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit);
   } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, shuffle);
   } else {
      nir_builder_instr_insert(b, &shuffle->instr);
      return &shuffle->dest.ssa;
   }
}

static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 */
static nir_ssa_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* This only works if all the high bits are the same as bit 1. */
   assert(((val << 62) >> 62) == val);

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which 1 would be shifted into
    * already has the right value and all we have to do is fix up the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assert above. For example, if
    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component
    * count.
    */
   nir_const_value min_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i].i32 = (i + 1) * options->ballot_bit_size;
   nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}

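/* The per-invocation masks below are built by shifting a small sign-extended
 * immediate by the invocation index:
 *
 *    eq_mask = 1  << id      (only our own bit set)
 *    ge_mask = ~0 << id      (our bit and every bit above it)
 *    gt_mask = ~1 << id      (every bit strictly above ours)
 *
 * Bits at or above the subgroup size are cleaned up by the callers using
 * build_subgroup_mask() where required.
 */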
static nir_ssa_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_ssa_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
                  nir_isub_imm(b, options->ballot_bit_size,
                               subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 for every component whose index is less than the subgroup
    * size divided by the ballot bitsize and 0 otherwise. For example, with a
    * target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases.  The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_ssa_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}

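/* Horizontal bit count: nir_bit_count gives a per-component popcount, so
 * summing the components counts every set bit in the ballot vector.
 */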
static nir_ssa_def *
vec_bit_count(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_bit_count(b, value);
   nir_ssa_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

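/* nir_find_lsb returns -1 for a zero channel.  Walking the channels from
 * high to low means the lowest channel containing a set bit supplies the
 * final bcsel result.
 */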
static nir_ssa_def *
vec_find_lsb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_find_lsb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

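/* Mirror image of vec_find_lsb: walking from low to high lets the highest
 * channel containing a set bit win.  nir_ufind_msb likewise returns -1 for
 * a zero channel.
 */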
static nir_ssa_def *
vec_find_msb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_ufind_msb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

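/* Lowers quad_broadcast with a non-constant index by emitting all four
 * constant-index quad broadcasts and selecting between their results with a
 * chain of bcsels keyed on the dynamic index.
 */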
static nir_ssa_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_shuffle(b, intrin, options);

   nir_ssa_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_intrinsic_instr *qbcst =
         nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast);

      qbcst->num_components = intrin->num_components;
      qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i));
      nir_src_copy(&qbcst->src[0], &intrin->src[0]);
      nir_ssa_dest_init(&qbcst->instr, &qbcst->dest,
                        intrin->dest.ssa.num_components,
                        intrin->dest.ssa.bit_size, NULL);

      nir_ssa_def *qbcst_dst = NULL;

      if (options->lower_to_scalar && qbcst->num_components > 1) {
         qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false);
      } else {
         nir_builder_instr_insert(b, &qbcst->instr);
         qbcst_dst = &qbcst->dest.ssa;
      }

      if (i)
         dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa,
                                    nir_imm_int(b, i)),
                         qbcst_dst, dst);
      else
         dst = qbcst_dst;
   }

   return dst;
}

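/* Lowers read_invocation to ir3's conditional form: the boolean source is
 * true exactly on the invocation whose value should be read.
 */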
static nir_ssa_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}

static nir_ssa_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return nir_ssa_for_src(b, intrin->src[0], 1);
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (options->lower_vote_eq)
         return lower_vote_eq(b, intrin);

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_ssa_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                           build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                           build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }

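   /* If the native ballot already matches the destination type exactly,
    * there is nothing to lower; otherwise, emit the ballot at the native
    * size and convert it to the type the destination expects.
    */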
   case nir_intrinsic_ballot: {
      if (intrin->dest.ssa.num_components == options->ballot_components &&
          intrin->dest.ssa.bit_size == options->ballot_bit_size)
         return NULL;

      nir_ssa_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations.  If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three.  This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits.  If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits.  In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         assert(intrin->src[1].is_ssa);
         nir_ssa_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated in
             * the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_i2b(b, nir_iand_imm(b, nir_ushr(b, int_val, idx), 1));
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
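      /* Inclusive counts set ballot bits for invocations with an index at or
       * below ours (~gt_mask, i.e. le_mask); exclusive counts only those
       * strictly below ours (~ge_mask, i.e. lt_mask).
       */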
      nir_ssa_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }

   case nir_intrinsic_shuffle:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_shuffle)
         return lower_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_reduce: {
      nir_ssa_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation-defined */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (options->lower_to_scalar && intrin->num_components > 1)
         ret = lower_subgroup_op_to_scalar(b, intrin, false);
      return ret;
   }
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   default:
      break;
   }

   return NULL;
}

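/* Entry point: runs the lowering over every intrinsic in the shader and
 * returns true if anything was lowered.
 */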
bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   return nir_shader_lower_instructions(shader,
                                        lower_subgroups_filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}