1/*
2 * Copyright (c) 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_nir_rt.h"
25#include "brw_nir_rt_builder.h"
26
27static nir_ssa_def *
28build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
29{
30   switch (b->shader->info.stage) {
31   case MESA_SHADER_ANY_HIT:
32      /* Any-hit shaders are always compiled into intersection shaders for
33       * procedural geometry.  If we got here in an any-hit shader, it's for
34       * triangles.
35       */
36      return nir_imm_false(b);
37
38   case MESA_SHADER_INTERSECTION:
39      return nir_imm_true(b);
40
41   default:
42      return nir_ieq(b, hit->leaf_type,
43                        nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
44   }
45}
46
/* Lower all ray-tracing system-value and RT-specific intrinsics in a single
 * function to explicit global-memory loads from the RT dispatch globals,
 * the MemRay/MemHit structures, and the BVH leaves.  Shared data is loaded
 * once at the top of the shader; each lowered intrinsic is then replaced
 * with the corresponding SSA value.
 */
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   nir_builder build;
   nir_builder_init(&build, impl);
   nir_builder *b = &build;

   /* Emit all of the shared setup (globals, hotzone, ray/hit loads) at the
    * very top of the shader so every lowered intrinsic below can reuse it.
    */
   b->cursor = nir_before_block(nir_start_block(b->impl));

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

   /* The SW hotzone is a 16-byte-aligned vec4 of dwords: channel 0 holds
    * the per-thread SW stack offset (read below) and channels 1-3 supply
    * the ray launch ID (see load_ray_launch_id).
    */
   nir_ssa_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_ssa_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   /* Which MemRay/MemHit structures are valid depends on the stage: hit
    * stages load the hit data and both BVH-level rays; miss shaders only
    * have the world-level ray.  Other stages (e.g. raygen/callable) load
    * nothing and the zero-initialized defs below stay unused.
    */
   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

   /* This shader's SW stack base is the per-thread stack address plus the
    * offset currently recorded in hotzone channel 0.
    */
   nir_ssa_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_ssa_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_ssa_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   /* Assert-only ordering flags: rt_resume must precede any scratch base
    * pointer load, and load_shader_record_ptr must not follow a resume.
    */
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         b->cursor = nir_after_instr(&intrin->instr);

         /* When non-NULL after the switch, sysval replaces all uses of the
          * intrinsic's destination and the intrinsic is removed.
          */
         nir_ssa_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

         case nir_intrinsic_btd_stack_push_intel: {
            /* Record the callee's stack offset in the hotzone so a resumed
             * shader can later pop its portion of the SW stack.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_ssa_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            /* Pop this shader's portion of the SW stack: rewind the offset
             * stored in hotzone channel 0 and recompute the base address
             * used by later load_scratch_base_ptr lowering.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            assert(intrin->dest.is_ssa);
            assert(intrin->src[0].is_ssa);

            unsigned bit_size = intrin->dest.ssa.bit_size;
            assert(bit_size >= 8 && bit_size % 8 == 0);
            unsigned byte_size = bit_size / 8;

            /* Push constants live at BRW_RT_PUSH_CONST_OFFSET from the BTD
             * global argument address.  Constant offsets use 64B-aligned
             * block loads; dynamic offsets fall back to a regular constant
             * global load.
             */
            if (nir_src_is_const(intrin->src[0])) {
               uint64_t offset = BRW_RT_PUSH_CONST_OFFSET +
                                 nir_intrinsic_base(intrin) +
                                 nir_src_as_uint(intrin->src[0]);

               /* Things should be component-aligned. */
               assert(offset % byte_size == 0);

               unsigned suboffset = offset % 64;
               uint64_t aligned_offset = offset - suboffset;

               /* Load two just in case we go over a 64B boundary */
               nir_ssa_def *data[2];
               for (unsigned i = 0; i < 2; i++) {
                  nir_ssa_def *addr =
                     nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b),
                                     aligned_offset + i * 64);
                  data[i] = nir_load_global_const_block_intel(b, 16, addr,
                                                              nir_imm_true(b));
               }

               sysval = nir_extract_bits(b, data, 2, suboffset * 8,
                                         intrin->num_components, bit_size);
            } else {
               nir_ssa_def *offset32 =
                  nir_iadd_imm(b, intrin->src[0].ssa,
                                  BRW_RT_PUSH_CONST_OFFSET +
                                  nir_intrinsic_base(intrin));
               nir_ssa_def *addr =
                  nir_iadd(b, nir_load_btd_global_arg_addr_intel(b),
                              nir_u2u64(b, offset32));
               sysval = nir_load_global_constant(b, addr, byte_size,
                                                 intrin->num_components, bit_size);
            }
            break;
         }

         case nir_intrinsic_load_ray_launch_id:
            /* The launch ID is stored in hotzone channels 1-3 (mask 0xe). */
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            sysval = object_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_object_direction:
            sysval = object_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            /* Miss shaders have no hit, so use the ray's far value there. */
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id: {
            /* It's in dw[3] for procedural and dw[2] for quad
             *
             * TODO: We really need some helpers here.
             */
            nir_ssa_def *offset =
               nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                            nir_iadd_imm(b, hit_in.prim_leaf_index, 12),
                            nir_imm_int(b, 8));
            sysval = nir_load_global(b, nir_iadd(b, hit_in.prim_leaf_ptr,
                                                    nir_u2u64(b, offset)),
                                     4, /* align */ 1, 32);
            break;
         }

         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            /* One matrix column per intrinsic, selected by the column index. */
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            /* Triangles report front/back facing; procedural geometry
             * reports the hit kind recorded by the intersection shader.
             */
            nir_ssa_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                                  hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            sysval = nir_u2u32(b, world_ray_in.ray_flags);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
            /* The geometry index is the low 29 bits of the primitive leaf's
             * second dword.
             */
            nir_ssa_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT.  The shader
             * call lowering/splitting pass should have ensured that this
             * value was spilled from the initial shader and unspilled in any
             * resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            /* Stack sizes are stored in units of 64B; convert to bytes. */
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            /* The resume SBT address is patched in at upload time via two
             * 32-bit relocation constants.
             */
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               /* Otherwise, read the opaque bit (bit 30) out of the
                * primitive leaf's second dword.
                */
               nir_ssa_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         if (sysval) {
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     sysval);
            nir_instr_remove(&intrin->instr);
         }
      }
   }

   /* We only replaced instructions; no blocks were added or removed, so
    * block indices and dominance information remain valid.
    */
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}
379
380/** Lower ray-tracing system values and intrinsics
381 *
382 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
383 * hardware functionality and system values represent magic bits that come
384 * into the shader from FF hardware.  Ray-tracing, however, looks a bit more
385 * like the OpenGL 1.0 world where the underlying hardware is simple and most
386 * of the API implementation is software.
387 *
388 * In particular, most things that are treated as system values (or built-ins
389 * in SPIR-V) don't get magically dropped into registers for us.  Instead, we
390 * have to fetch them from the relevant data structures shared with the
391 * ray-tracing hardware.  Most come from either the RT_DISPATCH_GLOBALS or
392 * from one of the MemHit data structures.  Some, such as primitive_id require
393 * us to fetch the leaf address from the MemHit struct and then manually read
394 * the data out of the BVH.  Instead of trying to emit all this code deep in
395 * the back-end where we can't effectively optimize it, we lower it all to
396 * global memory access in NIR.
397 *
398 * Once this pass is complete, the only real system values left are the two
399 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
400 * btd_global_arg_addr.
401 */
402void
403brw_nir_lower_rt_intrinsics(nir_shader *nir,
404                            const struct intel_device_info *devinfo)
405{
406   nir_foreach_function(function, nir) {
407      if (function->impl)
408         lower_rt_intrinsics_impl(function->impl, devinfo);
409   }
410}
411