/*
 * Copyright (c) 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"

/** Build a 1-bit SSA value: "is the hit BVH leaf procedural (AABB) geometry?"
 *
 * For any-hit and intersection shaders the answer is known at compile time,
 * so we emit an immediate; for other stages it is computed from the leaf
 * type field loaded out of the MemHit structure.
 */
static nir_ssa_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
   switch (b->shader->info.stage) {
   case MESA_SHADER_ANY_HIT:
      /* Any-hit shaders are always compiled into intersection shaders for
       * procedural geometry.  If we got here in an any-hit shader, it's for
       * triangles.
       */
      return nir_imm_false(b);

   case MESA_SHADER_INTERSECTION:
      return nir_imm_true(b);

   default:
      return nir_ieq(b, hit->leaf_type,
                        nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
   }
}

/* Per-function-impl worker for brw_nir_lower_rt_intrinsics() below.
 *
 * First emits, at the top of the entry block, loads of the data that
 * lowered intrinsics may need: RT_DISPATCH_GLOBALS, the SW hotzone, and
 * (depending on the shader stage) the MemHit and world-/object-space MemRay
 * structures.  Then walks every instruction and replaces each ray-tracing
 * intrinsic with the corresponding value, removing the intrinsic.
 */
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   nir_builder build;
   nir_builder_init(&build, impl);
   nir_builder *b = &build;

   /* All the up-front loads below land at the very start of the shader. */
   b->cursor = nir_before_block(nir_start_block(b->impl));

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

   /* The SW hotzone is a 4-dword per-thread record: dw0 is consumed below as
    * the SW stack offset, and dw1-3 supply the ray launch ID (see the
    * load_ray_launch_id case).
    */
   nir_ssa_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_ssa_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      /* Hit-group stages have a hit (committed only for closest-hit) and an
       * object-space ray in addition to the world-space ray.
       */
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

   /* Base of this thread's SW spill/fill stack: per-thread base address plus
    * the current offset stored in hotzone dw0.
    */
   nir_ssa_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_ssa_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_ssa_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         /* Replacement code is emitted right after the intrinsic so the
          * rewritten uses dominate correctly.
          */
         b->cursor = nir_after_instr(&intrin->instr);

         /* When non-NULL after the switch, sysval replaces all uses of the
          * intrinsic's destination and the intrinsic is removed.
          */
         nir_ssa_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

         case nir_intrinsic_btd_stack_push_intel: {
            /* Advance the SW stack offset in the hotzone so the callee gets
             * its own stack region; no-op for a zero-size frame.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_ssa_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            /* Undo the push done before the call: pop our frame off the SW
             * stack and recompute the base address used from here on.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            assert(intrin->dest.is_ssa);
            assert(intrin->src[0].is_ssa);

            unsigned bit_size = intrin->dest.ssa.bit_size;
            assert(bit_size >= 8 && bit_size % 8 == 0);
            unsigned byte_size = bit_size / 8;

            /* Push constants live at BRW_RT_PUSH_CONST_OFFSET past the BTD
             * global argument pointer; turn the uniform load into a global
             * constant load from there.
             */
            if (nir_src_is_const(intrin->src[0])) {
               uint64_t offset = BRW_RT_PUSH_CONST_OFFSET +
                                 nir_intrinsic_base(intrin) +
                                 nir_src_as_uint(intrin->src[0]);

               /* Things should be component-aligned. */
               assert(offset % byte_size == 0);

               unsigned suboffset = offset % 64;
               uint64_t aligned_offset = offset - suboffset;

               /* Load two just in case we go over a 64B boundary */
               nir_ssa_def *data[2];
               for (unsigned i = 0; i < 2; i++) {
                  nir_ssa_def *addr =
                     nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b),
                                     aligned_offset + i * 64);
                  data[i] = nir_load_global_const_block_intel(b, 16, addr,
                                                              nir_imm_true(b));
               }

               sysval = nir_extract_bits(b, data, 2, suboffset * 8,
                                         intrin->num_components, bit_size);
            } else {
               /* Dynamic offset: compute the 64-bit address and do a plain
                * global constant load.
                */
               nir_ssa_def *offset32 =
                  nir_iadd_imm(b, intrin->src[0].ssa,
                                  BRW_RT_PUSH_CONST_OFFSET +
                                  nir_intrinsic_base(intrin));
               nir_ssa_def *addr =
                  nir_iadd(b, nir_load_btd_global_arg_addr_intel(b),
                              nir_u2u64(b, offset32));
               sysval = nir_load_global_constant(b, addr, byte_size,
                                                 intrin->num_components, bit_size);
            }
            break;
         }

         case nir_intrinsic_load_ray_launch_id:
            /* Launch ID is stored in hotzone dw1-3 (mask 0xe). */
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            sysval = object_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_object_direction:
            sysval = object_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            /* Miss shaders have no hit, so t_max comes from the ray itself;
             * hit-group stages read the (potentially committed) hit's T.
             */
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id: {
            /* It's in dw[3] for procedural and dw[2] for quad
             *
             * TODO: We really need some helpers here.
             */
            nir_ssa_def *offset =
               nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                            nir_iadd_imm(b, hit_in.prim_leaf_index, 12),
                            nir_imm_int(b, 8));
            sysval = nir_load_global(b, nir_iadd(b, hit_in.prim_leaf_ptr,
                                                    nir_u2u64(b, offset)),
                                     4, /* align */ 1, 32);
            break;
         }

         case nir_intrinsic_load_instance_id: {
            /* Fetched from the instance leaf the hit points at. */
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            /* One matrix column per intrinsic, selected by the column index. */
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            /* Triangles report front/back facing; procedural geometry
             * reports whatever kind the intersection shader committed.
             */
            nir_ssa_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                               hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            sysval = nir_u2u32(b, world_ray_in.ray_flags);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
            /* Geometry index is the low 29 bits of the primitive leaf's
             * second dword; the upper bits carry flags (see
             * load_leaf_opaque_intel below).
             */
            nir_ssa_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT.  The
             * shader call lowering/splitting pass should have ensured that
             * this value was spilled from the initial shader and unspilled
             * in any resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            /* Stack sizes in RT_DISPATCH_GLOBALS are in 64B units; callers
             * want bytes.
             */
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            /* The resume SBT address isn't known until link time, so it is
             * materialized from two 32-bit relocation constants.
             */
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               /* Otherwise read the opaque flag (bit 30) out of the
                * primitive leaf's flags dword.
                */
               nir_ssa_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         if (sysval) {
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     sysval);
            nir_instr_remove(&intrin->instr);
         }
      }
   }

   /* Only straight-line replacements were made; the CFG is untouched. */
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}

/** Lower ray-tracing system values and intrinsics
 *
 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
 * hardware functionality and system values represent magic bits that come
 * into the shader from FF hardware.  Ray-tracing, however, looks a bit more
 * like the OpenGL 1.0 world where the underlying hardware is simple and most
 * of the API implementation is software.
 *
 * In particular, most things that are treated as system values (or built-ins
 * in SPIR-V) don't get magically dropped into registers for us.  Instead, we
 * have to fetch them from the relevant data structures shared with the
 * ray-tracing hardware.  Most come from either the RT_DISPATCH_GLOBALS or
 * from one of the MemHit data structures.  Some, such as primitive_id require
 * us to fetch the leaf address from the MemHit struct and then manually read
 * the data out of the BVH.  Instead of trying to emit all this code deep in
 * the back-end where we can't effectively optimize it, we lower it all to
 * global memory access in NIR.
 *
 * Once this pass is complete, the only real system values left are the two
 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
 * btd_global_arg_addr.
 */
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
                            const struct intel_device_info *devinfo)
{
   nir_foreach_function(function, nir) {
      if (function->impl)
         lower_rt_intrinsics_impl(function->impl, devinfo);
   }
}