/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_RT_H
#define BRW_RT_H

/* Fixed-width integer types / UINT32_MAX and assert() are used below.
 *
 * NOTE: ALIGN(), struct intel_device_info and
 * intel_device_info_num_dual_subslices() are used by this header but are not
 * defined in it; they must already be visible at the point of inclusion.
 */
#include <assert.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/** Vulkan defines shaderGroupHandleSize = 32 */
#define BRW_RT_SBT_HANDLE_SIZE 32

/** Offset after the RT dispatch globals at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128

/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8

/* Vulkan always uses exactly two levels of BVH: world and object.  At the API
 * level, these are referred to as top and bottom.
 */
enum brw_rt_bvh_level {
   BRW_RT_BVH_LEVEL_WORLD = 0,
   BRW_RT_BVH_LEVEL_OBJECT = 1,
};
#define BRW_RT_MAX_BVH_LEVELS 2

/** BVH node type encodings (note that the value 2 is not used here) */
enum brw_rt_bvh_node_type {
   BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
   BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
   BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
   BRW_RT_BVH_NODE_TYPE_QUAD = 4,
};

/** HitKind values returned for triangle geometry
 *
 * This enum must match the SPIR-V enum.
 */
enum brw_rt_hit_kind {
   BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
   BRW_RT_HIT_KIND_BACK_FACE = 0xff,
};

/** Ray flags
 *
 * This enum must match the SPIR-V RayFlags enum.
 */
enum brw_rt_ray_flags {
   BRW_RT_RAY_FLAG_FORCE_OPAQUE = 0x01,
   BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE = 0x02,
   BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT = 0x04,
   BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER = 0x08,
   BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x10,
   BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20,
   BRW_RT_RAY_FLAG_CULL_OPAQUE = 0x40,
   BRW_RT_RAY_FLAG_CULL_NON_OPAQUE = 0x80,
   BRW_RT_RAY_FLAG_SKIP_TRIANGLES = 0x100,
   BRW_RT_RAY_FLAG_SKIP_AABBS = 0x200,
};

/** Layout of the RT scratch memory area
 *
 * Filled in by brw_rt_compute_scratch_layout().  The area consists of, in
 * order: the hot zones, the hardware ray stacks and the software stacks.
 */
struct brw_rt_scratch_layout {
   /** Number of stack IDs per DSS */
   uint32_t stack_ids_per_dss;

   /** Start offset (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_start;

   /** Stride (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_stride;

   /** Start offset (in bytes) of the SW stacks */
   uint64_t sw_stack_start;

   /** Size (in bytes) of the SW stack for a single shader invocation */
   uint32_t sw_stack_size;

   /** Total size (in bytes) of the RT scratch memory area */
   uint64_t total_size;
};

/** Parameters passed to the raygen trampoline shader
 *
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
 */
struct brw_rt_raygen_trampoline_params {
   /** The GPU address of the RT_DISPATCH_GLOBALS */
   uint64_t rt_disp_globals_addr;

   /** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
   uint64_t raygen_bsr_addr;

   /** 1 if this is an indirect dispatch, 0 otherwise */
   uint8_t is_indirect;

   /** The integer log2 of the local group size
    *
    * Ray-tracing shaders don't have a concept of local vs. global workgroup
    * size.  They only have a single 3D launch size.  The raygen trampoline
    * shader is always dispatched with a local workgroup size equal to the
    * SIMD width but the shape of the local workgroup is determined at
    * dispatch time based on the shape of the launch and passed to the
    * trampoline via this field.  (There's no sense having a Z dimension on
    * the local workgroup if the launch is 2D.)
    *
    * We use the integer log2 of the size because there's no point in
    * non-power-of-two sizes and shifts are cheaper than division.
    */
   uint8_t local_group_size_log2[3];

   /** Explicit padding: 8 + 8 + 1 + 3 + 12 = 32 bytes in total */
   uint32_t pad[3];
};

/** Size of the "hot zone" in bytes
 *
 * The hot zone is a SW-defined data structure which is a single uvec4
 * containing two pieces of information:
 *
 *  - hotzone.x: Stack offset (in bytes)
 *
 *    This is the offset (in bytes) into the per-thread scratch space at which
 *    the current shader's stack starts.  This is incremented by the calling
 *    shader prior to any shader call type instructions and gets decremented
 *    by the resume shader as part of completing the return operation.
 *
 *  - hotzone.yzw: The launch ID associated with the current thread
 *
 *    Inside a bindless shader, the only information we have is the DSS ID
 *    from the hardware EU and a per-DSS stack ID.  In particular, the three-
 *    dimensional launch ID is lost the moment we leave the raygen trampoline.
 */
#define BRW_RT_SIZEOF_HOTZONE 16

/* From the BSpec "Address Computation for Memory Based Data Structures:
 * Ray and TraversalStack (Async Ray Tracing)":
 *
 *    sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
 */
#define BRW_RT_SIZEOF_RAY 64
#define BRW_RT_SIZEOF_HIT_INFO 32
#define BRW_RT_SIZEOF_TRAV_STACK 32

/* From the BSpec:
 *
 *    syncStackSize = (maxBVHLevels % 2 == 1) ?
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
 *
 * The select is just to align to 64B.
 */
#define BRW_RT_SIZEOF_RAY_QUERY \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))

/* Size (in bytes) of the HW stack: two HitInfos plus one Ray and one
 * TravStack per BVH level.
 */
#define BRW_RT_SIZEOF_HW_STACK \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
    BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)

/* This is a mesa-defined region for hit attribute data */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK

/* Per-stack-ID stride of the async stacks: the HW stack followed by the hit
 * attribute data, rounded up to 64B.
 */
#define BRW_RT_ASYNC_STACK_STRIDE \
   ALIGN(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
         BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)

/** Compute the layout of the RT scratch memory area
 *
 * Fills every field of \p layout.  The scratch area is laid out as the hot
 * zones, followed by the HW ray stacks, followed by the SW stacks; each of
 * the three regions holds one entry per stack ID, where the number of stack
 * IDs is the device's dual-subslice count times \p stack_ids_per_dss.
 *
 * \param layout             Output; fully overwritten.
 * \param devinfo            Device info used to query the number of DSSes.
 * \param stack_ids_per_dss  Number of stack IDs per DSS.
 * \param sw_stack_size      Requested SW stack size (in bytes) for a single
 *                           shader invocation; rounded up to 64B.
 */
static inline void
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
                              const struct intel_device_info *devinfo,
                              uint32_t stack_ids_per_dss,
                              uint32_t sw_stack_size)
{
   layout->stack_ids_per_dss = stack_ids_per_dss;

   const uint32_t dss_count = intel_device_info_num_dual_subslices(devinfo);

   /* Keep the stack ID count in 64 bits so that every product below is
    * evaluated in 64-bit arithmetic.  A 32-bit multiply would wrap before
    * being accumulated into the 64-bit size and yield a too-small
    * sw_stack_start/total_size for large stack counts or stack sizes.
    */
   const uint64_t num_stack_ids = (uint64_t)dss_count * stack_ids_per_dss;

   uint64_t size = 0;

   /* The first thing in our scratch area is an array of "hot zones" which
    * store the stack offset as well as the launch IDs for each active
    * invocation.
    */
   size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;

   /* Next, we place the HW ray stacks.  The alignment assert implies that
    * the number of stack IDs must be a multiple of four (16B hot zones).
    */
   assert(size % 64 == 0); /* Cache-line aligned */
   assert(size < UINT32_MAX);
   layout->ray_stack_start = (uint32_t)size; /* Range-checked just above */
   layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
   size += num_stack_ids * layout->ray_stack_stride;

   /* Finally, we place the SW stacks for the individual ray-tracing shader
    * invocations.  We align these to 64B to ensure that we don't have any
    * shared cache lines which could hurt performance.
    */
   assert(size % 64 == 0);
   layout->sw_stack_start = size;
   layout->sw_stack_size = ALIGN(sw_stack_size, 64);
   size += num_stack_ids * layout->sw_stack_size;

   layout->total_size = size;
}

#ifdef __cplusplus
}
#endif

#endif /* BRW_RT_H */