1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2013 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21b8e80941Smrg * DEALINGS IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg/** 25b8e80941Smrg * \file brw_vec4_tcs.cpp 26b8e80941Smrg * 27b8e80941Smrg * Tessellaton control shader specific code derived from the vec4_visitor class. 28b8e80941Smrg */ 29b8e80941Smrg 30b8e80941Smrg#include "brw_nir.h" 31b8e80941Smrg#include "brw_vec4_tcs.h" 32b8e80941Smrg#include "brw_fs.h" 33b8e80941Smrg#include "dev/gen_debug.h" 34b8e80941Smrg 35b8e80941Smrgnamespace brw { 36b8e80941Smrg 37b8e80941Smrgvec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, 38b8e80941Smrg void *log_data, 39b8e80941Smrg const struct brw_tcs_prog_key *key, 40b8e80941Smrg struct brw_tcs_prog_data *prog_data, 41b8e80941Smrg const nir_shader *nir, 42b8e80941Smrg void *mem_ctx, 43b8e80941Smrg int shader_time_index, 44b8e80941Smrg const struct brw_vue_map *input_vue_map) 45b8e80941Smrg : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, 46b8e80941Smrg nir, mem_ctx, false, shader_time_index), 47b8e80941Smrg input_vue_map(input_vue_map), key(key) 48b8e80941Smrg{ 49b8e80941Smrg} 50b8e80941Smrg 51b8e80941Smrg 52b8e80941Smrgvoid 53b8e80941Smrgvec4_tcs_visitor::setup_payload() 54b8e80941Smrg{ 55b8e80941Smrg int reg = 0; 56b8e80941Smrg 57b8e80941Smrg /* The payload always contains important data in r0, which contains 58b8e80941Smrg * the URB handles that are passed on to the URB write at the end 59b8e80941Smrg * of the thread. 60b8e80941Smrg */ 61b8e80941Smrg reg++; 62b8e80941Smrg 63b8e80941Smrg /* r1.0 - r4.7 may contain the input control point URB handles, 64b8e80941Smrg * which we use to pull vertex data. 65b8e80941Smrg */ 66b8e80941Smrg reg += 4; 67b8e80941Smrg 68b8e80941Smrg /* Push constants may start at r5.0 */ 69b8e80941Smrg reg = setup_uniforms(reg); 70b8e80941Smrg 71b8e80941Smrg this->first_non_payload_grf = reg; 72b8e80941Smrg} 73b8e80941Smrg 74b8e80941Smrg 75b8e80941Smrgvoid 76b8e80941Smrgvec4_tcs_visitor::emit_prolog() 77b8e80941Smrg{ 78b8e80941Smrg invocation_id = src_reg(this, glsl_type::uint_type); 79b8e80941Smrg emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); 80b8e80941Smrg 81b8e80941Smrg /* HS threads are dispatched with the dispatch mask set to 0xFF. 82b8e80941Smrg * If there are an odd number of output vertices, then the final 83b8e80941Smrg * HS instance dispatched will only have its bottom half doing real 84b8e80941Smrg * work, and so we need to disable the upper half: 85b8e80941Smrg */ 86b8e80941Smrg if (nir->info.tess.tcs_vertices_out % 2) { 87b8e80941Smrg emit(CMP(dst_null_d(), invocation_id, 88b8e80941Smrg brw_imm_ud(nir->info.tess.tcs_vertices_out), 89b8e80941Smrg BRW_CONDITIONAL_L)); 90b8e80941Smrg 91b8e80941Smrg /* Matching ENDIF is in emit_thread_end() */ 92b8e80941Smrg emit(IF(BRW_PREDICATE_NORMAL)); 93b8e80941Smrg } 94b8e80941Smrg} 95b8e80941Smrg 96b8e80941Smrg 97b8e80941Smrgvoid 98b8e80941Smrgvec4_tcs_visitor::emit_thread_end() 99b8e80941Smrg{ 100b8e80941Smrg vec4_instruction *inst; 101b8e80941Smrg current_annotation = "thread end"; 102b8e80941Smrg 103b8e80941Smrg if (nir->info.tess.tcs_vertices_out % 2) { 104b8e80941Smrg emit(BRW_OPCODE_ENDIF); 105b8e80941Smrg } 106b8e80941Smrg 107b8e80941Smrg if (devinfo->gen == 7) { 108b8e80941Smrg struct brw_tcs_prog_data *tcs_prog_data = 109b8e80941Smrg (struct brw_tcs_prog_data *) prog_data; 110b8e80941Smrg 111b8e80941Smrg current_annotation = "release input vertices"; 112b8e80941Smrg 113b8e80941Smrg /* Synchronize all threads, so we know that no one is still 114b8e80941Smrg * using the input URB handles. 115b8e80941Smrg */ 116b8e80941Smrg if (tcs_prog_data->instances > 1) { 117b8e80941Smrg dst_reg header = dst_reg(this, glsl_type::uvec4_type); 118b8e80941Smrg emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); 119b8e80941Smrg emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); 120b8e80941Smrg } 121b8e80941Smrg 122b8e80941Smrg /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles. 123b8e80941Smrg * We want to compare the bottom half of invocation_id with 0, but 124b8e80941Smrg * use that truth value for the top half as well. Unfortunately, 125b8e80941Smrg * we don't have stride in the vec4 world, nor UV immediates in 126b8e80941Smrg * align16, so we need an opcode to get invocation_id<0,4,0>. 127b8e80941Smrg */ 128b8e80941Smrg set_condmod(BRW_CONDITIONAL_Z, 129b8e80941Smrg emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), 130b8e80941Smrg invocation_id)); 131b8e80941Smrg emit(IF(BRW_PREDICATE_NORMAL)); 132b8e80941Smrg for (unsigned i = 0; i < key->input_vertices; i += 2) { 133b8e80941Smrg /* If we have an odd number of input vertices, the last will be 134b8e80941Smrg * unpaired. We don't want to use an interleaved URB write in 135b8e80941Smrg * that case. 136b8e80941Smrg */ 137b8e80941Smrg const bool is_unpaired = i == key->input_vertices - 1; 138b8e80941Smrg 139b8e80941Smrg dst_reg header(this, glsl_type::uvec4_type); 140b8e80941Smrg emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i), 141b8e80941Smrg brw_imm_ud(is_unpaired)); 142b8e80941Smrg } 143b8e80941Smrg emit(BRW_OPCODE_ENDIF); 144b8e80941Smrg } 145b8e80941Smrg 146b8e80941Smrg if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) 147b8e80941Smrg emit_shader_time_end(); 148b8e80941Smrg 149b8e80941Smrg inst = emit(TCS_OPCODE_THREAD_END); 150b8e80941Smrg inst->base_mrf = 14; 151b8e80941Smrg inst->mlen = 2; 152b8e80941Smrg} 153b8e80941Smrg 154b8e80941Smrg 155b8e80941Smrgvoid 156b8e80941Smrgvec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, 157b8e80941Smrg const src_reg &vertex_index, 158b8e80941Smrg unsigned base_offset, 159b8e80941Smrg unsigned first_component, 160b8e80941Smrg const src_reg &indirect_offset) 161b8e80941Smrg{ 162b8e80941Smrg vec4_instruction *inst; 163b8e80941Smrg dst_reg temp(this, glsl_type::ivec4_type); 164b8e80941Smrg temp.type = dst.type; 165b8e80941Smrg 166b8e80941Smrg /* Set up the message header to reference the proper parts of the URB */ 167b8e80941Smrg dst_reg header = dst_reg(this, glsl_type::uvec4_type); 168b8e80941Smrg inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, 169b8e80941Smrg indirect_offset); 170b8e80941Smrg inst->force_writemask_all = true; 171b8e80941Smrg 172b8e80941Smrg /* Read into a temporary, ignoring writemasking. */ 173b8e80941Smrg inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); 174b8e80941Smrg inst->offset = base_offset; 175b8e80941Smrg inst->mlen = 1; 176b8e80941Smrg inst->base_mrf = -1; 177b8e80941Smrg 178b8e80941Smrg /* Copy the temporary to the destination to deal with writemasking. 179b8e80941Smrg * 180b8e80941Smrg * Also attempt to deal with gl_PointSize being in the .w component. 181b8e80941Smrg */ 182b8e80941Smrg if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 183b8e80941Smrg emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); 184b8e80941Smrg } else { 185b8e80941Smrg src_reg src = src_reg(temp); 186b8e80941Smrg src.swizzle = BRW_SWZ_COMP_INPUT(first_component); 187b8e80941Smrg emit(MOV(dst, src)); 188b8e80941Smrg } 189b8e80941Smrg} 190b8e80941Smrg 191b8e80941Smrgvoid 192b8e80941Smrgvec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, 193b8e80941Smrg unsigned base_offset, 194b8e80941Smrg unsigned first_component, 195b8e80941Smrg const src_reg &indirect_offset) 196b8e80941Smrg{ 197b8e80941Smrg vec4_instruction *inst; 198b8e80941Smrg 199b8e80941Smrg /* Set up the message header to reference the proper parts of the URB */ 200b8e80941Smrg dst_reg header = dst_reg(this, glsl_type::uvec4_type); 201b8e80941Smrg inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, 202b8e80941Smrg brw_imm_ud(dst.writemask << first_component), indirect_offset); 203b8e80941Smrg inst->force_writemask_all = true; 204b8e80941Smrg 205b8e80941Smrg vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); 206b8e80941Smrg read->offset = base_offset; 207b8e80941Smrg read->mlen = 1; 208b8e80941Smrg read->base_mrf = -1; 209b8e80941Smrg 210b8e80941Smrg if (first_component) { 211b8e80941Smrg /* Read into a temporary and copy with a swizzle and writemask. */ 212b8e80941Smrg read->dst = retype(dst_reg(this, glsl_type::ivec4_type), dst.type); 213b8e80941Smrg emit(MOV(dst, swizzle(src_reg(read->dst), 214b8e80941Smrg BRW_SWZ_COMP_INPUT(first_component)))); 215b8e80941Smrg } 216b8e80941Smrg} 217b8e80941Smrg 218b8e80941Smrgvoid 219b8e80941Smrgvec4_tcs_visitor::emit_urb_write(const src_reg &value, 220b8e80941Smrg unsigned writemask, 221b8e80941Smrg unsigned base_offset, 222b8e80941Smrg const src_reg &indirect_offset) 223b8e80941Smrg{ 224b8e80941Smrg if (writemask == 0) 225b8e80941Smrg return; 226b8e80941Smrg 227b8e80941Smrg src_reg message(this, glsl_type::uvec4_type, 2); 228b8e80941Smrg vec4_instruction *inst; 229b8e80941Smrg 230b8e80941Smrg inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), 231b8e80941Smrg brw_imm_ud(writemask), indirect_offset); 232b8e80941Smrg inst->force_writemask_all = true; 233b8e80941Smrg inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE), 234b8e80941Smrg value)); 235b8e80941Smrg inst->force_writemask_all = true; 236b8e80941Smrg 237b8e80941Smrg inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); 238b8e80941Smrg inst->offset = base_offset; 239b8e80941Smrg inst->mlen = 2; 240b8e80941Smrg inst->base_mrf = -1; 241b8e80941Smrg} 242b8e80941Smrg 243b8e80941Smrgvoid 244b8e80941Smrgvec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) 245b8e80941Smrg{ 246b8e80941Smrg switch (instr->intrinsic) { 247b8e80941Smrg case nir_intrinsic_load_invocation_id: 248b8e80941Smrg emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD), 249b8e80941Smrg invocation_id)); 250b8e80941Smrg break; 251b8e80941Smrg case nir_intrinsic_load_primitive_id: 252b8e80941Smrg emit(TCS_OPCODE_GET_PRIMITIVE_ID, 253b8e80941Smrg get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); 254b8e80941Smrg break; 255b8e80941Smrg case nir_intrinsic_load_patch_vertices_in: 256b8e80941Smrg emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D), 257b8e80941Smrg brw_imm_d(key->input_vertices))); 258b8e80941Smrg break; 259b8e80941Smrg case nir_intrinsic_load_per_vertex_input: { 260b8e80941Smrg src_reg indirect_offset = get_indirect_offset(instr); 261b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 262b8e80941Smrg 263b8e80941Smrg src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]), 264b8e80941Smrg BRW_REGISTER_TYPE_UD); 265b8e80941Smrg 266b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 267b8e80941Smrg if (nir_dest_bit_size(instr->dest) == 64) { 268b8e80941Smrg /* We need to emit up to two 32-bit URB reads, then shuffle 269b8e80941Smrg * the result into a temporary, then move to the destination 270b8e80941Smrg * honoring the writemask 271b8e80941Smrg * 272b8e80941Smrg * We don't need to divide first_component by 2 because 273b8e80941Smrg * emit_input_urb_read takes a 32-bit type. 274b8e80941Smrg */ 275b8e80941Smrg dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); 276b8e80941Smrg dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D); 277b8e80941Smrg emit_input_urb_read(tmp_d, vertex_index, imm_offset, 278b8e80941Smrg first_component, indirect_offset); 279b8e80941Smrg if (instr->num_components > 2) { 280b8e80941Smrg emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index, 281b8e80941Smrg imm_offset + 1, 0, indirect_offset); 282b8e80941Smrg } 283b8e80941Smrg 284b8e80941Smrg src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF); 285b8e80941Smrg dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); 286b8e80941Smrg shuffle_64bit_data(shuffled, tmp_src, false); 287b8e80941Smrg 288b8e80941Smrg dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF); 289b8e80941Smrg dst.writemask = brw_writemask_for_size(instr->num_components); 290b8e80941Smrg emit(MOV(dst, src_reg(shuffled))); 291b8e80941Smrg } else { 292b8e80941Smrg dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); 293b8e80941Smrg dst.writemask = brw_writemask_for_size(instr->num_components); 294b8e80941Smrg emit_input_urb_read(dst, vertex_index, imm_offset, 295b8e80941Smrg first_component, indirect_offset); 296b8e80941Smrg } 297b8e80941Smrg break; 298b8e80941Smrg } 299b8e80941Smrg case nir_intrinsic_load_input: 300b8e80941Smrg unreachable("nir_lower_io should use load_per_vertex_input intrinsics"); 301b8e80941Smrg break; 302b8e80941Smrg case nir_intrinsic_load_output: 303b8e80941Smrg case nir_intrinsic_load_per_vertex_output: { 304b8e80941Smrg src_reg indirect_offset = get_indirect_offset(instr); 305b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 306b8e80941Smrg 307b8e80941Smrg dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); 308b8e80941Smrg dst.writemask = brw_writemask_for_size(instr->num_components); 309b8e80941Smrg 310b8e80941Smrg emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr), 311b8e80941Smrg indirect_offset); 312b8e80941Smrg break; 313b8e80941Smrg } 314b8e80941Smrg case nir_intrinsic_store_output: 315b8e80941Smrg case nir_intrinsic_store_per_vertex_output: { 316b8e80941Smrg src_reg value = get_nir_src(instr->src[0]); 317b8e80941Smrg unsigned mask = instr->const_index[1]; 318b8e80941Smrg unsigned swiz = BRW_SWIZZLE_XYZW; 319b8e80941Smrg 320b8e80941Smrg src_reg indirect_offset = get_indirect_offset(instr); 321b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 322b8e80941Smrg 323b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 324b8e80941Smrg if (first_component) { 325b8e80941Smrg if (nir_src_bit_size(instr->src[0]) == 64) 326b8e80941Smrg first_component /= 2; 327b8e80941Smrg assert(swiz == BRW_SWIZZLE_XYZW); 328b8e80941Smrg swiz = BRW_SWZ_COMP_OUTPUT(first_component); 329b8e80941Smrg mask = mask << first_component; 330b8e80941Smrg } 331b8e80941Smrg 332b8e80941Smrg if (nir_src_bit_size(instr->src[0]) == 64) { 333b8e80941Smrg /* For 64-bit data we need to shuffle the data before we write and 334b8e80941Smrg * emit two messages. Also, since each channel is twice as large we 335b8e80941Smrg * need to fix the writemask in each 32-bit message to account for it. 336b8e80941Smrg */ 337b8e80941Smrg value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz); 338b8e80941Smrg dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); 339b8e80941Smrg shuffle_64bit_data(shuffled, value, true); 340b8e80941Smrg src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); 341b8e80941Smrg 342b8e80941Smrg for (int n = 0; n < 2; n++) { 343b8e80941Smrg unsigned fixed_mask = 0; 344b8e80941Smrg if (mask & WRITEMASK_X) 345b8e80941Smrg fixed_mask |= WRITEMASK_XY; 346b8e80941Smrg if (mask & WRITEMASK_Y) 347b8e80941Smrg fixed_mask |= WRITEMASK_ZW; 348b8e80941Smrg emit_urb_write(shuffled_float, fixed_mask, 349b8e80941Smrg imm_offset, indirect_offset); 350b8e80941Smrg 351b8e80941Smrg shuffled_float = byte_offset(shuffled_float, REG_SIZE); 352b8e80941Smrg mask >>= 2; 353b8e80941Smrg imm_offset++; 354b8e80941Smrg } 355b8e80941Smrg } else { 356b8e80941Smrg emit_urb_write(swizzle(value, swiz), mask, 357b8e80941Smrg imm_offset, indirect_offset); 358b8e80941Smrg } 359b8e80941Smrg break; 360b8e80941Smrg } 361b8e80941Smrg 362b8e80941Smrg case nir_intrinsic_barrier: { 363b8e80941Smrg dst_reg header = dst_reg(this, glsl_type::uvec4_type); 364b8e80941Smrg emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); 365b8e80941Smrg emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); 366b8e80941Smrg break; 367b8e80941Smrg } 368b8e80941Smrg 369b8e80941Smrg default: 370b8e80941Smrg vec4_visitor::nir_emit_intrinsic(instr); 371b8e80941Smrg } 372b8e80941Smrg} 373b8e80941Smrg 374b8e80941Smrg 375b8e80941Smrgextern "C" const unsigned * 376b8e80941Smrgbrw_compile_tcs(const struct brw_compiler *compiler, 377b8e80941Smrg void *log_data, 378b8e80941Smrg void *mem_ctx, 379b8e80941Smrg const struct brw_tcs_prog_key *key, 380b8e80941Smrg struct brw_tcs_prog_data *prog_data, 381b8e80941Smrg nir_shader *nir, 382b8e80941Smrg int shader_time_index, 383b8e80941Smrg char **error_str) 384b8e80941Smrg{ 385b8e80941Smrg const struct gen_device_info *devinfo = compiler->devinfo; 386b8e80941Smrg struct brw_vue_prog_data *vue_prog_data = &prog_data->base; 387b8e80941Smrg const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; 388b8e80941Smrg const unsigned *assembly; 389b8e80941Smrg 390b8e80941Smrg nir->info.outputs_written = key->outputs_written; 391b8e80941Smrg nir->info.patch_outputs_written = key->patch_outputs_written; 392b8e80941Smrg 393b8e80941Smrg struct brw_vue_map input_vue_map; 394b8e80941Smrg brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read, 395b8e80941Smrg nir->info.separate_shader); 396b8e80941Smrg brw_compute_tess_vue_map(&vue_prog_data->vue_map, 397b8e80941Smrg nir->info.outputs_written, 398b8e80941Smrg nir->info.patch_outputs_written); 399b8e80941Smrg 400b8e80941Smrg nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar); 401b8e80941Smrg brw_nir_lower_vue_inputs(nir, &input_vue_map); 402b8e80941Smrg brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map, 403b8e80941Smrg key->tes_primitive_mode); 404b8e80941Smrg if (key->quads_workaround) 405b8e80941Smrg brw_nir_apply_tcs_quads_workaround(nir); 406b8e80941Smrg 407b8e80941Smrg nir = brw_postprocess_nir(nir, compiler, is_scalar); 408b8e80941Smrg 409b8e80941Smrg if (is_scalar) 410b8e80941Smrg prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 8); 411b8e80941Smrg else 412b8e80941Smrg prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, 2); 413b8e80941Smrg 414b8e80941Smrg /* Compute URB entry size. The maximum allowed URB entry size is 32k. 415b8e80941Smrg * That divides up as follows: 416b8e80941Smrg * 417b8e80941Smrg * 32 bytes for the patch header (tessellation factors) 418b8e80941Smrg * 480 bytes for per-patch varyings (a varying component is 4 bytes and 419b8e80941Smrg * gl_MaxTessPatchComponents = 120) 420b8e80941Smrg * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, 421b8e80941Smrg * gl_MaxPatchVertices = 32 and 422b8e80941Smrg * gl_MaxTessControlOutputComponents = 128) 423b8e80941Smrg * 424b8e80941Smrg * 15808 bytes left for varying packing overhead 425b8e80941Smrg */ 426b8e80941Smrg const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots; 427b8e80941Smrg const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots; 428b8e80941Smrg unsigned output_size_bytes = 0; 429b8e80941Smrg /* Note that the patch header is counted in num_per_patch_slots. */ 430b8e80941Smrg output_size_bytes += num_per_patch_slots * 16; 431b8e80941Smrg output_size_bytes += nir->info.tess.tcs_vertices_out * 432b8e80941Smrg num_per_vertex_slots * 16; 433b8e80941Smrg 434b8e80941Smrg assert(output_size_bytes >= 1); 435b8e80941Smrg if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) 436b8e80941Smrg return NULL; 437b8e80941Smrg 438b8e80941Smrg /* URB entry sizes are stored as a multiple of 64 bytes. */ 439b8e80941Smrg vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; 440b8e80941Smrg 441b8e80941Smrg /* On Cannonlake software shall not program an allocation size that 442b8e80941Smrg * specifies a size that is a multiple of 3 64B (512-bit) cachelines. 443b8e80941Smrg */ 444b8e80941Smrg if (devinfo->gen == 10 && 445b8e80941Smrg vue_prog_data->urb_entry_size % 3 == 0) 446b8e80941Smrg vue_prog_data->urb_entry_size++; 447b8e80941Smrg 448b8e80941Smrg /* HS does not use the usual payload pushing from URB to GRFs, 449b8e80941Smrg * because we don't have enough registers for a full-size payload, and 450b8e80941Smrg * the hardware is broken on Haswell anyway. 451b8e80941Smrg */ 452b8e80941Smrg vue_prog_data->urb_read_length = 0; 453b8e80941Smrg 454b8e80941Smrg if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { 455b8e80941Smrg fprintf(stderr, "TCS Input "); 456b8e80941Smrg brw_print_vue_map(stderr, &input_vue_map); 457b8e80941Smrg fprintf(stderr, "TCS Output "); 458b8e80941Smrg brw_print_vue_map(stderr, &vue_prog_data->vue_map); 459b8e80941Smrg } 460b8e80941Smrg 461b8e80941Smrg if (is_scalar) { 462b8e80941Smrg fs_visitor v(compiler, log_data, mem_ctx, (void *) key, 463b8e80941Smrg &prog_data->base.base, NULL, nir, 8, 464b8e80941Smrg shader_time_index, &input_vue_map); 465b8e80941Smrg if (!v.run_tcs_single_patch()) { 466b8e80941Smrg if (error_str) 467b8e80941Smrg *error_str = ralloc_strdup(mem_ctx, v.fail_msg); 468b8e80941Smrg return NULL; 469b8e80941Smrg } 470b8e80941Smrg 471b8e80941Smrg prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; 472b8e80941Smrg prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; 473b8e80941Smrg 474b8e80941Smrg fs_generator g(compiler, log_data, mem_ctx, 475b8e80941Smrg &prog_data->base.base, v.promoted_constants, false, 476b8e80941Smrg MESA_SHADER_TESS_CTRL); 477b8e80941Smrg if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { 478b8e80941Smrg g.enable_debug(ralloc_asprintf(mem_ctx, 479b8e80941Smrg "%s tessellation control shader %s", 480b8e80941Smrg nir->info.label ? nir->info.label 481b8e80941Smrg : "unnamed", 482b8e80941Smrg nir->info.name)); 483b8e80941Smrg } 484b8e80941Smrg 485b8e80941Smrg g.generate_code(v.cfg, 8); 486b8e80941Smrg 487b8e80941Smrg assembly = g.get_assembly(); 488b8e80941Smrg } else { 489b8e80941Smrg vec4_tcs_visitor v(compiler, log_data, key, prog_data, 490b8e80941Smrg nir, mem_ctx, shader_time_index, &input_vue_map); 491b8e80941Smrg if (!v.run()) { 492b8e80941Smrg if (error_str) 493b8e80941Smrg *error_str = ralloc_strdup(mem_ctx, v.fail_msg); 494b8e80941Smrg return NULL; 495b8e80941Smrg } 496b8e80941Smrg 497b8e80941Smrg if (unlikely(INTEL_DEBUG & DEBUG_TCS)) 498b8e80941Smrg v.dump_instructions(); 499b8e80941Smrg 500b8e80941Smrg 501b8e80941Smrg assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, 502b8e80941Smrg &prog_data->base, v.cfg); 503b8e80941Smrg } 504b8e80941Smrg 505b8e80941Smrg return assembly; 506b8e80941Smrg} 507b8e80941Smrg 508b8e80941Smrg 509b8e80941Smrg} /* namespace brw */ 510