1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2010 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#include "compiler/glsl/ir.h" 25b8e80941Smrg#include "brw_fs.h" 26b8e80941Smrg#include "brw_nir.h" 27b8e80941Smrg#include "nir_search_helpers.h" 28b8e80941Smrg#include "util/u_math.h" 29b8e80941Smrg#include "util/bitscan.h" 30b8e80941Smrg 31b8e80941Smrgusing namespace brw; 32b8e80941Smrg 33b8e80941Smrgvoid 34b8e80941Smrgfs_visitor::emit_nir_code() 35b8e80941Smrg{ 36b8e80941Smrg /* emit the arrays used for inputs and outputs - load/store intrinsics will 37b8e80941Smrg * be converted to reads/writes of these arrays 38b8e80941Smrg */ 39b8e80941Smrg nir_setup_outputs(); 40b8e80941Smrg nir_setup_uniforms(); 41b8e80941Smrg nir_emit_system_values(); 42b8e80941Smrg 43b8e80941Smrg nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); 44b8e80941Smrg} 45b8e80941Smrg 46b8e80941Smrgvoid 47b8e80941Smrgfs_visitor::nir_setup_outputs() 48b8e80941Smrg{ 49b8e80941Smrg if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT) 50b8e80941Smrg return; 51b8e80941Smrg 52b8e80941Smrg unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, }; 53b8e80941Smrg 54b8e80941Smrg /* Calculate the size of output registers in a separate pass, before 55b8e80941Smrg * allocating them. With ARB_enhanced_layouts, multiple output variables 56b8e80941Smrg * may occupy the same slot, but have different type sizes. 57b8e80941Smrg */ 58b8e80941Smrg nir_foreach_variable(var, &nir->outputs) { 59b8e80941Smrg const int loc = var->data.driver_location; 60b8e80941Smrg const unsigned var_vec4s = 61b8e80941Smrg var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) 62b8e80941Smrg : type_size_vec4(var->type, true); 63b8e80941Smrg vec4s[loc] = MAX2(vec4s[loc], var_vec4s); 64b8e80941Smrg } 65b8e80941Smrg 66b8e80941Smrg for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) { 67b8e80941Smrg if (vec4s[loc] == 0) { 68b8e80941Smrg loc++; 69b8e80941Smrg continue; 70b8e80941Smrg } 71b8e80941Smrg 72b8e80941Smrg unsigned reg_size = vec4s[loc]; 73b8e80941Smrg 74b8e80941Smrg /* Check if there are any ranges that start within this range and extend 75b8e80941Smrg * past it. If so, include them in this allocation. 76b8e80941Smrg */ 77b8e80941Smrg for (unsigned i = 1; i < reg_size; i++) 78b8e80941Smrg reg_size = MAX2(vec4s[i + loc] + i, reg_size); 79b8e80941Smrg 80b8e80941Smrg fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size); 81b8e80941Smrg for (unsigned i = 0; i < reg_size; i++) 82b8e80941Smrg outputs[loc + i] = offset(reg, bld, 4 * i); 83b8e80941Smrg 84b8e80941Smrg loc += reg_size; 85b8e80941Smrg } 86b8e80941Smrg} 87b8e80941Smrg 88b8e80941Smrgvoid 89b8e80941Smrgfs_visitor::nir_setup_uniforms() 90b8e80941Smrg{ 91b8e80941Smrg /* Only the first compile gets to set up uniforms. */ 92b8e80941Smrg if (push_constant_loc) { 93b8e80941Smrg assert(pull_constant_loc); 94b8e80941Smrg return; 95b8e80941Smrg } 96b8e80941Smrg 97b8e80941Smrg uniforms = nir->num_uniforms / 4; 98b8e80941Smrg 99b8e80941Smrg if (stage == MESA_SHADER_COMPUTE) { 100b8e80941Smrg /* Add a uniform for the thread local id. It must be the last uniform 101b8e80941Smrg * on the list. 102b8e80941Smrg */ 103b8e80941Smrg assert(uniforms == prog_data->nr_params); 104b8e80941Smrg uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); 105b8e80941Smrg *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; 106b8e80941Smrg subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); 107b8e80941Smrg } 108b8e80941Smrg} 109b8e80941Smrg 110b8e80941Smrgstatic bool 111b8e80941Smrgemit_system_values_block(nir_block *block, fs_visitor *v) 112b8e80941Smrg{ 113b8e80941Smrg fs_reg *reg; 114b8e80941Smrg 115b8e80941Smrg nir_foreach_instr(instr, block) { 116b8e80941Smrg if (instr->type != nir_instr_type_intrinsic) 117b8e80941Smrg continue; 118b8e80941Smrg 119b8e80941Smrg nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 120b8e80941Smrg switch (intrin->intrinsic) { 121b8e80941Smrg case nir_intrinsic_load_vertex_id: 122b8e80941Smrg case nir_intrinsic_load_base_vertex: 123b8e80941Smrg unreachable("should be lowered by nir_lower_system_values()."); 124b8e80941Smrg 125b8e80941Smrg case nir_intrinsic_load_vertex_id_zero_base: 126b8e80941Smrg case nir_intrinsic_load_is_indexed_draw: 127b8e80941Smrg case nir_intrinsic_load_first_vertex: 128b8e80941Smrg case nir_intrinsic_load_instance_id: 129b8e80941Smrg case nir_intrinsic_load_base_instance: 130b8e80941Smrg case nir_intrinsic_load_draw_id: 131b8e80941Smrg unreachable("should be lowered by brw_nir_lower_vs_inputs()."); 132b8e80941Smrg 133b8e80941Smrg case nir_intrinsic_load_invocation_id: 134b8e80941Smrg if (v->stage == MESA_SHADER_TESS_CTRL) 135b8e80941Smrg break; 136b8e80941Smrg assert(v->stage == MESA_SHADER_GEOMETRY); 137b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 138b8e80941Smrg if (reg->file == BAD_FILE) { 139b8e80941Smrg const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL); 140b8e80941Smrg fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 141b8e80941Smrg fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 142b8e80941Smrg abld.SHR(iid, g1, brw_imm_ud(27u)); 143b8e80941Smrg *reg = iid; 144b8e80941Smrg } 145b8e80941Smrg break; 146b8e80941Smrg 147b8e80941Smrg case nir_intrinsic_load_sample_pos: 148b8e80941Smrg assert(v->stage == MESA_SHADER_FRAGMENT); 149b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 150b8e80941Smrg if (reg->file == BAD_FILE) 151b8e80941Smrg *reg = *v->emit_samplepos_setup(); 152b8e80941Smrg break; 153b8e80941Smrg 154b8e80941Smrg case nir_intrinsic_load_sample_id: 155b8e80941Smrg assert(v->stage == MESA_SHADER_FRAGMENT); 156b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 157b8e80941Smrg if (reg->file == BAD_FILE) 158b8e80941Smrg *reg = *v->emit_sampleid_setup(); 159b8e80941Smrg break; 160b8e80941Smrg 161b8e80941Smrg case nir_intrinsic_load_sample_mask_in: 162b8e80941Smrg assert(v->stage == MESA_SHADER_FRAGMENT); 163b8e80941Smrg assert(v->devinfo->gen >= 7); 164b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; 165b8e80941Smrg if (reg->file == BAD_FILE) 166b8e80941Smrg *reg = *v->emit_samplemaskin_setup(); 167b8e80941Smrg break; 168b8e80941Smrg 169b8e80941Smrg case nir_intrinsic_load_work_group_id: 170b8e80941Smrg assert(v->stage == MESA_SHADER_COMPUTE); 171b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; 172b8e80941Smrg if (reg->file == BAD_FILE) 173b8e80941Smrg *reg = *v->emit_cs_work_group_id_setup(); 174b8e80941Smrg break; 175b8e80941Smrg 176b8e80941Smrg case nir_intrinsic_load_helper_invocation: 177b8e80941Smrg assert(v->stage == MESA_SHADER_FRAGMENT); 178b8e80941Smrg reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION]; 179b8e80941Smrg if (reg->file == BAD_FILE) { 180b8e80941Smrg const fs_builder abld = 181b8e80941Smrg v->bld.annotate("gl_HelperInvocation", NULL); 182b8e80941Smrg 183b8e80941Smrg /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the 184b8e80941Smrg * pixel mask is in g1.7 of the thread payload. 185b8e80941Smrg * 186b8e80941Smrg * We move the per-channel pixel enable bit to the low bit of each 187b8e80941Smrg * channel by shifting the byte containing the pixel mask by the 188b8e80941Smrg * vector immediate 0x76543210UV. 189b8e80941Smrg * 190b8e80941Smrg * The region of <1,8,0> reads only 1 byte (the pixel masks for 191b8e80941Smrg * subspans 0 and 1) in SIMD8 and an additional byte (the pixel 192b8e80941Smrg * masks for 2 and 3) in SIMD16. 193b8e80941Smrg */ 194b8e80941Smrg fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); 195b8e80941Smrg 196b8e80941Smrg for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) { 197b8e80941Smrg const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i); 198b8e80941Smrg hbld.SHR(offset(shifted, hbld, i), 199b8e80941Smrg stride(retype(brw_vec1_grf(1 + i, 7), 200b8e80941Smrg BRW_REGISTER_TYPE_UB), 201b8e80941Smrg 1, 8, 0), 202b8e80941Smrg brw_imm_v(0x76543210)); 203b8e80941Smrg } 204b8e80941Smrg 205b8e80941Smrg /* A set bit in the pixel mask means the channel is enabled, but 206b8e80941Smrg * that is the opposite of gl_HelperInvocation so we need to invert 207b8e80941Smrg * the mask. 208b8e80941Smrg * 209b8e80941Smrg * The negate source-modifier bit of logical instructions on Gen8+ 210b8e80941Smrg * performs 1's complement negation, so we can use that instead of 211b8e80941Smrg * a NOT instruction. 212b8e80941Smrg */ 213b8e80941Smrg fs_reg inverted = negate(shifted); 214b8e80941Smrg if (v->devinfo->gen < 8) { 215b8e80941Smrg inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); 216b8e80941Smrg abld.NOT(inverted, shifted); 217b8e80941Smrg } 218b8e80941Smrg 219b8e80941Smrg /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing 220b8e80941Smrg * with 1 and negating. 221b8e80941Smrg */ 222b8e80941Smrg fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 223b8e80941Smrg abld.AND(anded, inverted, brw_imm_uw(1)); 224b8e80941Smrg 225b8e80941Smrg fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); 226b8e80941Smrg abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); 227b8e80941Smrg *reg = dst; 228b8e80941Smrg } 229b8e80941Smrg break; 230b8e80941Smrg 231b8e80941Smrg default: 232b8e80941Smrg break; 233b8e80941Smrg } 234b8e80941Smrg } 235b8e80941Smrg 236b8e80941Smrg return true; 237b8e80941Smrg} 238b8e80941Smrg 239b8e80941Smrgvoid 240b8e80941Smrgfs_visitor::nir_emit_system_values() 241b8e80941Smrg{ 242b8e80941Smrg nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); 243b8e80941Smrg for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { 244b8e80941Smrg nir_system_values[i] = fs_reg(); 245b8e80941Smrg } 246b8e80941Smrg 247b8e80941Smrg /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we 248b8e80941Smrg * never end up using it. 249b8e80941Smrg */ 250b8e80941Smrg { 251b8e80941Smrg const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); 252b8e80941Smrg fs_reg ® = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 253b8e80941Smrg reg = abld.vgrf(BRW_REGISTER_TYPE_UW); 254b8e80941Smrg 255b8e80941Smrg const fs_builder allbld8 = abld.group(8, 0).exec_all(); 256b8e80941Smrg allbld8.MOV(reg, brw_imm_v(0x76543210)); 257b8e80941Smrg if (dispatch_width > 8) 258b8e80941Smrg allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); 259b8e80941Smrg if (dispatch_width > 16) { 260b8e80941Smrg const fs_builder allbld16 = abld.group(16, 0).exec_all(); 261b8e80941Smrg allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); 262b8e80941Smrg } 263b8e80941Smrg } 264b8e80941Smrg 265b8e80941Smrg nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir); 266b8e80941Smrg nir_foreach_block(block, impl) 267b8e80941Smrg emit_system_values_block(block, this); 268b8e80941Smrg} 269b8e80941Smrg 270b8e80941Smrg/* 271b8e80941Smrg * Returns a type based on a reference_type (word, float, half-float) and a 272b8e80941Smrg * given bit_size. 273b8e80941Smrg * 274b8e80941Smrg * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD. 275b8e80941Smrg * 276b8e80941Smrg * @FIXME: 64-bit return types are always DF on integer types to maintain 277b8e80941Smrg * compability with uses of DF previously to the introduction of int64 278b8e80941Smrg * support. 279b8e80941Smrg */ 280b8e80941Smrgstatic brw_reg_type 281b8e80941Smrgbrw_reg_type_from_bit_size(const unsigned bit_size, 282b8e80941Smrg const brw_reg_type reference_type) 283b8e80941Smrg{ 284b8e80941Smrg switch(reference_type) { 285b8e80941Smrg case BRW_REGISTER_TYPE_HF: 286b8e80941Smrg case BRW_REGISTER_TYPE_F: 287b8e80941Smrg case BRW_REGISTER_TYPE_DF: 288b8e80941Smrg switch(bit_size) { 289b8e80941Smrg case 16: 290b8e80941Smrg return BRW_REGISTER_TYPE_HF; 291b8e80941Smrg case 32: 292b8e80941Smrg return BRW_REGISTER_TYPE_F; 293b8e80941Smrg case 64: 294b8e80941Smrg return BRW_REGISTER_TYPE_DF; 295b8e80941Smrg default: 296b8e80941Smrg unreachable("Invalid bit size"); 297b8e80941Smrg } 298b8e80941Smrg case BRW_REGISTER_TYPE_B: 299b8e80941Smrg case BRW_REGISTER_TYPE_W: 300b8e80941Smrg case BRW_REGISTER_TYPE_D: 301b8e80941Smrg case BRW_REGISTER_TYPE_Q: 302b8e80941Smrg switch(bit_size) { 303b8e80941Smrg case 8: 304b8e80941Smrg return BRW_REGISTER_TYPE_B; 305b8e80941Smrg case 16: 306b8e80941Smrg return BRW_REGISTER_TYPE_W; 307b8e80941Smrg case 32: 308b8e80941Smrg return BRW_REGISTER_TYPE_D; 309b8e80941Smrg case 64: 310b8e80941Smrg return BRW_REGISTER_TYPE_Q; 311b8e80941Smrg default: 312b8e80941Smrg unreachable("Invalid bit size"); 313b8e80941Smrg } 314b8e80941Smrg case BRW_REGISTER_TYPE_UB: 315b8e80941Smrg case BRW_REGISTER_TYPE_UW: 316b8e80941Smrg case BRW_REGISTER_TYPE_UD: 317b8e80941Smrg case BRW_REGISTER_TYPE_UQ: 318b8e80941Smrg switch(bit_size) { 319b8e80941Smrg case 8: 320b8e80941Smrg return BRW_REGISTER_TYPE_UB; 321b8e80941Smrg case 16: 322b8e80941Smrg return BRW_REGISTER_TYPE_UW; 323b8e80941Smrg case 32: 324b8e80941Smrg return BRW_REGISTER_TYPE_UD; 325b8e80941Smrg case 64: 326b8e80941Smrg return BRW_REGISTER_TYPE_UQ; 327b8e80941Smrg default: 328b8e80941Smrg unreachable("Invalid bit size"); 329b8e80941Smrg } 330b8e80941Smrg default: 331b8e80941Smrg unreachable("Unknown type"); 332b8e80941Smrg } 333b8e80941Smrg} 334b8e80941Smrg 335b8e80941Smrgvoid 336b8e80941Smrgfs_visitor::nir_emit_impl(nir_function_impl *impl) 337b8e80941Smrg{ 338b8e80941Smrg nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); 339b8e80941Smrg for (unsigned i = 0; i < impl->reg_alloc; i++) { 340b8e80941Smrg nir_locals[i] = fs_reg(); 341b8e80941Smrg } 342b8e80941Smrg 343b8e80941Smrg foreach_list_typed(nir_register, reg, node, &impl->registers) { 344b8e80941Smrg unsigned array_elems = 345b8e80941Smrg reg->num_array_elems == 0 ? 1 : reg->num_array_elems; 346b8e80941Smrg unsigned size = array_elems * reg->num_components; 347b8e80941Smrg const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B : 348b8e80941Smrg brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); 349b8e80941Smrg nir_locals[reg->index] = bld.vgrf(reg_type, size); 350b8e80941Smrg } 351b8e80941Smrg 352b8e80941Smrg nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, 353b8e80941Smrg impl->ssa_alloc); 354b8e80941Smrg 355b8e80941Smrg nir_emit_cf_list(&impl->body); 356b8e80941Smrg} 357b8e80941Smrg 358b8e80941Smrgvoid 359b8e80941Smrgfs_visitor::nir_emit_cf_list(exec_list *list) 360b8e80941Smrg{ 361b8e80941Smrg exec_list_validate(list); 362b8e80941Smrg foreach_list_typed(nir_cf_node, node, node, list) { 363b8e80941Smrg switch (node->type) { 364b8e80941Smrg case nir_cf_node_if: 365b8e80941Smrg nir_emit_if(nir_cf_node_as_if(node)); 366b8e80941Smrg break; 367b8e80941Smrg 368b8e80941Smrg case nir_cf_node_loop: 369b8e80941Smrg nir_emit_loop(nir_cf_node_as_loop(node)); 370b8e80941Smrg break; 371b8e80941Smrg 372b8e80941Smrg case nir_cf_node_block: 373b8e80941Smrg nir_emit_block(nir_cf_node_as_block(node)); 374b8e80941Smrg break; 375b8e80941Smrg 376b8e80941Smrg default: 377b8e80941Smrg unreachable("Invalid CFG node block"); 378b8e80941Smrg } 379b8e80941Smrg } 380b8e80941Smrg} 381b8e80941Smrg 382b8e80941Smrgvoid 383b8e80941Smrgfs_visitor::nir_emit_if(nir_if *if_stmt) 384b8e80941Smrg{ 385b8e80941Smrg bool invert; 386b8e80941Smrg fs_reg cond_reg; 387b8e80941Smrg 388b8e80941Smrg /* If the condition has the form !other_condition, use other_condition as 389b8e80941Smrg * the source, but invert the predicate on the if instruction. 390b8e80941Smrg */ 391b8e80941Smrg nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition); 392b8e80941Smrg if (cond != NULL && cond->op == nir_op_inot) { 393b8e80941Smrg assert(!cond->src[0].negate); 394b8e80941Smrg assert(!cond->src[0].abs); 395b8e80941Smrg 396b8e80941Smrg invert = true; 397b8e80941Smrg cond_reg = get_nir_src(cond->src[0].src); 398b8e80941Smrg } else { 399b8e80941Smrg invert = false; 400b8e80941Smrg cond_reg = get_nir_src(if_stmt->condition); 401b8e80941Smrg } 402b8e80941Smrg 403b8e80941Smrg /* first, put the condition into f0 */ 404b8e80941Smrg fs_inst *inst = bld.MOV(bld.null_reg_d(), 405b8e80941Smrg retype(cond_reg, BRW_REGISTER_TYPE_D)); 406b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_NZ; 407b8e80941Smrg 408b8e80941Smrg bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert; 409b8e80941Smrg 410b8e80941Smrg nir_emit_cf_list(&if_stmt->then_list); 411b8e80941Smrg 412b8e80941Smrg if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { 413b8e80941Smrg bld.emit(BRW_OPCODE_ELSE); 414b8e80941Smrg nir_emit_cf_list(&if_stmt->else_list); 415b8e80941Smrg } 416b8e80941Smrg 417b8e80941Smrg bld.emit(BRW_OPCODE_ENDIF); 418b8e80941Smrg 419b8e80941Smrg if (devinfo->gen < 7) 420b8e80941Smrg limit_dispatch_width(16, "Non-uniform control flow unsupported " 421b8e80941Smrg "in SIMD32 mode."); 422b8e80941Smrg} 423b8e80941Smrg 424b8e80941Smrgvoid 425b8e80941Smrgfs_visitor::nir_emit_loop(nir_loop *loop) 426b8e80941Smrg{ 427b8e80941Smrg bld.emit(BRW_OPCODE_DO); 428b8e80941Smrg 429b8e80941Smrg nir_emit_cf_list(&loop->body); 430b8e80941Smrg 431b8e80941Smrg bld.emit(BRW_OPCODE_WHILE); 432b8e80941Smrg 433b8e80941Smrg if (devinfo->gen < 7) 434b8e80941Smrg limit_dispatch_width(16, "Non-uniform control flow unsupported " 435b8e80941Smrg "in SIMD32 mode."); 436b8e80941Smrg} 437b8e80941Smrg 438b8e80941Smrgvoid 439b8e80941Smrgfs_visitor::nir_emit_block(nir_block *block) 440b8e80941Smrg{ 441b8e80941Smrg nir_foreach_instr(instr, block) { 442b8e80941Smrg nir_emit_instr(instr); 443b8e80941Smrg } 444b8e80941Smrg} 445b8e80941Smrg 446b8e80941Smrgvoid 447b8e80941Smrgfs_visitor::nir_emit_instr(nir_instr *instr) 448b8e80941Smrg{ 449b8e80941Smrg const fs_builder abld = bld.annotate(NULL, instr); 450b8e80941Smrg 451b8e80941Smrg switch (instr->type) { 452b8e80941Smrg case nir_instr_type_alu: 453b8e80941Smrg nir_emit_alu(abld, nir_instr_as_alu(instr)); 454b8e80941Smrg break; 455b8e80941Smrg 456b8e80941Smrg case nir_instr_type_deref: 457b8e80941Smrg unreachable("All derefs should've been lowered"); 458b8e80941Smrg break; 459b8e80941Smrg 460b8e80941Smrg case nir_instr_type_intrinsic: 461b8e80941Smrg switch (stage) { 462b8e80941Smrg case MESA_SHADER_VERTEX: 463b8e80941Smrg nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 464b8e80941Smrg break; 465b8e80941Smrg case MESA_SHADER_TESS_CTRL: 466b8e80941Smrg nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 467b8e80941Smrg break; 468b8e80941Smrg case MESA_SHADER_TESS_EVAL: 469b8e80941Smrg nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); 470b8e80941Smrg break; 471b8e80941Smrg case MESA_SHADER_GEOMETRY: 472b8e80941Smrg nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 473b8e80941Smrg break; 474b8e80941Smrg case MESA_SHADER_FRAGMENT: 475b8e80941Smrg nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 476b8e80941Smrg break; 477b8e80941Smrg case MESA_SHADER_COMPUTE: 478b8e80941Smrg nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 479b8e80941Smrg break; 480b8e80941Smrg default: 481b8e80941Smrg unreachable("unsupported shader stage"); 482b8e80941Smrg } 483b8e80941Smrg break; 484b8e80941Smrg 485b8e80941Smrg case nir_instr_type_tex: 486b8e80941Smrg nir_emit_texture(abld, nir_instr_as_tex(instr)); 487b8e80941Smrg break; 488b8e80941Smrg 489b8e80941Smrg case nir_instr_type_load_const: 490b8e80941Smrg nir_emit_load_const(abld, nir_instr_as_load_const(instr)); 491b8e80941Smrg break; 492b8e80941Smrg 493b8e80941Smrg case nir_instr_type_ssa_undef: 494b8e80941Smrg /* We create a new VGRF for undefs on every use (by handling 495b8e80941Smrg * them in get_nir_src()), rather than for each definition. 496b8e80941Smrg * This helps register coalescing eliminate MOVs from undef. 497b8e80941Smrg */ 498b8e80941Smrg break; 499b8e80941Smrg 500b8e80941Smrg case nir_instr_type_jump: 501b8e80941Smrg nir_emit_jump(abld, nir_instr_as_jump(instr)); 502b8e80941Smrg break; 503b8e80941Smrg 504b8e80941Smrg default: 505b8e80941Smrg unreachable("unknown instruction type"); 506b8e80941Smrg } 507b8e80941Smrg} 508b8e80941Smrg 509b8e80941Smrg/** 510b8e80941Smrg * Recognizes a parent instruction of nir_op_extract_* and changes the type to 511b8e80941Smrg * match instr. 512b8e80941Smrg */ 513b8e80941Smrgbool 514b8e80941Smrgfs_visitor::optimize_extract_to_float(nir_alu_instr *instr, 515b8e80941Smrg const fs_reg &result) 516b8e80941Smrg{ 517b8e80941Smrg if (!instr->src[0].src.is_ssa || 518b8e80941Smrg !instr->src[0].src.ssa->parent_instr) 519b8e80941Smrg return false; 520b8e80941Smrg 521b8e80941Smrg if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 522b8e80941Smrg return false; 523b8e80941Smrg 524b8e80941Smrg nir_alu_instr *src0 = 525b8e80941Smrg nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 526b8e80941Smrg 527b8e80941Smrg if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && 528b8e80941Smrg src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) 529b8e80941Smrg return false; 530b8e80941Smrg 531b8e80941Smrg /* If either opcode has source modifiers, bail. 532b8e80941Smrg * 533b8e80941Smrg * TODO: We can potentially handle source modifiers if both of the opcodes 534b8e80941Smrg * we're combining are signed integers. 535b8e80941Smrg */ 536b8e80941Smrg if (instr->src[0].abs || instr->src[0].negate || 537b8e80941Smrg src0->src[0].abs || src0->src[0].negate) 538b8e80941Smrg return false; 539b8e80941Smrg 540b8e80941Smrg unsigned element = nir_src_as_uint(src0->src[1].src); 541b8e80941Smrg 542b8e80941Smrg /* Element type to extract.*/ 543b8e80941Smrg const brw_reg_type type = brw_int_type( 544b8e80941Smrg src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1, 545b8e80941Smrg src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8); 546b8e80941Smrg 547b8e80941Smrg fs_reg op0 = get_nir_src(src0->src[0].src); 548b8e80941Smrg op0.type = brw_type_for_nir_type(devinfo, 549b8e80941Smrg (nir_alu_type)(nir_op_infos[src0->op].input_types[0] | 550b8e80941Smrg nir_src_bit_size(src0->src[0].src))); 551b8e80941Smrg op0 = offset(op0, bld, src0->src[0].swizzle[0]); 552b8e80941Smrg 553b8e80941Smrg set_saturate(instr->dest.saturate, 554b8e80941Smrg bld.MOV(result, subscript(op0, type, element))); 555b8e80941Smrg return true; 556b8e80941Smrg} 557b8e80941Smrg 558b8e80941Smrgbool 559b8e80941Smrgfs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, 560b8e80941Smrg const fs_reg &result) 561b8e80941Smrg{ 562b8e80941Smrg nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src); 563b8e80941Smrg if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face) 564b8e80941Smrg return false; 565b8e80941Smrg 566b8e80941Smrg if (!nir_src_is_const(instr->src[1].src) || 567b8e80941Smrg !nir_src_is_const(instr->src[2].src)) 568b8e80941Smrg return false; 569b8e80941Smrg 570b8e80941Smrg const float value1 = nir_src_as_float(instr->src[1].src); 571b8e80941Smrg const float value2 = nir_src_as_float(instr->src[2].src); 572b8e80941Smrg if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f) 573b8e80941Smrg return false; 574b8e80941Smrg 575b8e80941Smrg /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */ 576b8e80941Smrg assert(value1 == -value2); 577b8e80941Smrg 578b8e80941Smrg fs_reg tmp = vgrf(glsl_type::int_type); 579b8e80941Smrg 580b8e80941Smrg if (devinfo->gen >= 6) { 581b8e80941Smrg /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ 582b8e80941Smrg fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); 583b8e80941Smrg 584b8e80941Smrg /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 585b8e80941Smrg * 586b8e80941Smrg * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W 587b8e80941Smrg * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 588b8e80941Smrg * 589b8e80941Smrg * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). 590b8e80941Smrg * 591b8e80941Smrg * This negation looks like it's safe in practice, because bits 0:4 will 592b8e80941Smrg * surely be TRIANGLES 593b8e80941Smrg */ 594b8e80941Smrg 595b8e80941Smrg if (value1 == -1.0f) { 596b8e80941Smrg g0.negate = true; 597b8e80941Smrg } 598b8e80941Smrg 599b8e80941Smrg bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), 600b8e80941Smrg g0, brw_imm_uw(0x3f80)); 601b8e80941Smrg } else { 602b8e80941Smrg /* Bit 31 of g1.6 is 0 if the polygon is front facing. */ 603b8e80941Smrg fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); 604b8e80941Smrg 605b8e80941Smrg /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 606b8e80941Smrg * 607b8e80941Smrg * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D 608b8e80941Smrg * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 609b8e80941Smrg * 610b8e80941Smrg * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0). 611b8e80941Smrg * 612b8e80941Smrg * This negation looks like it's safe in practice, because bits 0:4 will 613b8e80941Smrg * surely be TRIANGLES 614b8e80941Smrg */ 615b8e80941Smrg 616b8e80941Smrg if (value1 == -1.0f) { 617b8e80941Smrg g1_6.negate = true; 618b8e80941Smrg } 619b8e80941Smrg 620b8e80941Smrg bld.OR(tmp, g1_6, brw_imm_d(0x3f800000)); 621b8e80941Smrg } 622b8e80941Smrg bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000)); 623b8e80941Smrg 624b8e80941Smrg return true; 625b8e80941Smrg} 626b8e80941Smrg 627b8e80941Smrgstatic void 628b8e80941Smrgemit_find_msb_using_lzd(const fs_builder &bld, 629b8e80941Smrg const fs_reg &result, 630b8e80941Smrg const fs_reg &src, 631b8e80941Smrg bool is_signed) 632b8e80941Smrg{ 633b8e80941Smrg fs_inst *inst; 634b8e80941Smrg fs_reg temp = src; 635b8e80941Smrg 636b8e80941Smrg if (is_signed) { 637b8e80941Smrg /* LZD of an absolute value source almost always does the right 638b8e80941Smrg * thing. There are two problem values: 639b8e80941Smrg * 640b8e80941Smrg * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns 641b8e80941Smrg * 0. However, findMSB(int(0x80000000)) == 30. 642b8e80941Smrg * 643b8e80941Smrg * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns 644b8e80941Smrg * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 645b8e80941Smrg * 646b8e80941Smrg * For a value of zero or negative one, -1 will be returned. 647b8e80941Smrg * 648b8e80941Smrg * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but 649b8e80941Smrg * findMSB(-(1<<x)) should return x-1. 650b8e80941Smrg * 651b8e80941Smrg * For all negative number cases, including 0x80000000 and 652b8e80941Smrg * 0xffffffff, the correct value is obtained from LZD if instead of 653b8e80941Smrg * negating the (already negative) value the logical-not is used. A 654b8e80941Smrg * conditonal logical-not can be achieved in two instructions. 655b8e80941Smrg */ 656b8e80941Smrg temp = bld.vgrf(BRW_REGISTER_TYPE_D); 657b8e80941Smrg 658b8e80941Smrg bld.ASR(temp, src, brw_imm_d(31)); 659b8e80941Smrg bld.XOR(temp, temp, src); 660b8e80941Smrg } 661b8e80941Smrg 662b8e80941Smrg bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), 663b8e80941Smrg retype(temp, BRW_REGISTER_TYPE_UD)); 664b8e80941Smrg 665b8e80941Smrg /* LZD counts from the MSB side, while GLSL's findMSB() wants the count 666b8e80941Smrg * from the LSB side. Subtract the result from 31 to convert the MSB 667b8e80941Smrg * count into an LSB count. If no bits are set, LZD will return 32. 668b8e80941Smrg * 31-32 = -1, which is exactly what findMSB() is supposed to return. 669b8e80941Smrg */ 670b8e80941Smrg inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31)); 671b8e80941Smrg inst->src[0].negate = true; 672b8e80941Smrg} 673b8e80941Smrg 674b8e80941Smrgstatic brw_rnd_mode 675b8e80941Smrgbrw_rnd_mode_from_nir_op (const nir_op op) { 676b8e80941Smrg switch (op) { 677b8e80941Smrg case nir_op_f2f16_rtz: 678b8e80941Smrg return BRW_RND_MODE_RTZ; 679b8e80941Smrg case nir_op_f2f16_rtne: 680b8e80941Smrg return BRW_RND_MODE_RTNE; 681b8e80941Smrg default: 682b8e80941Smrg unreachable("Operation doesn't support rounding mode"); 683b8e80941Smrg } 684b8e80941Smrg} 685b8e80941Smrg 686b8e80941Smrgfs_reg 687b8e80941Smrgfs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld, 688b8e80941Smrg nir_alu_instr *instr, 689b8e80941Smrg fs_reg *op, 690b8e80941Smrg bool need_dest) 691b8e80941Smrg{ 692b8e80941Smrg fs_reg result = 693b8e80941Smrg need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud(); 694b8e80941Smrg 695b8e80941Smrg result.type = brw_type_for_nir_type(devinfo, 696b8e80941Smrg (nir_alu_type)(nir_op_infos[instr->op].output_type | 697b8e80941Smrg nir_dest_bit_size(instr->dest.dest))); 698b8e80941Smrg 699b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 700b8e80941Smrg op[i] = get_nir_src(instr->src[i].src); 701b8e80941Smrg op[i].type = brw_type_for_nir_type(devinfo, 702b8e80941Smrg (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | 703b8e80941Smrg nir_src_bit_size(instr->src[i].src))); 704b8e80941Smrg op[i].abs = instr->src[i].abs; 705b8e80941Smrg op[i].negate = instr->src[i].negate; 706b8e80941Smrg } 707b8e80941Smrg 708b8e80941Smrg /* Move and vecN instrutions may still be vectored. Return the raw, 709b8e80941Smrg * vectored source and destination so that fs_visitor::nir_emit_alu can 710b8e80941Smrg * handle it. Other callers should not have to handle these kinds of 711b8e80941Smrg * instructions. 712b8e80941Smrg */ 713b8e80941Smrg switch (instr->op) { 714b8e80941Smrg case nir_op_imov: 715b8e80941Smrg case nir_op_fmov: 716b8e80941Smrg case nir_op_vec2: 717b8e80941Smrg case nir_op_vec3: 718b8e80941Smrg case nir_op_vec4: 719b8e80941Smrg return result; 720b8e80941Smrg default: 721b8e80941Smrg break; 722b8e80941Smrg } 723b8e80941Smrg 724b8e80941Smrg /* At this point, we have dealt with any instruction that operates on 725b8e80941Smrg * more than a single channel. Therefore, we can just adjust the source 726b8e80941Smrg * and destination registers for that channel and emit the instruction. 727b8e80941Smrg */ 728b8e80941Smrg unsigned channel = 0; 729b8e80941Smrg if (nir_op_infos[instr->op].output_size == 0) { 730b8e80941Smrg /* Since NIR is doing the scalarizing for us, we should only ever see 731b8e80941Smrg * vectorized operations with a single channel. 732b8e80941Smrg */ 733b8e80941Smrg assert(util_bitcount(instr->dest.write_mask) == 1); 734b8e80941Smrg channel = ffs(instr->dest.write_mask) - 1; 735b8e80941Smrg 736b8e80941Smrg result = offset(result, bld, channel); 737b8e80941Smrg } 738b8e80941Smrg 739b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 740b8e80941Smrg assert(nir_op_infos[instr->op].input_sizes[i] < 2); 741b8e80941Smrg op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); 742b8e80941Smrg } 743b8e80941Smrg 744b8e80941Smrg return result; 745b8e80941Smrg} 746b8e80941Smrg 747b8e80941Smrgvoid 748b8e80941Smrgfs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, 749b8e80941Smrg fs_reg *op) 750b8e80941Smrg{ 751b8e80941Smrg for (unsigned i = 0; i < 2; i++) { 752b8e80941Smrg nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src); 753b8e80941Smrg 754b8e80941Smrg if (inot_instr != NULL && inot_instr->op == nir_op_inot && 755b8e80941Smrg !inot_instr->src[0].abs && !inot_instr->src[0].negate) { 756b8e80941Smrg /* The source of the inot is now the source of instr. */ 757b8e80941Smrg prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false); 758b8e80941Smrg 759b8e80941Smrg assert(!op[i].negate); 760b8e80941Smrg op[i].negate = true; 761b8e80941Smrg } else { 762b8e80941Smrg op[i] = resolve_source_modifiers(op[i]); 763b8e80941Smrg } 764b8e80941Smrg } 765b8e80941Smrg} 766b8e80941Smrg 767b8e80941Smrgbool 768b8e80941Smrgfs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld, 769b8e80941Smrg fs_reg result, 770b8e80941Smrg nir_alu_instr *instr) 771b8e80941Smrg{ 772b8e80941Smrg if (devinfo->gen < 6 || devinfo->gen >= 12) 773b8e80941Smrg return false; 774b8e80941Smrg 775b8e80941Smrg nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src); 776b8e80941Smrg 777b8e80941Smrg if (inot_instr == NULL || inot_instr->op != nir_op_inot) 778b8e80941Smrg return false; 779b8e80941Smrg 780b8e80941Smrg /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set 781b8e80941Smrg * of valid size-changing combinations is a bit more complex. 782b8e80941Smrg * 783b8e80941Smrg * The source restriction is just because I was lazy about generating the 784b8e80941Smrg * constant below. 785b8e80941Smrg */ 786b8e80941Smrg if (nir_dest_bit_size(instr->dest.dest) != 32 || 787b8e80941Smrg nir_src_bit_size(inot_instr->src[0].src) != 32) 788b8e80941Smrg return false; 789b8e80941Smrg 790b8e80941Smrg /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1, 791b8e80941Smrg * this is float(1 + a). 792b8e80941Smrg */ 793b8e80941Smrg fs_reg op; 794b8e80941Smrg 795b8e80941Smrg prepare_alu_destination_and_sources(bld, inot_instr, &op, false); 796b8e80941Smrg 797b8e80941Smrg /* Ignore the saturate modifier, if there is one. The result of the 798b8e80941Smrg * arithmetic can only be 0 or 1, so the clamping will do nothing anyway. 799b8e80941Smrg */ 800b8e80941Smrg bld.ADD(result, op, brw_imm_d(1)); 801b8e80941Smrg 802b8e80941Smrg return true; 803b8e80941Smrg} 804b8e80941Smrg 805b8e80941Smrg/** 806b8e80941Smrg * Emit code for nir_op_fsign possibly fused with a nir_op_fmul 807b8e80941Smrg * 808b8e80941Smrg * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of 809b8e80941Smrg * the source of \c instr that is a \c nir_op_fsign. 810b8e80941Smrg */ 811b8e80941Smrgvoid 812b8e80941Smrgfs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr, 813b8e80941Smrg fs_reg result, fs_reg *op, unsigned fsign_src) 814b8e80941Smrg{ 815b8e80941Smrg fs_inst *inst; 816b8e80941Smrg 817b8e80941Smrg assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul); 818b8e80941Smrg assert(fsign_src < nir_op_infos[instr->op].num_inputs); 819b8e80941Smrg 820b8e80941Smrg if (instr->op != nir_op_fsign) { 821b8e80941Smrg const nir_alu_instr *const fsign_instr = 822b8e80941Smrg nir_src_as_alu_instr(instr->src[fsign_src].src); 823b8e80941Smrg 824b8e80941Smrg assert(!fsign_instr->dest.saturate); 825b8e80941Smrg 826b8e80941Smrg /* op[fsign_src] has the nominal result of the fsign, and op[1 - 827b8e80941Smrg * fsign_src] has the other multiply source. This must be rearranged so 828b8e80941Smrg * that op[0] is the source of the fsign op[1] is the other multiply 829b8e80941Smrg * source. 830b8e80941Smrg */ 831b8e80941Smrg if (fsign_src != 0) 832b8e80941Smrg op[1] = op[0]; 833b8e80941Smrg 834b8e80941Smrg op[0] = get_nir_src(fsign_instr->src[0].src); 835b8e80941Smrg 836b8e80941Smrg const nir_alu_type t = 837b8e80941Smrg (nir_alu_type)(nir_op_infos[instr->op].input_types[0] | 838b8e80941Smrg nir_src_bit_size(fsign_instr->src[0].src)); 839b8e80941Smrg 840b8e80941Smrg op[0].type = brw_type_for_nir_type(devinfo, t); 841b8e80941Smrg op[0].abs = fsign_instr->src[0].abs; 842b8e80941Smrg op[0].negate = fsign_instr->src[0].negate; 843b8e80941Smrg 844b8e80941Smrg unsigned channel = 0; 845b8e80941Smrg if (nir_op_infos[instr->op].output_size == 0) { 846b8e80941Smrg /* Since NIR is doing the scalarizing for us, we should only ever see 847b8e80941Smrg * vectorized operations with a single channel. 848b8e80941Smrg */ 849b8e80941Smrg assert(util_bitcount(instr->dest.write_mask) == 1); 850b8e80941Smrg channel = ffs(instr->dest.write_mask) - 1; 851b8e80941Smrg } 852b8e80941Smrg 853b8e80941Smrg op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]); 854b8e80941Smrg } else { 855b8e80941Smrg assert(!instr->dest.saturate); 856b8e80941Smrg } 857b8e80941Smrg 858b8e80941Smrg if (op[0].abs) { 859b8e80941Smrg /* Straightforward since the source can be assumed to be either strictly 860b8e80941Smrg * >= 0 or strictly <= 0 depending on the setting of the negate flag. 861b8e80941Smrg */ 862b8e80941Smrg set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0])); 863b8e80941Smrg 864b8e80941Smrg if (instr->op == nir_op_fsign) { 865b8e80941Smrg inst = (op[0].negate) 866b8e80941Smrg ? bld.MOV(result, brw_imm_f(-1.0f)) 867b8e80941Smrg : bld.MOV(result, brw_imm_f(1.0f)); 868b8e80941Smrg } else { 869b8e80941Smrg op[1].negate = (op[0].negate != op[1].negate); 870b8e80941Smrg inst = bld.MOV(result, op[1]); 871b8e80941Smrg } 872b8e80941Smrg 873b8e80941Smrg set_predicate(BRW_PREDICATE_NORMAL, inst); 874b8e80941Smrg } else if (type_sz(op[0].type) == 2) { 875b8e80941Smrg /* AND(val, 0x8000) gives the sign bit. 876b8e80941Smrg * 877b8e80941Smrg * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero. 878b8e80941Smrg */ 879b8e80941Smrg fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF); 880b8e80941Smrg bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ); 881b8e80941Smrg 882b8e80941Smrg op[0].type = BRW_REGISTER_TYPE_UW; 883b8e80941Smrg result.type = BRW_REGISTER_TYPE_UW; 884b8e80941Smrg bld.AND(result, op[0], brw_imm_uw(0x8000u)); 885b8e80941Smrg 886b8e80941Smrg if (instr->op == nir_op_fsign) 887b8e80941Smrg inst = bld.OR(result, result, brw_imm_uw(0x3c00u)); 888b8e80941Smrg else { 889b8e80941Smrg /* Use XOR here to get the result sign correct. */ 890b8e80941Smrg inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW)); 891b8e80941Smrg } 892b8e80941Smrg 893b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 894b8e80941Smrg } else if (type_sz(op[0].type) == 4) { 895b8e80941Smrg /* AND(val, 0x80000000) gives the sign bit. 896b8e80941Smrg * 897b8e80941Smrg * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 898b8e80941Smrg * zero. 899b8e80941Smrg */ 900b8e80941Smrg bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); 901b8e80941Smrg 902b8e80941Smrg op[0].type = BRW_REGISTER_TYPE_UD; 903b8e80941Smrg result.type = BRW_REGISTER_TYPE_UD; 904b8e80941Smrg bld.AND(result, op[0], brw_imm_ud(0x80000000u)); 905b8e80941Smrg 906b8e80941Smrg if (instr->op == nir_op_fsign) 907b8e80941Smrg inst = bld.OR(result, result, brw_imm_ud(0x3f800000u)); 908b8e80941Smrg else { 909b8e80941Smrg /* Use XOR here to get the result sign correct. */ 910b8e80941Smrg inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD)); 911b8e80941Smrg } 912b8e80941Smrg 913b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 914b8e80941Smrg } else { 915b8e80941Smrg /* For doubles we do the same but we need to consider: 916b8e80941Smrg * 917b8e80941Smrg * - 2-src instructions can't operate with 64-bit immediates 918b8e80941Smrg * - The sign is encoded in the high 32-bit of each DF 919b8e80941Smrg * - We need to produce a DF result. 920b8e80941Smrg */ 921b8e80941Smrg 922b8e80941Smrg fs_reg zero = vgrf(glsl_type::double_type); 923b8e80941Smrg bld.MOV(zero, setup_imm_df(bld, 0.0)); 924b8e80941Smrg bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ); 925b8e80941Smrg 926b8e80941Smrg bld.MOV(result, zero); 927b8e80941Smrg 928b8e80941Smrg fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1); 929b8e80941Smrg bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1), 930b8e80941Smrg brw_imm_ud(0x80000000u)); 931b8e80941Smrg 932b8e80941Smrg if (instr->op == nir_op_fsign) { 933b8e80941Smrg set_predicate(BRW_PREDICATE_NORMAL, 934b8e80941Smrg bld.OR(r, r, brw_imm_ud(0x3ff00000u))); 935b8e80941Smrg } else { 936b8e80941Smrg /* This could be done better in some cases. If the scale is an 937b8e80941Smrg * immediate with the low 32-bits all 0, emitting a separate XOR and 938b8e80941Smrg * OR would allow an algebraic optimization to remove the OR. There 939b8e80941Smrg * are currently zero instances of fsign(double(x))*IMM in shader-db 940b8e80941Smrg * or any test suite, so it is hard to care at this time. 941b8e80941Smrg */ 942b8e80941Smrg fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ); 943b8e80941Smrg inst = bld.XOR(result_int64, result_int64, 944b8e80941Smrg retype(op[1], BRW_REGISTER_TYPE_UQ)); 945b8e80941Smrg } 946b8e80941Smrg } 947b8e80941Smrg} 948b8e80941Smrg 949b8e80941Smrg/** 950b8e80941Smrg * Deteremine whether sources of a nir_op_fmul can be fused with a nir_op_fsign 951b8e80941Smrg * 952b8e80941Smrg * Checks the operands of a \c nir_op_fmul to determine whether or not 953b8e80941Smrg * \c emit_fsign could fuse the multiplication with the \c sign() calculation. 954b8e80941Smrg * 955b8e80941Smrg * \param instr The multiplication instruction 956b8e80941Smrg * 957b8e80941Smrg * \param fsign_src The source of \c instr that may or may not be a 958b8e80941Smrg * \c nir_op_fsign 959b8e80941Smrg */ 960b8e80941Smrgstatic bool 961b8e80941Smrgcan_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src) 962b8e80941Smrg{ 963b8e80941Smrg assert(instr->op == nir_op_fmul); 964b8e80941Smrg 965b8e80941Smrg nir_alu_instr *const fsign_instr = 966b8e80941Smrg nir_src_as_alu_instr(instr->src[fsign_src].src); 967b8e80941Smrg 968b8e80941Smrg /* Rules: 969b8e80941Smrg * 970b8e80941Smrg * 1. instr->src[fsign_src] must be a nir_op_fsign. 971b8e80941Smrg * 2. The nir_op_fsign can only be used by this multiplication. 972b8e80941Smrg * 3. The source that is the nir_op_fsign does not have source modifiers. 973b8e80941Smrg * \c emit_fsign only examines the source modifiers of the source of the 974b8e80941Smrg * \c nir_op_fsign. 975b8e80941Smrg * 976b8e80941Smrg * The nir_op_fsign must also not have the saturate modifier, but steps 977b8e80941Smrg * have already been taken (in nir_opt_algebraic) to ensure that. 978b8e80941Smrg */ 979b8e80941Smrg return fsign_instr != NULL && fsign_instr->op == nir_op_fsign && 980b8e80941Smrg is_used_once(fsign_instr) && 981b8e80941Smrg !instr->src[fsign_src].abs && !instr->src[fsign_src].negate; 982b8e80941Smrg} 983b8e80941Smrg 984b8e80941Smrgvoid 985b8e80941Smrgfs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) 986b8e80941Smrg{ 987b8e80941Smrg struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; 988b8e80941Smrg fs_inst *inst; 989b8e80941Smrg 990b8e80941Smrg fs_reg op[4]; 991b8e80941Smrg fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true); 992b8e80941Smrg 993b8e80941Smrg switch (instr->op) { 994b8e80941Smrg case nir_op_imov: 995b8e80941Smrg case nir_op_fmov: 996b8e80941Smrg case nir_op_vec2: 997b8e80941Smrg case nir_op_vec3: 998b8e80941Smrg case nir_op_vec4: { 999b8e80941Smrg fs_reg temp = result; 1000b8e80941Smrg bool need_extra_copy = false; 1001b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1002b8e80941Smrg if (!instr->src[i].src.is_ssa && 1003b8e80941Smrg instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) { 1004b8e80941Smrg need_extra_copy = true; 1005b8e80941Smrg temp = bld.vgrf(result.type, 4); 1006b8e80941Smrg break; 1007b8e80941Smrg } 1008b8e80941Smrg } 1009b8e80941Smrg 1010b8e80941Smrg for (unsigned i = 0; i < 4; i++) { 1011b8e80941Smrg if (!(instr->dest.write_mask & (1 << i))) 1012b8e80941Smrg continue; 1013b8e80941Smrg 1014b8e80941Smrg if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { 1015b8e80941Smrg inst = bld.MOV(offset(temp, bld, i), 1016b8e80941Smrg offset(op[0], bld, instr->src[0].swizzle[i])); 1017b8e80941Smrg } else { 1018b8e80941Smrg inst = bld.MOV(offset(temp, bld, i), 1019b8e80941Smrg offset(op[i], bld, instr->src[i].swizzle[0])); 1020b8e80941Smrg } 1021b8e80941Smrg inst->saturate = instr->dest.saturate; 1022b8e80941Smrg } 1023b8e80941Smrg 1024b8e80941Smrg /* In this case the source and destination registers were the same, 1025b8e80941Smrg * so we need to insert an extra set of moves in order to deal with 1026b8e80941Smrg * any swizzling. 1027b8e80941Smrg */ 1028b8e80941Smrg if (need_extra_copy) { 1029b8e80941Smrg for (unsigned i = 0; i < 4; i++) { 1030b8e80941Smrg if (!(instr->dest.write_mask & (1 << i))) 1031b8e80941Smrg continue; 1032b8e80941Smrg 1033b8e80941Smrg bld.MOV(offset(result, bld, i), offset(temp, bld, i)); 1034b8e80941Smrg } 1035b8e80941Smrg } 1036b8e80941Smrg return; 1037b8e80941Smrg } 1038b8e80941Smrg 1039b8e80941Smrg case nir_op_i2f32: 1040b8e80941Smrg case nir_op_u2f32: 1041b8e80941Smrg if (optimize_extract_to_float(instr, result)) 1042b8e80941Smrg return; 1043b8e80941Smrg inst = bld.MOV(result, op[0]); 1044b8e80941Smrg inst->saturate = instr->dest.saturate; 1045b8e80941Smrg break; 1046b8e80941Smrg 1047b8e80941Smrg case nir_op_f2f16_rtne: 1048b8e80941Smrg case nir_op_f2f16_rtz: 1049b8e80941Smrg bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1050b8e80941Smrg brw_imm_d(brw_rnd_mode_from_nir_op(instr->op))); 1051b8e80941Smrg /* fallthrough */ 1052b8e80941Smrg case nir_op_f2f16: 1053b8e80941Smrg /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending 1054b8e80941Smrg * on the HW gen, it is a special hw opcode or just a MOV, and 1055b8e80941Smrg * brw_F32TO16 (at brw_eu_emit) would do the work to chose. 1056b8e80941Smrg * 1057b8e80941Smrg * But if we want to use that opcode, we need to provide support on 1058b8e80941Smrg * different optimizations and lowerings. As right now HF support is 1059b8e80941Smrg * only for gen8+, it will be better to use directly the MOV, and use 1060b8e80941Smrg * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7. 1061b8e80941Smrg */ 1062b8e80941Smrg assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ 1063b8e80941Smrg inst = bld.MOV(result, op[0]); 1064b8e80941Smrg inst->saturate = instr->dest.saturate; 1065b8e80941Smrg break; 1066b8e80941Smrg 1067b8e80941Smrg case nir_op_b2i8: 1068b8e80941Smrg case nir_op_b2i16: 1069b8e80941Smrg case nir_op_b2i32: 1070b8e80941Smrg case nir_op_b2i64: 1071b8e80941Smrg case nir_op_b2f16: 1072b8e80941Smrg case nir_op_b2f32: 1073b8e80941Smrg case nir_op_b2f64: 1074b8e80941Smrg if (try_emit_b2fi_of_inot(bld, result, instr)) 1075b8e80941Smrg break; 1076b8e80941Smrg op[0].type = BRW_REGISTER_TYPE_D; 1077b8e80941Smrg op[0].negate = !op[0].negate; 1078b8e80941Smrg /* fallthrough */ 1079b8e80941Smrg case nir_op_i2f64: 1080b8e80941Smrg case nir_op_i2i64: 1081b8e80941Smrg case nir_op_u2f64: 1082b8e80941Smrg case nir_op_u2u64: 1083b8e80941Smrg case nir_op_f2f64: 1084b8e80941Smrg case nir_op_f2i64: 1085b8e80941Smrg case nir_op_f2u64: 1086b8e80941Smrg case nir_op_i2i32: 1087b8e80941Smrg case nir_op_u2u32: 1088b8e80941Smrg case nir_op_f2f32: 1089b8e80941Smrg case nir_op_f2i32: 1090b8e80941Smrg case nir_op_f2u32: 1091b8e80941Smrg case nir_op_i2f16: 1092b8e80941Smrg case nir_op_i2i16: 1093b8e80941Smrg case nir_op_u2f16: 1094b8e80941Smrg case nir_op_u2u16: 1095b8e80941Smrg case nir_op_f2i16: 1096b8e80941Smrg case nir_op_f2u16: 1097b8e80941Smrg case nir_op_i2i8: 1098b8e80941Smrg case nir_op_u2u8: 1099b8e80941Smrg case nir_op_f2i8: 1100b8e80941Smrg case nir_op_f2u8: 1101b8e80941Smrg if (result.type == BRW_REGISTER_TYPE_B || 1102b8e80941Smrg result.type == BRW_REGISTER_TYPE_UB || 1103b8e80941Smrg result.type == BRW_REGISTER_TYPE_HF) 1104b8e80941Smrg assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */ 1105b8e80941Smrg 1106b8e80941Smrg if (op[0].type == BRW_REGISTER_TYPE_B || 1107b8e80941Smrg op[0].type == BRW_REGISTER_TYPE_UB || 1108b8e80941Smrg op[0].type == BRW_REGISTER_TYPE_HF) 1109b8e80941Smrg assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */ 1110b8e80941Smrg 1111b8e80941Smrg inst = bld.MOV(result, op[0]); 1112b8e80941Smrg inst->saturate = instr->dest.saturate; 1113b8e80941Smrg break; 1114b8e80941Smrg 1115b8e80941Smrg case nir_op_fsign: 1116b8e80941Smrg emit_fsign(bld, instr, result, op, 0); 1117b8e80941Smrg break; 1118b8e80941Smrg 1119b8e80941Smrg case nir_op_frcp: 1120b8e80941Smrg inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); 1121b8e80941Smrg inst->saturate = instr->dest.saturate; 1122b8e80941Smrg break; 1123b8e80941Smrg 1124b8e80941Smrg case nir_op_fexp2: 1125b8e80941Smrg inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); 1126b8e80941Smrg inst->saturate = instr->dest.saturate; 1127b8e80941Smrg break; 1128b8e80941Smrg 1129b8e80941Smrg case nir_op_flog2: 1130b8e80941Smrg inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); 1131b8e80941Smrg inst->saturate = instr->dest.saturate; 1132b8e80941Smrg break; 1133b8e80941Smrg 1134b8e80941Smrg case nir_op_fsin: 1135b8e80941Smrg inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); 1136b8e80941Smrg inst->saturate = instr->dest.saturate; 1137b8e80941Smrg break; 1138b8e80941Smrg 1139b8e80941Smrg case nir_op_fcos: 1140b8e80941Smrg inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); 1141b8e80941Smrg inst->saturate = instr->dest.saturate; 1142b8e80941Smrg break; 1143b8e80941Smrg 1144b8e80941Smrg case nir_op_fddx: 1145b8e80941Smrg if (fs_key->high_quality_derivatives) { 1146b8e80941Smrg inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 1147b8e80941Smrg } else { 1148b8e80941Smrg inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 1149b8e80941Smrg } 1150b8e80941Smrg inst->saturate = instr->dest.saturate; 1151b8e80941Smrg break; 1152b8e80941Smrg case nir_op_fddx_fine: 1153b8e80941Smrg inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 1154b8e80941Smrg inst->saturate = instr->dest.saturate; 1155b8e80941Smrg break; 1156b8e80941Smrg case nir_op_fddx_coarse: 1157b8e80941Smrg inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 1158b8e80941Smrg inst->saturate = instr->dest.saturate; 1159b8e80941Smrg break; 1160b8e80941Smrg case nir_op_fddy: 1161b8e80941Smrg if (fs_key->high_quality_derivatives) { 1162b8e80941Smrg inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 1163b8e80941Smrg } else { 1164b8e80941Smrg inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 1165b8e80941Smrg } 1166b8e80941Smrg inst->saturate = instr->dest.saturate; 1167b8e80941Smrg break; 1168b8e80941Smrg case nir_op_fddy_fine: 1169b8e80941Smrg inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 1170b8e80941Smrg inst->saturate = instr->dest.saturate; 1171b8e80941Smrg break; 1172b8e80941Smrg case nir_op_fddy_coarse: 1173b8e80941Smrg inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 1174b8e80941Smrg inst->saturate = instr->dest.saturate; 1175b8e80941Smrg break; 1176b8e80941Smrg 1177b8e80941Smrg case nir_op_iadd: 1178b8e80941Smrg case nir_op_fadd: 1179b8e80941Smrg inst = bld.ADD(result, op[0], op[1]); 1180b8e80941Smrg inst->saturate = instr->dest.saturate; 1181b8e80941Smrg break; 1182b8e80941Smrg 1183b8e80941Smrg case nir_op_uadd_sat: 1184b8e80941Smrg inst = bld.ADD(result, op[0], op[1]); 1185b8e80941Smrg inst->saturate = true; 1186b8e80941Smrg break; 1187b8e80941Smrg 1188b8e80941Smrg case nir_op_fmul: 1189b8e80941Smrg for (unsigned i = 0; i < 2; i++) { 1190b8e80941Smrg if (can_fuse_fmul_fsign(instr, i)) { 1191b8e80941Smrg emit_fsign(bld, instr, result, op, i); 1192b8e80941Smrg return; 1193b8e80941Smrg } 1194b8e80941Smrg } 1195b8e80941Smrg 1196b8e80941Smrg inst = bld.MUL(result, op[0], op[1]); 1197b8e80941Smrg inst->saturate = instr->dest.saturate; 1198b8e80941Smrg break; 1199b8e80941Smrg 1200b8e80941Smrg case nir_op_imul_2x32_64: 1201b8e80941Smrg case nir_op_umul_2x32_64: 1202b8e80941Smrg bld.MUL(result, op[0], op[1]); 1203b8e80941Smrg break; 1204b8e80941Smrg 1205b8e80941Smrg case nir_op_imul: 1206b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1207b8e80941Smrg bld.MUL(result, op[0], op[1]); 1208b8e80941Smrg break; 1209b8e80941Smrg 1210b8e80941Smrg case nir_op_imul_high: 1211b8e80941Smrg case nir_op_umul_high: 1212b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1213b8e80941Smrg bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); 1214b8e80941Smrg break; 1215b8e80941Smrg 1216b8e80941Smrg case nir_op_idiv: 1217b8e80941Smrg case nir_op_udiv: 1218b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1219b8e80941Smrg bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); 1220b8e80941Smrg break; 1221b8e80941Smrg 1222b8e80941Smrg case nir_op_uadd_carry: 1223b8e80941Smrg unreachable("Should have been lowered by carry_to_arith()."); 1224b8e80941Smrg 1225b8e80941Smrg case nir_op_usub_borrow: 1226b8e80941Smrg unreachable("Should have been lowered by borrow_to_arith()."); 1227b8e80941Smrg 1228b8e80941Smrg case nir_op_umod: 1229b8e80941Smrg case nir_op_irem: 1230b8e80941Smrg /* According to the sign table for INT DIV in the Ivy Bridge PRM, it 1231b8e80941Smrg * appears that our hardware just does the right thing for signed 1232b8e80941Smrg * remainder. 1233b8e80941Smrg */ 1234b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1235b8e80941Smrg bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 1236b8e80941Smrg break; 1237b8e80941Smrg 1238b8e80941Smrg case nir_op_imod: { 1239b8e80941Smrg /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ 1240b8e80941Smrg bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 1241b8e80941Smrg 1242b8e80941Smrg /* Math instructions don't support conditional mod */ 1243b8e80941Smrg inst = bld.MOV(bld.null_reg_d(), result); 1244b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_NZ; 1245b8e80941Smrg 1246b8e80941Smrg /* Now, we need to determine if signs of the sources are different. 1247b8e80941Smrg * When we XOR the sources, the top bit is 0 if they are the same and 1 1248b8e80941Smrg * if they are different. We can then use a conditional modifier to 1249b8e80941Smrg * turn that into a predicate. This leads us to an XOR.l instruction. 1250b8e80941Smrg * 1251b8e80941Smrg * Technically, according to the PRM, you're not allowed to use .l on a 1252b8e80941Smrg * XOR instruction. However, emperical experiments and Curro's reading 1253b8e80941Smrg * of the simulator source both indicate that it's safe. 1254b8e80941Smrg */ 1255b8e80941Smrg fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); 1256b8e80941Smrg inst = bld.XOR(tmp, op[0], op[1]); 1257b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 1258b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_L; 1259b8e80941Smrg 1260b8e80941Smrg /* If the result of the initial remainder operation is non-zero and the 1261b8e80941Smrg * two sources have different signs, add in a copy of op[1] to get the 1262b8e80941Smrg * final integer modulus value. 1263b8e80941Smrg */ 1264b8e80941Smrg inst = bld.ADD(result, result, op[1]); 1265b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 1266b8e80941Smrg break; 1267b8e80941Smrg } 1268b8e80941Smrg 1269b8e80941Smrg case nir_op_flt32: 1270b8e80941Smrg case nir_op_fge32: 1271b8e80941Smrg case nir_op_feq32: 1272b8e80941Smrg case nir_op_fne32: { 1273b8e80941Smrg fs_reg dest = result; 1274b8e80941Smrg 1275b8e80941Smrg const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1276b8e80941Smrg if (bit_size != 32) 1277b8e80941Smrg dest = bld.vgrf(op[0].type, 1); 1278b8e80941Smrg 1279b8e80941Smrg brw_conditional_mod cond; 1280b8e80941Smrg switch (instr->op) { 1281b8e80941Smrg case nir_op_flt32: 1282b8e80941Smrg cond = BRW_CONDITIONAL_L; 1283b8e80941Smrg break; 1284b8e80941Smrg case nir_op_fge32: 1285b8e80941Smrg cond = BRW_CONDITIONAL_GE; 1286b8e80941Smrg break; 1287b8e80941Smrg case nir_op_feq32: 1288b8e80941Smrg cond = BRW_CONDITIONAL_Z; 1289b8e80941Smrg break; 1290b8e80941Smrg case nir_op_fne32: 1291b8e80941Smrg cond = BRW_CONDITIONAL_NZ; 1292b8e80941Smrg break; 1293b8e80941Smrg default: 1294b8e80941Smrg unreachable("bad opcode"); 1295b8e80941Smrg } 1296b8e80941Smrg 1297b8e80941Smrg bld.CMP(dest, op[0], op[1], cond); 1298b8e80941Smrg 1299b8e80941Smrg if (bit_size > 32) { 1300b8e80941Smrg bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); 1301b8e80941Smrg } else if(bit_size < 32) { 1302b8e80941Smrg /* When we convert the result to 32-bit we need to be careful and do 1303b8e80941Smrg * it as a signed conversion to get sign extension (for 32-bit true) 1304b8e80941Smrg */ 1305b8e80941Smrg const brw_reg_type src_type = 1306b8e80941Smrg brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); 1307b8e80941Smrg 1308b8e80941Smrg bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); 1309b8e80941Smrg } 1310b8e80941Smrg break; 1311b8e80941Smrg } 1312b8e80941Smrg 1313b8e80941Smrg case nir_op_ilt32: 1314b8e80941Smrg case nir_op_ult32: 1315b8e80941Smrg case nir_op_ige32: 1316b8e80941Smrg case nir_op_uge32: 1317b8e80941Smrg case nir_op_ieq32: 1318b8e80941Smrg case nir_op_ine32: { 1319b8e80941Smrg fs_reg dest = result; 1320b8e80941Smrg 1321b8e80941Smrg /* On Gen11 we have an additional issue being that src1 cannot be a byte 1322b8e80941Smrg * type. So we convert both operands for the comparison. 1323b8e80941Smrg */ 1324b8e80941Smrg fs_reg temp_op[2]; 1325b8e80941Smrg temp_op[0] = bld.fix_byte_src(op[0]); 1326b8e80941Smrg temp_op[1] = bld.fix_byte_src(op[1]); 1327b8e80941Smrg 1328b8e80941Smrg const uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1329b8e80941Smrg if (bit_size != 32) 1330b8e80941Smrg dest = bld.vgrf(temp_op[0].type, 1); 1331b8e80941Smrg 1332b8e80941Smrg brw_conditional_mod cond; 1333b8e80941Smrg switch (instr->op) { 1334b8e80941Smrg case nir_op_ilt32: 1335b8e80941Smrg case nir_op_ult32: 1336b8e80941Smrg cond = BRW_CONDITIONAL_L; 1337b8e80941Smrg break; 1338b8e80941Smrg case nir_op_ige32: 1339b8e80941Smrg case nir_op_uge32: 1340b8e80941Smrg cond = BRW_CONDITIONAL_GE; 1341b8e80941Smrg break; 1342b8e80941Smrg case nir_op_ieq32: 1343b8e80941Smrg cond = BRW_CONDITIONAL_Z; 1344b8e80941Smrg break; 1345b8e80941Smrg case nir_op_ine32: 1346b8e80941Smrg cond = BRW_CONDITIONAL_NZ; 1347b8e80941Smrg break; 1348b8e80941Smrg default: 1349b8e80941Smrg unreachable("bad opcode"); 1350b8e80941Smrg } 1351b8e80941Smrg bld.CMP(dest, temp_op[0], temp_op[1], cond); 1352b8e80941Smrg 1353b8e80941Smrg if (bit_size > 32) { 1354b8e80941Smrg bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); 1355b8e80941Smrg } else if (bit_size < 32) { 1356b8e80941Smrg /* When we convert the result to 32-bit we need to be careful and do 1357b8e80941Smrg * it as a signed conversion to get sign extension (for 32-bit true) 1358b8e80941Smrg */ 1359b8e80941Smrg const brw_reg_type src_type = 1360b8e80941Smrg brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D); 1361b8e80941Smrg 1362b8e80941Smrg bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); 1363b8e80941Smrg } 1364b8e80941Smrg break; 1365b8e80941Smrg } 1366b8e80941Smrg 1367b8e80941Smrg case nir_op_inot: 1368b8e80941Smrg if (devinfo->gen >= 8) { 1369b8e80941Smrg nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src); 1370b8e80941Smrg 1371b8e80941Smrg if (inot_src_instr != NULL && 1372b8e80941Smrg (inot_src_instr->op == nir_op_ior || 1373b8e80941Smrg inot_src_instr->op == nir_op_ixor || 1374b8e80941Smrg inot_src_instr->op == nir_op_iand) && 1375b8e80941Smrg !inot_src_instr->src[0].abs && 1376b8e80941Smrg !inot_src_instr->src[0].negate && 1377b8e80941Smrg !inot_src_instr->src[1].abs && 1378b8e80941Smrg !inot_src_instr->src[1].negate) { 1379b8e80941Smrg /* The sources of the source logical instruction are now the 1380b8e80941Smrg * sources of the instruction that will be generated. 1381b8e80941Smrg */ 1382b8e80941Smrg prepare_alu_destination_and_sources(bld, inot_src_instr, op, false); 1383b8e80941Smrg resolve_inot_sources(bld, inot_src_instr, op); 1384b8e80941Smrg 1385b8e80941Smrg /* Smash all of the sources and destination to be signed. This 1386b8e80941Smrg * doesn't matter for the operation of the instruction, but cmod 1387b8e80941Smrg * propagation fails on unsigned sources with negation (due to 1388b8e80941Smrg * fs_inst::can_do_cmod returning false). 1389b8e80941Smrg */ 1390b8e80941Smrg result.type = 1391b8e80941Smrg brw_type_for_nir_type(devinfo, 1392b8e80941Smrg (nir_alu_type)(nir_type_int | 1393b8e80941Smrg nir_dest_bit_size(instr->dest.dest))); 1394b8e80941Smrg op[0].type = 1395b8e80941Smrg brw_type_for_nir_type(devinfo, 1396b8e80941Smrg (nir_alu_type)(nir_type_int | 1397b8e80941Smrg nir_src_bit_size(inot_src_instr->src[0].src))); 1398b8e80941Smrg op[1].type = 1399b8e80941Smrg brw_type_for_nir_type(devinfo, 1400b8e80941Smrg (nir_alu_type)(nir_type_int | 1401b8e80941Smrg nir_src_bit_size(inot_src_instr->src[1].src))); 1402b8e80941Smrg 1403b8e80941Smrg /* For XOR, only invert one of the sources. Arbitrarily choose 1404b8e80941Smrg * the first source. 1405b8e80941Smrg */ 1406b8e80941Smrg op[0].negate = !op[0].negate; 1407b8e80941Smrg if (inot_src_instr->op != nir_op_ixor) 1408b8e80941Smrg op[1].negate = !op[1].negate; 1409b8e80941Smrg 1410b8e80941Smrg switch (inot_src_instr->op) { 1411b8e80941Smrg case nir_op_ior: 1412b8e80941Smrg bld.AND(result, op[0], op[1]); 1413b8e80941Smrg return; 1414b8e80941Smrg 1415b8e80941Smrg case nir_op_iand: 1416b8e80941Smrg bld.OR(result, op[0], op[1]); 1417b8e80941Smrg return; 1418b8e80941Smrg 1419b8e80941Smrg case nir_op_ixor: 1420b8e80941Smrg bld.XOR(result, op[0], op[1]); 1421b8e80941Smrg return; 1422b8e80941Smrg 1423b8e80941Smrg default: 1424b8e80941Smrg unreachable("impossible opcode"); 1425b8e80941Smrg } 1426b8e80941Smrg } 1427b8e80941Smrg op[0] = resolve_source_modifiers(op[0]); 1428b8e80941Smrg } 1429b8e80941Smrg bld.NOT(result, op[0]); 1430b8e80941Smrg break; 1431b8e80941Smrg case nir_op_ixor: 1432b8e80941Smrg if (devinfo->gen >= 8) { 1433b8e80941Smrg resolve_inot_sources(bld, instr, op); 1434b8e80941Smrg } 1435b8e80941Smrg bld.XOR(result, op[0], op[1]); 1436b8e80941Smrg break; 1437b8e80941Smrg case nir_op_ior: 1438b8e80941Smrg if (devinfo->gen >= 8) { 1439b8e80941Smrg resolve_inot_sources(bld, instr, op); 1440b8e80941Smrg } 1441b8e80941Smrg bld.OR(result, op[0], op[1]); 1442b8e80941Smrg break; 1443b8e80941Smrg case nir_op_iand: 1444b8e80941Smrg if (devinfo->gen >= 8) { 1445b8e80941Smrg resolve_inot_sources(bld, instr, op); 1446b8e80941Smrg } 1447b8e80941Smrg bld.AND(result, op[0], op[1]); 1448b8e80941Smrg break; 1449b8e80941Smrg 1450b8e80941Smrg case nir_op_fdot2: 1451b8e80941Smrg case nir_op_fdot3: 1452b8e80941Smrg case nir_op_fdot4: 1453b8e80941Smrg case nir_op_b32all_fequal2: 1454b8e80941Smrg case nir_op_b32all_iequal2: 1455b8e80941Smrg case nir_op_b32all_fequal3: 1456b8e80941Smrg case nir_op_b32all_iequal3: 1457b8e80941Smrg case nir_op_b32all_fequal4: 1458b8e80941Smrg case nir_op_b32all_iequal4: 1459b8e80941Smrg case nir_op_b32any_fnequal2: 1460b8e80941Smrg case nir_op_b32any_inequal2: 1461b8e80941Smrg case nir_op_b32any_fnequal3: 1462b8e80941Smrg case nir_op_b32any_inequal3: 1463b8e80941Smrg case nir_op_b32any_fnequal4: 1464b8e80941Smrg case nir_op_b32any_inequal4: 1465b8e80941Smrg unreachable("Lowered by nir_lower_alu_reductions"); 1466b8e80941Smrg 1467b8e80941Smrg case nir_op_fnoise1_1: 1468b8e80941Smrg case nir_op_fnoise1_2: 1469b8e80941Smrg case nir_op_fnoise1_3: 1470b8e80941Smrg case nir_op_fnoise1_4: 1471b8e80941Smrg case nir_op_fnoise2_1: 1472b8e80941Smrg case nir_op_fnoise2_2: 1473b8e80941Smrg case nir_op_fnoise2_3: 1474b8e80941Smrg case nir_op_fnoise2_4: 1475b8e80941Smrg case nir_op_fnoise3_1: 1476b8e80941Smrg case nir_op_fnoise3_2: 1477b8e80941Smrg case nir_op_fnoise3_3: 1478b8e80941Smrg case nir_op_fnoise3_4: 1479b8e80941Smrg case nir_op_fnoise4_1: 1480b8e80941Smrg case nir_op_fnoise4_2: 1481b8e80941Smrg case nir_op_fnoise4_3: 1482b8e80941Smrg case nir_op_fnoise4_4: 1483b8e80941Smrg unreachable("not reached: should be handled by lower_noise"); 1484b8e80941Smrg 1485b8e80941Smrg case nir_op_ldexp: 1486b8e80941Smrg unreachable("not reached: should be handled by ldexp_to_arith()"); 1487b8e80941Smrg 1488b8e80941Smrg case nir_op_fsqrt: 1489b8e80941Smrg inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); 1490b8e80941Smrg inst->saturate = instr->dest.saturate; 1491b8e80941Smrg break; 1492b8e80941Smrg 1493b8e80941Smrg case nir_op_frsq: 1494b8e80941Smrg inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); 1495b8e80941Smrg inst->saturate = instr->dest.saturate; 1496b8e80941Smrg break; 1497b8e80941Smrg 1498b8e80941Smrg case nir_op_i2b32: 1499b8e80941Smrg case nir_op_f2b32: { 1500b8e80941Smrg uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1501b8e80941Smrg if (bit_size == 64) { 1502b8e80941Smrg /* two-argument instructions can't take 64-bit immediates */ 1503b8e80941Smrg fs_reg zero; 1504b8e80941Smrg fs_reg tmp; 1505b8e80941Smrg 1506b8e80941Smrg if (instr->op == nir_op_f2b32) { 1507b8e80941Smrg zero = vgrf(glsl_type::double_type); 1508b8e80941Smrg tmp = vgrf(glsl_type::double_type); 1509b8e80941Smrg bld.MOV(zero, setup_imm_df(bld, 0.0)); 1510b8e80941Smrg } else { 1511b8e80941Smrg zero = vgrf(glsl_type::int64_t_type); 1512b8e80941Smrg tmp = vgrf(glsl_type::int64_t_type); 1513b8e80941Smrg bld.MOV(zero, brw_imm_q(0)); 1514b8e80941Smrg } 1515b8e80941Smrg 1516b8e80941Smrg /* A SIMD16 execution needs to be split in two instructions, so use 1517b8e80941Smrg * a vgrf instead of the flag register as dst so instruction splitting 1518b8e80941Smrg * works 1519b8e80941Smrg */ 1520b8e80941Smrg bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ); 1521b8e80941Smrg bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0)); 1522b8e80941Smrg } else { 1523b8e80941Smrg fs_reg zero; 1524b8e80941Smrg if (bit_size == 32) { 1525b8e80941Smrg zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0); 1526b8e80941Smrg } else { 1527b8e80941Smrg assert(bit_size == 16); 1528b8e80941Smrg zero = instr->op == nir_op_f2b32 ? 1529b8e80941Smrg retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0); 1530b8e80941Smrg } 1531b8e80941Smrg bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ); 1532b8e80941Smrg } 1533b8e80941Smrg break; 1534b8e80941Smrg } 1535b8e80941Smrg 1536b8e80941Smrg case nir_op_ftrunc: 1537b8e80941Smrg inst = bld.RNDZ(result, op[0]); 1538b8e80941Smrg inst->saturate = instr->dest.saturate; 1539b8e80941Smrg break; 1540b8e80941Smrg 1541b8e80941Smrg case nir_op_fceil: { 1542b8e80941Smrg op[0].negate = !op[0].negate; 1543b8e80941Smrg fs_reg temp = vgrf(glsl_type::float_type); 1544b8e80941Smrg bld.RNDD(temp, op[0]); 1545b8e80941Smrg temp.negate = true; 1546b8e80941Smrg inst = bld.MOV(result, temp); 1547b8e80941Smrg inst->saturate = instr->dest.saturate; 1548b8e80941Smrg break; 1549b8e80941Smrg } 1550b8e80941Smrg case nir_op_ffloor: 1551b8e80941Smrg inst = bld.RNDD(result, op[0]); 1552b8e80941Smrg inst->saturate = instr->dest.saturate; 1553b8e80941Smrg break; 1554b8e80941Smrg case nir_op_ffract: 1555b8e80941Smrg inst = bld.FRC(result, op[0]); 1556b8e80941Smrg inst->saturate = instr->dest.saturate; 1557b8e80941Smrg break; 1558b8e80941Smrg case nir_op_fround_even: 1559b8e80941Smrg inst = bld.RNDE(result, op[0]); 1560b8e80941Smrg inst->saturate = instr->dest.saturate; 1561b8e80941Smrg break; 1562b8e80941Smrg 1563b8e80941Smrg case nir_op_fquantize2f16: { 1564b8e80941Smrg fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); 1565b8e80941Smrg fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); 1566b8e80941Smrg fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); 1567b8e80941Smrg 1568b8e80941Smrg /* The destination stride must be at least as big as the source stride. */ 1569b8e80941Smrg tmp16.type = BRW_REGISTER_TYPE_W; 1570b8e80941Smrg tmp16.stride = 2; 1571b8e80941Smrg 1572b8e80941Smrg /* Check for denormal */ 1573b8e80941Smrg fs_reg abs_src0 = op[0]; 1574b8e80941Smrg abs_src0.abs = true; 1575b8e80941Smrg bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), 1576b8e80941Smrg BRW_CONDITIONAL_L); 1577b8e80941Smrg /* Get the appropriately signed zero */ 1578b8e80941Smrg bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), 1579b8e80941Smrg retype(op[0], BRW_REGISTER_TYPE_UD), 1580b8e80941Smrg brw_imm_ud(0x80000000)); 1581b8e80941Smrg /* Do the actual F32 -> F16 -> F32 conversion */ 1582b8e80941Smrg bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); 1583b8e80941Smrg bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); 1584b8e80941Smrg /* Select that or zero based on normal status */ 1585b8e80941Smrg inst = bld.SEL(result, zero, tmp32); 1586b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 1587b8e80941Smrg inst->saturate = instr->dest.saturate; 1588b8e80941Smrg break; 1589b8e80941Smrg } 1590b8e80941Smrg 1591b8e80941Smrg case nir_op_imin: 1592b8e80941Smrg case nir_op_umin: 1593b8e80941Smrg case nir_op_fmin: 1594b8e80941Smrg inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); 1595b8e80941Smrg inst->saturate = instr->dest.saturate; 1596b8e80941Smrg break; 1597b8e80941Smrg 1598b8e80941Smrg case nir_op_imax: 1599b8e80941Smrg case nir_op_umax: 1600b8e80941Smrg case nir_op_fmax: 1601b8e80941Smrg inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); 1602b8e80941Smrg inst->saturate = instr->dest.saturate; 1603b8e80941Smrg break; 1604b8e80941Smrg 1605b8e80941Smrg case nir_op_pack_snorm_2x16: 1606b8e80941Smrg case nir_op_pack_snorm_4x8: 1607b8e80941Smrg case nir_op_pack_unorm_2x16: 1608b8e80941Smrg case nir_op_pack_unorm_4x8: 1609b8e80941Smrg case nir_op_unpack_snorm_2x16: 1610b8e80941Smrg case nir_op_unpack_snorm_4x8: 1611b8e80941Smrg case nir_op_unpack_unorm_2x16: 1612b8e80941Smrg case nir_op_unpack_unorm_4x8: 1613b8e80941Smrg case nir_op_unpack_half_2x16: 1614b8e80941Smrg case nir_op_pack_half_2x16: 1615b8e80941Smrg unreachable("not reached: should be handled by lower_packing_builtins"); 1616b8e80941Smrg 1617b8e80941Smrg case nir_op_unpack_half_2x16_split_x: 1618b8e80941Smrg inst = bld.emit(BRW_OPCODE_F16TO32, result, 1619b8e80941Smrg subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); 1620b8e80941Smrg inst->saturate = instr->dest.saturate; 1621b8e80941Smrg break; 1622b8e80941Smrg case nir_op_unpack_half_2x16_split_y: 1623b8e80941Smrg inst = bld.emit(BRW_OPCODE_F16TO32, result, 1624b8e80941Smrg subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); 1625b8e80941Smrg inst->saturate = instr->dest.saturate; 1626b8e80941Smrg break; 1627b8e80941Smrg 1628b8e80941Smrg case nir_op_pack_64_2x32_split: 1629b8e80941Smrg case nir_op_pack_32_2x16_split: 1630b8e80941Smrg bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); 1631b8e80941Smrg break; 1632b8e80941Smrg 1633b8e80941Smrg case nir_op_unpack_64_2x32_split_x: 1634b8e80941Smrg case nir_op_unpack_64_2x32_split_y: { 1635b8e80941Smrg if (instr->op == nir_op_unpack_64_2x32_split_x) 1636b8e80941Smrg bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); 1637b8e80941Smrg else 1638b8e80941Smrg bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); 1639b8e80941Smrg break; 1640b8e80941Smrg } 1641b8e80941Smrg 1642b8e80941Smrg case nir_op_unpack_32_2x16_split_x: 1643b8e80941Smrg case nir_op_unpack_32_2x16_split_y: { 1644b8e80941Smrg if (instr->op == nir_op_unpack_32_2x16_split_x) 1645b8e80941Smrg bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); 1646b8e80941Smrg else 1647b8e80941Smrg bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); 1648b8e80941Smrg break; 1649b8e80941Smrg } 1650b8e80941Smrg 1651b8e80941Smrg case nir_op_fpow: 1652b8e80941Smrg inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); 1653b8e80941Smrg inst->saturate = instr->dest.saturate; 1654b8e80941Smrg break; 1655b8e80941Smrg 1656b8e80941Smrg case nir_op_bitfield_reverse: 1657b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1658b8e80941Smrg bld.BFREV(result, op[0]); 1659b8e80941Smrg break; 1660b8e80941Smrg 1661b8e80941Smrg case nir_op_bit_count: 1662b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1663b8e80941Smrg bld.CBIT(result, op[0]); 1664b8e80941Smrg break; 1665b8e80941Smrg 1666b8e80941Smrg case nir_op_ufind_msb: { 1667b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1668b8e80941Smrg emit_find_msb_using_lzd(bld, result, op[0], false); 1669b8e80941Smrg break; 1670b8e80941Smrg } 1671b8e80941Smrg 1672b8e80941Smrg case nir_op_ifind_msb: { 1673b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1674b8e80941Smrg 1675b8e80941Smrg if (devinfo->gen < 7) { 1676b8e80941Smrg emit_find_msb_using_lzd(bld, result, op[0], true); 1677b8e80941Smrg } else { 1678b8e80941Smrg bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); 1679b8e80941Smrg 1680b8e80941Smrg /* FBH counts from the MSB side, while GLSL's findMSB() wants the 1681b8e80941Smrg * count from the LSB side. If FBH didn't return an error 1682b8e80941Smrg * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB 1683b8e80941Smrg * count into an LSB count. 1684b8e80941Smrg */ 1685b8e80941Smrg bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); 1686b8e80941Smrg 1687b8e80941Smrg inst = bld.ADD(result, result, brw_imm_d(31)); 1688b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 1689b8e80941Smrg inst->src[0].negate = true; 1690b8e80941Smrg } 1691b8e80941Smrg break; 1692b8e80941Smrg } 1693b8e80941Smrg 1694b8e80941Smrg case nir_op_find_lsb: 1695b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1696b8e80941Smrg 1697b8e80941Smrg if (devinfo->gen < 7) { 1698b8e80941Smrg fs_reg temp = vgrf(glsl_type::int_type); 1699b8e80941Smrg 1700b8e80941Smrg /* (x & -x) generates a value that consists of only the LSB of x. 1701b8e80941Smrg * For all powers of 2, findMSB(y) == findLSB(y). 1702b8e80941Smrg */ 1703b8e80941Smrg fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D); 1704b8e80941Smrg fs_reg negated_src = src; 1705b8e80941Smrg 1706b8e80941Smrg /* One must be negated, and the other must be non-negated. It 1707b8e80941Smrg * doesn't matter which is which. 1708b8e80941Smrg */ 1709b8e80941Smrg negated_src.negate = true; 1710b8e80941Smrg src.negate = false; 1711b8e80941Smrg 1712b8e80941Smrg bld.AND(temp, src, negated_src); 1713b8e80941Smrg emit_find_msb_using_lzd(bld, result, temp, false); 1714b8e80941Smrg } else { 1715b8e80941Smrg bld.FBL(result, op[0]); 1716b8e80941Smrg } 1717b8e80941Smrg break; 1718b8e80941Smrg 1719b8e80941Smrg case nir_op_ubitfield_extract: 1720b8e80941Smrg case nir_op_ibitfield_extract: 1721b8e80941Smrg unreachable("should have been lowered"); 1722b8e80941Smrg case nir_op_ubfe: 1723b8e80941Smrg case nir_op_ibfe: 1724b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1725b8e80941Smrg bld.BFE(result, op[2], op[1], op[0]); 1726b8e80941Smrg break; 1727b8e80941Smrg case nir_op_bfm: 1728b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1729b8e80941Smrg bld.BFI1(result, op[0], op[1]); 1730b8e80941Smrg break; 1731b8e80941Smrg case nir_op_bfi: 1732b8e80941Smrg assert(nir_dest_bit_size(instr->dest.dest) < 64); 1733b8e80941Smrg bld.BFI2(result, op[0], op[1], op[2]); 1734b8e80941Smrg break; 1735b8e80941Smrg 1736b8e80941Smrg case nir_op_bitfield_insert: 1737b8e80941Smrg unreachable("not reached: should have been lowered"); 1738b8e80941Smrg 1739b8e80941Smrg case nir_op_ishl: 1740b8e80941Smrg bld.SHL(result, op[0], op[1]); 1741b8e80941Smrg break; 1742b8e80941Smrg case nir_op_ishr: 1743b8e80941Smrg bld.ASR(result, op[0], op[1]); 1744b8e80941Smrg break; 1745b8e80941Smrg case nir_op_ushr: 1746b8e80941Smrg bld.SHR(result, op[0], op[1]); 1747b8e80941Smrg break; 1748b8e80941Smrg 1749b8e80941Smrg case nir_op_pack_half_2x16_split: 1750b8e80941Smrg bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1751b8e80941Smrg break; 1752b8e80941Smrg 1753b8e80941Smrg case nir_op_ffma: 1754b8e80941Smrg inst = bld.MAD(result, op[2], op[1], op[0]); 1755b8e80941Smrg inst->saturate = instr->dest.saturate; 1756b8e80941Smrg break; 1757b8e80941Smrg 1758b8e80941Smrg case nir_op_flrp: 1759b8e80941Smrg inst = bld.LRP(result, op[0], op[1], op[2]); 1760b8e80941Smrg inst->saturate = instr->dest.saturate; 1761b8e80941Smrg break; 1762b8e80941Smrg 1763b8e80941Smrg case nir_op_b32csel: 1764b8e80941Smrg if (optimize_frontfacing_ternary(instr, result)) 1765b8e80941Smrg return; 1766b8e80941Smrg 1767b8e80941Smrg bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1768b8e80941Smrg inst = bld.SEL(result, op[1], op[2]); 1769b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 1770b8e80941Smrg break; 1771b8e80941Smrg 1772b8e80941Smrg case nir_op_extract_u8: 1773b8e80941Smrg case nir_op_extract_i8: { 1774b8e80941Smrg unsigned byte = nir_src_as_uint(instr->src[1].src); 1775b8e80941Smrg 1776b8e80941Smrg /* The PRMs say: 1777b8e80941Smrg * 1778b8e80941Smrg * BDW+ 1779b8e80941Smrg * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. 1780b8e80941Smrg * Use two instructions and a word or DWord intermediate integer type. 1781b8e80941Smrg */ 1782b8e80941Smrg if (nir_dest_bit_size(instr->dest.dest) == 64) { 1783b8e80941Smrg const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1784b8e80941Smrg 1785b8e80941Smrg if (instr->op == nir_op_extract_i8) { 1786b8e80941Smrg /* If we need to sign extend, extract to a word first */ 1787b8e80941Smrg fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); 1788b8e80941Smrg bld.MOV(w_temp, subscript(op[0], type, byte)); 1789b8e80941Smrg bld.MOV(result, w_temp); 1790b8e80941Smrg } else if (byte & 1) { 1791b8e80941Smrg /* Extract the high byte from the word containing the desired byte 1792b8e80941Smrg * offset. 1793b8e80941Smrg */ 1794b8e80941Smrg bld.SHR(result, 1795b8e80941Smrg subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1796b8e80941Smrg brw_imm_uw(8)); 1797b8e80941Smrg } else { 1798b8e80941Smrg /* Otherwise use an AND with 0xff and a word type */ 1799b8e80941Smrg bld.AND(result, 1800b8e80941Smrg subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1801b8e80941Smrg brw_imm_uw(0xff)); 1802b8e80941Smrg } 1803b8e80941Smrg } else { 1804b8e80941Smrg const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1805b8e80941Smrg bld.MOV(result, subscript(op[0], type, byte)); 1806b8e80941Smrg } 1807b8e80941Smrg break; 1808b8e80941Smrg } 1809b8e80941Smrg 1810b8e80941Smrg case nir_op_extract_u16: 1811b8e80941Smrg case nir_op_extract_i16: { 1812b8e80941Smrg const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 1813b8e80941Smrg unsigned word = nir_src_as_uint(instr->src[1].src); 1814b8e80941Smrg bld.MOV(result, subscript(op[0], type, word)); 1815b8e80941Smrg break; 1816b8e80941Smrg } 1817b8e80941Smrg 1818b8e80941Smrg default: 1819b8e80941Smrg unreachable("unhandled instruction"); 1820b8e80941Smrg } 1821b8e80941Smrg 1822b8e80941Smrg /* If we need to do a boolean resolve, replace the result with -(x & 1) 1823b8e80941Smrg * to sign extend the low bit to 0/~0 1824b8e80941Smrg */ 1825b8e80941Smrg if (devinfo->gen <= 5 && 1826b8e80941Smrg (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 1827b8e80941Smrg fs_reg masked = vgrf(glsl_type::int_type); 1828b8e80941Smrg bld.AND(masked, result, brw_imm_d(1)); 1829b8e80941Smrg masked.negate = true; 1830b8e80941Smrg bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 1831b8e80941Smrg } 1832b8e80941Smrg} 1833b8e80941Smrg 1834b8e80941Smrgvoid 1835b8e80941Smrgfs_visitor::nir_emit_load_const(const fs_builder &bld, 1836b8e80941Smrg nir_load_const_instr *instr) 1837b8e80941Smrg{ 1838b8e80941Smrg const brw_reg_type reg_type = 1839b8e80941Smrg brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); 1840b8e80941Smrg fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 1841b8e80941Smrg 1842b8e80941Smrg switch (instr->def.bit_size) { 1843b8e80941Smrg case 8: 1844b8e80941Smrg for (unsigned i = 0; i < instr->def.num_components; i++) 1845b8e80941Smrg bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); 1846b8e80941Smrg break; 1847b8e80941Smrg 1848b8e80941Smrg case 16: 1849b8e80941Smrg for (unsigned i = 0; i < instr->def.num_components; i++) 1850b8e80941Smrg bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); 1851b8e80941Smrg break; 1852b8e80941Smrg 1853b8e80941Smrg case 32: 1854b8e80941Smrg for (unsigned i = 0; i < instr->def.num_components; i++) 1855b8e80941Smrg bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); 1856b8e80941Smrg break; 1857b8e80941Smrg 1858b8e80941Smrg case 64: 1859b8e80941Smrg assert(devinfo->gen >= 7); 1860b8e80941Smrg if (devinfo->gen == 7) { 1861b8e80941Smrg /* We don't get 64-bit integer types until gen8 */ 1862b8e80941Smrg for (unsigned i = 0; i < instr->def.num_components; i++) { 1863b8e80941Smrg bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), 1864b8e80941Smrg setup_imm_df(bld, instr->value[i].f64)); 1865b8e80941Smrg } 1866b8e80941Smrg } else { 1867b8e80941Smrg for (unsigned i = 0; i < instr->def.num_components; i++) 1868b8e80941Smrg bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); 1869b8e80941Smrg } 1870b8e80941Smrg break; 1871b8e80941Smrg 1872b8e80941Smrg default: 1873b8e80941Smrg unreachable("Invalid bit size"); 1874b8e80941Smrg } 1875b8e80941Smrg 1876b8e80941Smrg nir_ssa_values[instr->def.index] = reg; 1877b8e80941Smrg} 1878b8e80941Smrg 1879b8e80941Smrgfs_reg 1880b8e80941Smrgfs_visitor::get_nir_src(const nir_src &src) 1881b8e80941Smrg{ 1882b8e80941Smrg fs_reg reg; 1883b8e80941Smrg if (src.is_ssa) { 1884b8e80941Smrg if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) { 1885b8e80941Smrg const brw_reg_type reg_type = 1886b8e80941Smrg brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D); 1887b8e80941Smrg reg = bld.vgrf(reg_type, src.ssa->num_components); 1888b8e80941Smrg } else { 1889b8e80941Smrg reg = nir_ssa_values[src.ssa->index]; 1890b8e80941Smrg } 1891b8e80941Smrg } else { 1892b8e80941Smrg /* We don't handle indirects on locals */ 1893b8e80941Smrg assert(src.reg.indirect == NULL); 1894b8e80941Smrg reg = offset(nir_locals[src.reg.reg->index], bld, 1895b8e80941Smrg src.reg.base_offset * src.reg.reg->num_components); 1896b8e80941Smrg } 1897b8e80941Smrg 1898b8e80941Smrg if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) { 1899b8e80941Smrg /* The only 64-bit type available on gen7 is DF, so use that. */ 1900b8e80941Smrg reg.type = BRW_REGISTER_TYPE_DF; 1901b8e80941Smrg } else { 1902b8e80941Smrg /* To avoid floating-point denorm flushing problems, set the type by 1903b8e80941Smrg * default to an integer type - instructions that need floating point 1904b8e80941Smrg * semantics will set this to F if they need to 1905b8e80941Smrg */ 1906b8e80941Smrg reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 1907b8e80941Smrg BRW_REGISTER_TYPE_D); 1908b8e80941Smrg } 1909b8e80941Smrg 1910b8e80941Smrg return reg; 1911b8e80941Smrg} 1912b8e80941Smrg 1913b8e80941Smrg/** 1914b8e80941Smrg * Return an IMM for constants; otherwise call get_nir_src() as normal. 1915b8e80941Smrg * 1916b8e80941Smrg * This function should not be called on any value which may be 64 bits. 1917b8e80941Smrg * We could theoretically support 64-bit on gen8+ but we choose not to 1918b8e80941Smrg * because it wouldn't work in general (no gen7 support) and there are 1919b8e80941Smrg * enough restrictions in 64-bit immediates that you can't take the return 1920b8e80941Smrg * value and treat it the same as the result of get_nir_src(). 1921b8e80941Smrg */ 1922b8e80941Smrgfs_reg 1923b8e80941Smrgfs_visitor::get_nir_src_imm(const nir_src &src) 1924b8e80941Smrg{ 1925b8e80941Smrg assert(nir_src_bit_size(src) == 32); 1926b8e80941Smrg return nir_src_is_const(src) ? 1927b8e80941Smrg fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src); 1928b8e80941Smrg} 1929b8e80941Smrg 1930b8e80941Smrgfs_reg 1931b8e80941Smrgfs_visitor::get_nir_dest(const nir_dest &dest) 1932b8e80941Smrg{ 1933b8e80941Smrg if (dest.is_ssa) { 1934b8e80941Smrg const brw_reg_type reg_type = 1935b8e80941Smrg brw_reg_type_from_bit_size(dest.ssa.bit_size, 1936b8e80941Smrg dest.ssa.bit_size == 8 ? 1937b8e80941Smrg BRW_REGISTER_TYPE_D : 1938b8e80941Smrg BRW_REGISTER_TYPE_F); 1939b8e80941Smrg nir_ssa_values[dest.ssa.index] = 1940b8e80941Smrg bld.vgrf(reg_type, dest.ssa.num_components); 1941b8e80941Smrg return nir_ssa_values[dest.ssa.index]; 1942b8e80941Smrg } else { 1943b8e80941Smrg /* We don't handle indirects on locals */ 1944b8e80941Smrg assert(dest.reg.indirect == NULL); 1945b8e80941Smrg return offset(nir_locals[dest.reg.reg->index], bld, 1946b8e80941Smrg dest.reg.base_offset * dest.reg.reg->num_components); 1947b8e80941Smrg } 1948b8e80941Smrg} 1949b8e80941Smrg 1950b8e80941Smrgvoid 1951b8e80941Smrgfs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 1952b8e80941Smrg unsigned wr_mask) 1953b8e80941Smrg{ 1954b8e80941Smrg for (unsigned i = 0; i < 4; i++) { 1955b8e80941Smrg if (!((wr_mask >> i) & 1)) 1956b8e80941Smrg continue; 1957b8e80941Smrg 1958b8e80941Smrg fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 1959b8e80941Smrg new_inst->dst = offset(new_inst->dst, bld, i); 1960b8e80941Smrg for (unsigned j = 0; j < new_inst->sources; j++) 1961b8e80941Smrg if (new_inst->src[j].file == VGRF) 1962b8e80941Smrg new_inst->src[j] = offset(new_inst->src[j], bld, i); 1963b8e80941Smrg 1964b8e80941Smrg bld.emit(new_inst); 1965b8e80941Smrg } 1966b8e80941Smrg} 1967b8e80941Smrg 1968b8e80941Smrgstatic fs_inst * 1969b8e80941Smrgemit_pixel_interpolater_send(const fs_builder &bld, 1970b8e80941Smrg enum opcode opcode, 1971b8e80941Smrg const fs_reg &dst, 1972b8e80941Smrg const fs_reg &src, 1973b8e80941Smrg const fs_reg &desc, 1974b8e80941Smrg glsl_interp_mode interpolation) 1975b8e80941Smrg{ 1976b8e80941Smrg struct brw_wm_prog_data *wm_prog_data = 1977b8e80941Smrg brw_wm_prog_data(bld.shader->stage_prog_data); 1978b8e80941Smrg 1979b8e80941Smrg fs_inst *inst = bld.emit(opcode, dst, src, desc); 1980b8e80941Smrg /* 2 floats per slot returned */ 1981b8e80941Smrg inst->size_written = 2 * dst.component_size(inst->exec_size); 1982b8e80941Smrg inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; 1983b8e80941Smrg 1984b8e80941Smrg wm_prog_data->pulls_bary = true; 1985b8e80941Smrg 1986b8e80941Smrg return inst; 1987b8e80941Smrg} 1988b8e80941Smrg 1989b8e80941Smrg/** 1990b8e80941Smrg * Computes 1 << x, given a D/UD register containing some value x. 1991b8e80941Smrg */ 1992b8e80941Smrgstatic fs_reg 1993b8e80941Smrgintexp2(const fs_builder &bld, const fs_reg &x) 1994b8e80941Smrg{ 1995b8e80941Smrg assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 1996b8e80941Smrg 1997b8e80941Smrg fs_reg result = bld.vgrf(x.type, 1); 1998b8e80941Smrg fs_reg one = bld.vgrf(x.type, 1); 1999b8e80941Smrg 2000b8e80941Smrg bld.MOV(one, retype(brw_imm_d(1), one.type)); 2001b8e80941Smrg bld.SHL(result, one, x); 2002b8e80941Smrg return result; 2003b8e80941Smrg} 2004b8e80941Smrg 2005b8e80941Smrgvoid 2006b8e80941Smrgfs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 2007b8e80941Smrg{ 2008b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 2009b8e80941Smrg 2010b8e80941Smrg struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2011b8e80941Smrg 2012b8e80941Smrg if (gs_compile->control_data_header_size_bits == 0) 2013b8e80941Smrg return; 2014b8e80941Smrg 2015b8e80941Smrg /* We can only do EndPrimitive() functionality when the control data 2016b8e80941Smrg * consists of cut bits. Fortunately, the only time it isn't is when the 2017b8e80941Smrg * output type is points, in which case EndPrimitive() is a no-op. 2018b8e80941Smrg */ 2019b8e80941Smrg if (gs_prog_data->control_data_format != 2020b8e80941Smrg GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 2021b8e80941Smrg return; 2022b8e80941Smrg } 2023b8e80941Smrg 2024b8e80941Smrg /* Cut bits use one bit per vertex. */ 2025b8e80941Smrg assert(gs_compile->control_data_bits_per_vertex == 1); 2026b8e80941Smrg 2027b8e80941Smrg fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2028b8e80941Smrg vertex_count.type = BRW_REGISTER_TYPE_UD; 2029b8e80941Smrg 2030b8e80941Smrg /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 2031b8e80941Smrg * vertex n, 0 otherwise. So all we need to do here is mark bit 2032b8e80941Smrg * (vertex_count - 1) % 32 in the cut_bits register to indicate that 2033b8e80941Smrg * EndPrimitive() was called after emitting vertex (vertex_count - 1); 2034b8e80941Smrg * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 2035b8e80941Smrg * 2036b8e80941Smrg * Note that if EndPrimitive() is called before emitting any vertices, this 2037b8e80941Smrg * will cause us to set bit 31 of the control_data_bits register to 1. 2038b8e80941Smrg * That's fine because: 2039b8e80941Smrg * 2040b8e80941Smrg * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 2041b8e80941Smrg * output, so the hardware will ignore cut bit 31. 2042b8e80941Smrg * 2043b8e80941Smrg * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 2044b8e80941Smrg * last vertex, so setting cut bit 31 has no effect (since the primitive 2045b8e80941Smrg * is automatically ended when the GS terminates). 2046b8e80941Smrg * 2047b8e80941Smrg * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 2048b8e80941Smrg * control_data_bits register to 0 when the first vertex is emitted. 2049b8e80941Smrg */ 2050b8e80941Smrg 2051b8e80941Smrg const fs_builder abld = bld.annotate("end primitive"); 2052b8e80941Smrg 2053b8e80941Smrg /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 2054b8e80941Smrg fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2055b8e80941Smrg abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2056b8e80941Smrg fs_reg mask = intexp2(abld, prev_count); 2057b8e80941Smrg /* Note: we're relying on the fact that the GEN SHL instruction only pays 2058b8e80941Smrg * attention to the lower 5 bits of its second source argument, so on this 2059b8e80941Smrg * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 2060b8e80941Smrg * ((vertex_count - 1) % 32). 2061b8e80941Smrg */ 2062b8e80941Smrg abld.OR(this->control_data_bits, this->control_data_bits, mask); 2063b8e80941Smrg} 2064b8e80941Smrg 2065b8e80941Smrgvoid 2066b8e80941Smrgfs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 2067b8e80941Smrg{ 2068b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 2069b8e80941Smrg assert(gs_compile->control_data_bits_per_vertex != 0); 2070b8e80941Smrg 2071b8e80941Smrg struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2072b8e80941Smrg 2073b8e80941Smrg const fs_builder abld = bld.annotate("emit control data bits"); 2074b8e80941Smrg const fs_builder fwa_bld = bld.exec_all(); 2075b8e80941Smrg 2076b8e80941Smrg /* We use a single UD register to accumulate control data bits (32 bits 2077b8e80941Smrg * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 2078b8e80941Smrg * at a time. 2079b8e80941Smrg * 2080b8e80941Smrg * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 2081b8e80941Smrg * We have select a 128-bit group via the Global and Per-Slot Offsets, then 2082b8e80941Smrg * use the Channel Mask phase to enable/disable which DWord within that 2083b8e80941Smrg * group to write. (Remember, different SIMD8 channels may have emitted 2084b8e80941Smrg * different numbers of vertices, so we may need per-slot offsets.) 2085b8e80941Smrg * 2086b8e80941Smrg * Channel masking presents an annoying problem: we may have to replicate 2087b8e80941Smrg * the data up to 4 times: 2088b8e80941Smrg * 2089b8e80941Smrg * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 2090b8e80941Smrg * 2091b8e80941Smrg * To avoid penalizing shaders that emit a small number of vertices, we 2092b8e80941Smrg * can avoid these sometimes: if the size of the control data header is 2093b8e80941Smrg * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 2094b8e80941Smrg * land in the same 128-bit group, so we can skip per-slot offsets. 2095b8e80941Smrg * 2096b8e80941Smrg * Similarly, if the control data header is <= 32 bits, there is only one 2097b8e80941Smrg * DWord, so we can skip channel masks. 2098b8e80941Smrg */ 2099b8e80941Smrg enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 2100b8e80941Smrg 2101b8e80941Smrg fs_reg channel_mask, per_slot_offset; 2102b8e80941Smrg 2103b8e80941Smrg if (gs_compile->control_data_header_size_bits > 32) { 2104b8e80941Smrg opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2105b8e80941Smrg channel_mask = vgrf(glsl_type::uint_type); 2106b8e80941Smrg } 2107b8e80941Smrg 2108b8e80941Smrg if (gs_compile->control_data_header_size_bits > 128) { 2109b8e80941Smrg opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; 2110b8e80941Smrg per_slot_offset = vgrf(glsl_type::uint_type); 2111b8e80941Smrg } 2112b8e80941Smrg 2113b8e80941Smrg /* Figure out which DWord we're trying to write to using the formula: 2114b8e80941Smrg * 2115b8e80941Smrg * dword_index = (vertex_count - 1) * bits_per_vertex / 32 2116b8e80941Smrg * 2117b8e80941Smrg * Since bits_per_vertex is a power of two, and is known at compile 2118b8e80941Smrg * time, this can be optimized to: 2119b8e80941Smrg * 2120b8e80941Smrg * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 2121b8e80941Smrg */ 2122b8e80941Smrg if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { 2123b8e80941Smrg fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2124b8e80941Smrg fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2125b8e80941Smrg abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2126b8e80941Smrg unsigned log2_bits_per_vertex = 2127b8e80941Smrg util_last_bit(gs_compile->control_data_bits_per_vertex); 2128b8e80941Smrg abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 2129b8e80941Smrg 2130b8e80941Smrg if (per_slot_offset.file != BAD_FILE) { 2131b8e80941Smrg /* Set the per-slot offset to dword_index / 4, so that we'll write to 2132b8e80941Smrg * the appropriate OWord within the control data header. 2133b8e80941Smrg */ 2134b8e80941Smrg abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 2135b8e80941Smrg } 2136b8e80941Smrg 2137b8e80941Smrg /* Set the channel masks to 1 << (dword_index % 4), so that we'll 2138b8e80941Smrg * write to the appropriate DWORD within the OWORD. 2139b8e80941Smrg */ 2140b8e80941Smrg fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2141b8e80941Smrg fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 2142b8e80941Smrg channel_mask = intexp2(fwa_bld, channel); 2143b8e80941Smrg /* Then the channel masks need to be in bits 23:16. */ 2144b8e80941Smrg fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 2145b8e80941Smrg } 2146b8e80941Smrg 2147b8e80941Smrg /* Store the control data bits in the message payload and send it. */ 2148b8e80941Smrg unsigned mlen = 2; 2149b8e80941Smrg if (channel_mask.file != BAD_FILE) 2150b8e80941Smrg mlen += 4; /* channel masks, plus 3 extra copies of the data */ 2151b8e80941Smrg if (per_slot_offset.file != BAD_FILE) 2152b8e80941Smrg mlen++; 2153b8e80941Smrg 2154b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2155b8e80941Smrg fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); 2156b8e80941Smrg unsigned i = 0; 2157b8e80941Smrg sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 2158b8e80941Smrg if (per_slot_offset.file != BAD_FILE) 2159b8e80941Smrg sources[i++] = per_slot_offset; 2160b8e80941Smrg if (channel_mask.file != BAD_FILE) 2161b8e80941Smrg sources[i++] = channel_mask; 2162b8e80941Smrg while (i < mlen) { 2163b8e80941Smrg sources[i++] = this->control_data_bits; 2164b8e80941Smrg } 2165b8e80941Smrg 2166b8e80941Smrg abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); 2167b8e80941Smrg fs_inst *inst = abld.emit(opcode, reg_undef, payload); 2168b8e80941Smrg inst->mlen = mlen; 2169b8e80941Smrg /* We need to increment Global Offset by 256-bits to make room for 2170b8e80941Smrg * Broadwell's extra "Vertex Count" payload at the beginning of the 2171b8e80941Smrg * URB entry. Since this is an OWord message, Global Offset is counted 2172b8e80941Smrg * in 128-bit units, so we must set it to 2. 2173b8e80941Smrg */ 2174b8e80941Smrg if (gs_prog_data->static_vertex_count == -1) 2175b8e80941Smrg inst->offset = 2; 2176b8e80941Smrg} 2177b8e80941Smrg 2178b8e80941Smrgvoid 2179b8e80941Smrgfs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 2180b8e80941Smrg unsigned stream_id) 2181b8e80941Smrg{ 2182b8e80941Smrg /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 2183b8e80941Smrg 2184b8e80941Smrg /* Note: we are calling this *before* increasing vertex_count, so 2185b8e80941Smrg * this->vertex_count == vertex_count - 1 in the formula above. 2186b8e80941Smrg */ 2187b8e80941Smrg 2188b8e80941Smrg /* Stream mode uses 2 bits per vertex */ 2189b8e80941Smrg assert(gs_compile->control_data_bits_per_vertex == 2); 2190b8e80941Smrg 2191b8e80941Smrg /* Must be a valid stream */ 2192b8e80941Smrg assert(stream_id < MAX_VERTEX_STREAMS); 2193b8e80941Smrg 2194b8e80941Smrg /* Control data bits are initialized to 0 so we don't have to set any 2195b8e80941Smrg * bits when sending vertices to stream 0. 2196b8e80941Smrg */ 2197b8e80941Smrg if (stream_id == 0) 2198b8e80941Smrg return; 2199b8e80941Smrg 2200b8e80941Smrg const fs_builder abld = bld.annotate("set stream control data bits", NULL); 2201b8e80941Smrg 2202b8e80941Smrg /* reg::sid = stream_id */ 2203b8e80941Smrg fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2204b8e80941Smrg abld.MOV(sid, brw_imm_ud(stream_id)); 2205b8e80941Smrg 2206b8e80941Smrg /* reg:shift_count = 2 * (vertex_count - 1) */ 2207b8e80941Smrg fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2208b8e80941Smrg abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 2209b8e80941Smrg 2210b8e80941Smrg /* Note: we're relying on the fact that the GEN SHL instruction only pays 2211b8e80941Smrg * attention to the lower 5 bits of its second source argument, so on this 2212b8e80941Smrg * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 2213b8e80941Smrg * stream_id << ((2 * (vertex_count - 1)) % 32). 2214b8e80941Smrg */ 2215b8e80941Smrg fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2216b8e80941Smrg abld.SHL(mask, sid, shift_count); 2217b8e80941Smrg abld.OR(this->control_data_bits, this->control_data_bits, mask); 2218b8e80941Smrg} 2219b8e80941Smrg 2220b8e80941Smrgvoid 2221b8e80941Smrgfs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 2222b8e80941Smrg unsigned stream_id) 2223b8e80941Smrg{ 2224b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 2225b8e80941Smrg 2226b8e80941Smrg struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2227b8e80941Smrg 2228b8e80941Smrg fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2229b8e80941Smrg vertex_count.type = BRW_REGISTER_TYPE_UD; 2230b8e80941Smrg 2231b8e80941Smrg /* Haswell and later hardware ignores the "Render Stream Select" bits 2232b8e80941Smrg * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 2233b8e80941Smrg * and instead sends all primitives down the pipeline for rasterization. 2234b8e80941Smrg * If the SOL stage is enabled, "Render Stream Select" is honored and 2235b8e80941Smrg * primitives bound to non-zero streams are discarded after stream output. 2236b8e80941Smrg * 2237b8e80941Smrg * Since the only purpose of primives sent to non-zero streams is to 2238b8e80941Smrg * be recorded by transform feedback, we can simply discard all geometry 2239b8e80941Smrg * bound to these streams when transform feedback is disabled. 2240b8e80941Smrg */ 2241b8e80941Smrg if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) 2242b8e80941Smrg return; 2243b8e80941Smrg 2244b8e80941Smrg /* If we're outputting 32 control data bits or less, then we can wait 2245b8e80941Smrg * until the shader is over to output them all. Otherwise we need to 2246b8e80941Smrg * output them as we go. Now is the time to do it, since we're about to 2247b8e80941Smrg * output the vertex_count'th vertex, so it's guaranteed that the 2248b8e80941Smrg * control data bits associated with the (vertex_count - 1)th vertex are 2249b8e80941Smrg * correct. 2250b8e80941Smrg */ 2251b8e80941Smrg if (gs_compile->control_data_header_size_bits > 32) { 2252b8e80941Smrg const fs_builder abld = 2253b8e80941Smrg bld.annotate("emit vertex: emit control data bits"); 2254b8e80941Smrg 2255b8e80941Smrg /* Only emit control data bits if we've finished accumulating a batch 2256b8e80941Smrg * of 32 bits. This is the case when: 2257b8e80941Smrg * 2258b8e80941Smrg * (vertex_count * bits_per_vertex) % 32 == 0 2259b8e80941Smrg * 2260b8e80941Smrg * (in other words, when the last 5 bits of vertex_count * 2261b8e80941Smrg * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some 2262b8e80941Smrg * integer n (which is always the case, since bits_per_vertex is 2263b8e80941Smrg * always 1 or 2), this is equivalent to requiring that the last 5-n 2264b8e80941Smrg * bits of vertex_count are 0: 2265b8e80941Smrg * 2266b8e80941Smrg * vertex_count & (2^(5-n) - 1) == 0 2267b8e80941Smrg * 2268b8e80941Smrg * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 2269b8e80941Smrg * equivalent to: 2270b8e80941Smrg * 2271b8e80941Smrg * vertex_count & (32 / bits_per_vertex - 1) == 0 2272b8e80941Smrg * 2273b8e80941Smrg * TODO: If vertex_count is an immediate, we could do some of this math 2274b8e80941Smrg * at compile time... 2275b8e80941Smrg */ 2276b8e80941Smrg fs_inst *inst = 2277b8e80941Smrg abld.AND(bld.null_reg_d(), vertex_count, 2278b8e80941Smrg brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 2279b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_Z; 2280b8e80941Smrg 2281b8e80941Smrg abld.IF(BRW_PREDICATE_NORMAL); 2282b8e80941Smrg /* If vertex_count is 0, then no control data bits have been 2283b8e80941Smrg * accumulated yet, so we can skip emitting them. 2284b8e80941Smrg */ 2285b8e80941Smrg abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 2286b8e80941Smrg BRW_CONDITIONAL_NEQ); 2287b8e80941Smrg abld.IF(BRW_PREDICATE_NORMAL); 2288b8e80941Smrg emit_gs_control_data_bits(vertex_count); 2289b8e80941Smrg abld.emit(BRW_OPCODE_ENDIF); 2290b8e80941Smrg 2291b8e80941Smrg /* Reset control_data_bits to 0 so we can start accumulating a new 2292b8e80941Smrg * batch. 2293b8e80941Smrg * 2294b8e80941Smrg * Note: in the case where vertex_count == 0, this neutralizes the 2295b8e80941Smrg * effect of any call to EndPrimitive() that the shader may have 2296b8e80941Smrg * made before outputting its first vertex. 2297b8e80941Smrg */ 2298b8e80941Smrg inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 2299b8e80941Smrg inst->force_writemask_all = true; 2300b8e80941Smrg abld.emit(BRW_OPCODE_ENDIF); 2301b8e80941Smrg } 2302b8e80941Smrg 2303b8e80941Smrg emit_urb_writes(vertex_count); 2304b8e80941Smrg 2305b8e80941Smrg /* In stream mode we have to set control data bits for all vertices 2306b8e80941Smrg * unless we have disabled control data bits completely (which we do 2307b8e80941Smrg * do for GL_POINTS outputs that don't use streams). 2308b8e80941Smrg */ 2309b8e80941Smrg if (gs_compile->control_data_header_size_bits > 0 && 2310b8e80941Smrg gs_prog_data->control_data_format == 2311b8e80941Smrg GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2312b8e80941Smrg set_gs_stream_control_data_bits(vertex_count, stream_id); 2313b8e80941Smrg } 2314b8e80941Smrg} 2315b8e80941Smrg 2316b8e80941Smrgvoid 2317b8e80941Smrgfs_visitor::emit_gs_input_load(const fs_reg &dst, 2318b8e80941Smrg const nir_src &vertex_src, 2319b8e80941Smrg unsigned base_offset, 2320b8e80941Smrg const nir_src &offset_src, 2321b8e80941Smrg unsigned num_components, 2322b8e80941Smrg unsigned first_component) 2323b8e80941Smrg{ 2324b8e80941Smrg struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2325b8e80941Smrg const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2326b8e80941Smrg 2327b8e80941Smrg /* TODO: figure out push input layout for invocations == 1 */ 2328b8e80941Smrg /* TODO: make this work with 64-bit inputs */ 2329b8e80941Smrg if (gs_prog_data->invocations == 1 && 2330b8e80941Smrg type_sz(dst.type) <= 4 && 2331b8e80941Smrg nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 2332b8e80941Smrg 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { 2333b8e80941Smrg int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + 2334b8e80941Smrg nir_src_as_uint(vertex_src) * push_reg_count; 2335b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 2336b8e80941Smrg bld.MOV(offset(dst, bld, i), 2337b8e80941Smrg fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2338b8e80941Smrg } 2339b8e80941Smrg return; 2340b8e80941Smrg } 2341b8e80941Smrg 2342b8e80941Smrg /* Resort to the pull model. Ensure the VUE handles are provided. */ 2343b8e80941Smrg assert(gs_prog_data->base.include_vue_handles); 2344b8e80941Smrg 2345b8e80941Smrg unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2346b8e80941Smrg fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2347b8e80941Smrg 2348b8e80941Smrg if (gs_prog_data->invocations == 1) { 2349b8e80941Smrg if (nir_src_is_const(vertex_src)) { 2350b8e80941Smrg /* The vertex index is constant; just select the proper URB handle. */ 2351b8e80941Smrg icp_handle = 2352b8e80941Smrg retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0), 2353b8e80941Smrg BRW_REGISTER_TYPE_UD); 2354b8e80941Smrg } else { 2355b8e80941Smrg /* The vertex index is non-constant. We need to use indirect 2356b8e80941Smrg * addressing to fetch the proper URB handle. 2357b8e80941Smrg * 2358b8e80941Smrg * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2359b8e80941Smrg * indicating that channel <n> should read the handle from 2360b8e80941Smrg * DWord <n>. We convert that to bytes by multiplying by 4. 2361b8e80941Smrg * 2362b8e80941Smrg * Next, we convert the vertex index to bytes by multiplying 2363b8e80941Smrg * by 32 (shifting by 5), and add the two together. This is 2364b8e80941Smrg * the final indirect byte offset. 2365b8e80941Smrg */ 2366b8e80941Smrg fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2367b8e80941Smrg fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2368b8e80941Smrg fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2369b8e80941Smrg fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2370b8e80941Smrg 2371b8e80941Smrg /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2372b8e80941Smrg bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2373b8e80941Smrg /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2374b8e80941Smrg bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2375b8e80941Smrg /* Convert vertex_index to bytes (multiply by 32) */ 2376b8e80941Smrg bld.SHL(vertex_offset_bytes, 2377b8e80941Smrg retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2378b8e80941Smrg brw_imm_ud(5u)); 2379b8e80941Smrg bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2380b8e80941Smrg 2381b8e80941Smrg /* Use first_icp_handle as the base offset. There is one register 2382b8e80941Smrg * of URB handles per vertex, so inform the register allocator that 2383b8e80941Smrg * we might read up to nir->info.gs.vertices_in registers. 2384b8e80941Smrg */ 2385b8e80941Smrg bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2386b8e80941Smrg retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2387b8e80941Smrg fs_reg(icp_offset_bytes), 2388b8e80941Smrg brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2389b8e80941Smrg } 2390b8e80941Smrg } else { 2391b8e80941Smrg assert(gs_prog_data->invocations > 1); 2392b8e80941Smrg 2393b8e80941Smrg if (nir_src_is_const(vertex_src)) { 2394b8e80941Smrg unsigned vertex = nir_src_as_uint(vertex_src); 2395b8e80941Smrg assert(devinfo->gen >= 9 || vertex <= 5); 2396b8e80941Smrg bld.MOV(icp_handle, 2397b8e80941Smrg retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8), 2398b8e80941Smrg BRW_REGISTER_TYPE_UD)); 2399b8e80941Smrg } else { 2400b8e80941Smrg /* The vertex index is non-constant. We need to use indirect 2401b8e80941Smrg * addressing to fetch the proper URB handle. 2402b8e80941Smrg * 2403b8e80941Smrg */ 2404b8e80941Smrg fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2405b8e80941Smrg 2406b8e80941Smrg /* Convert vertex_index to bytes (multiply by 4) */ 2407b8e80941Smrg bld.SHL(icp_offset_bytes, 2408b8e80941Smrg retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2409b8e80941Smrg brw_imm_ud(2u)); 2410b8e80941Smrg 2411b8e80941Smrg /* Use first_icp_handle as the base offset. There is one DWord 2412b8e80941Smrg * of URB handles per vertex, so inform the register allocator that 2413b8e80941Smrg * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2414b8e80941Smrg */ 2415b8e80941Smrg bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2416b8e80941Smrg retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2417b8e80941Smrg fs_reg(icp_offset_bytes), 2418b8e80941Smrg brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2419b8e80941Smrg REG_SIZE)); 2420b8e80941Smrg } 2421b8e80941Smrg } 2422b8e80941Smrg 2423b8e80941Smrg fs_inst *inst; 2424b8e80941Smrg 2425b8e80941Smrg fs_reg tmp_dst = dst; 2426b8e80941Smrg fs_reg indirect_offset = get_nir_src(offset_src); 2427b8e80941Smrg unsigned num_iterations = 1; 2428b8e80941Smrg unsigned orig_num_components = num_components; 2429b8e80941Smrg 2430b8e80941Smrg if (type_sz(dst.type) == 8) { 2431b8e80941Smrg if (num_components > 2) { 2432b8e80941Smrg num_iterations = 2; 2433b8e80941Smrg num_components = 2; 2434b8e80941Smrg } 2435b8e80941Smrg fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2436b8e80941Smrg tmp_dst = tmp; 2437b8e80941Smrg first_component = first_component / 2; 2438b8e80941Smrg } 2439b8e80941Smrg 2440b8e80941Smrg for (unsigned iter = 0; iter < num_iterations; iter++) { 2441b8e80941Smrg if (nir_src_is_const(offset_src)) { 2442b8e80941Smrg /* Constant indexing - use global offset. */ 2443b8e80941Smrg if (first_component != 0) { 2444b8e80941Smrg unsigned read_components = num_components + first_component; 2445b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2446b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2447b8e80941Smrg inst->size_written = read_components * 2448b8e80941Smrg tmp.component_size(inst->exec_size); 2449b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 2450b8e80941Smrg bld.MOV(offset(tmp_dst, bld, i), 2451b8e80941Smrg offset(tmp, bld, i + first_component)); 2452b8e80941Smrg } 2453b8e80941Smrg } else { 2454b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, 2455b8e80941Smrg icp_handle); 2456b8e80941Smrg inst->size_written = num_components * 2457b8e80941Smrg tmp_dst.component_size(inst->exec_size); 2458b8e80941Smrg } 2459b8e80941Smrg inst->offset = base_offset + nir_src_as_uint(offset_src); 2460b8e80941Smrg inst->mlen = 1; 2461b8e80941Smrg } else { 2462b8e80941Smrg /* Indirect indexing - use per-slot offsets as well. */ 2463b8e80941Smrg const fs_reg srcs[] = { icp_handle, indirect_offset }; 2464b8e80941Smrg unsigned read_components = num_components + first_component; 2465b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2466b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2467b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2468b8e80941Smrg if (first_component != 0) { 2469b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2470b8e80941Smrg payload); 2471b8e80941Smrg inst->size_written = read_components * 2472b8e80941Smrg tmp.component_size(inst->exec_size); 2473b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 2474b8e80941Smrg bld.MOV(offset(tmp_dst, bld, i), 2475b8e80941Smrg offset(tmp, bld, i + first_component)); 2476b8e80941Smrg } 2477b8e80941Smrg } else { 2478b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, 2479b8e80941Smrg payload); 2480b8e80941Smrg inst->size_written = num_components * 2481b8e80941Smrg tmp_dst.component_size(inst->exec_size); 2482b8e80941Smrg } 2483b8e80941Smrg inst->offset = base_offset; 2484b8e80941Smrg inst->mlen = 2; 2485b8e80941Smrg } 2486b8e80941Smrg 2487b8e80941Smrg if (type_sz(dst.type) == 8) { 2488b8e80941Smrg shuffle_from_32bit_read(bld, 2489b8e80941Smrg offset(dst, bld, iter * 2), 2490b8e80941Smrg retype(tmp_dst, BRW_REGISTER_TYPE_D), 2491b8e80941Smrg 0, 2492b8e80941Smrg num_components); 2493b8e80941Smrg } 2494b8e80941Smrg 2495b8e80941Smrg if (num_iterations > 1) { 2496b8e80941Smrg num_components = orig_num_components - 2; 2497b8e80941Smrg if(nir_src_is_const(offset_src)) { 2498b8e80941Smrg base_offset++; 2499b8e80941Smrg } else { 2500b8e80941Smrg fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2501b8e80941Smrg bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); 2502b8e80941Smrg indirect_offset = new_indirect; 2503b8e80941Smrg } 2504b8e80941Smrg } 2505b8e80941Smrg } 2506b8e80941Smrg} 2507b8e80941Smrg 2508b8e80941Smrgfs_reg 2509b8e80941Smrgfs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2510b8e80941Smrg{ 2511b8e80941Smrg nir_src *offset_src = nir_get_io_offset_src(instr); 2512b8e80941Smrg 2513b8e80941Smrg if (nir_src_is_const(*offset_src)) { 2514b8e80941Smrg /* The only constant offset we should find is 0. brw_nir.c's 2515b8e80941Smrg * add_const_offset_to_base() will fold other constant offsets 2516b8e80941Smrg * into instr->const_index[0]. 2517b8e80941Smrg */ 2518b8e80941Smrg assert(nir_src_as_uint(*offset_src) == 0); 2519b8e80941Smrg return fs_reg(); 2520b8e80941Smrg } 2521b8e80941Smrg 2522b8e80941Smrg return get_nir_src(*offset_src); 2523b8e80941Smrg} 2524b8e80941Smrg 2525b8e80941Smrgvoid 2526b8e80941Smrgfs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2527b8e80941Smrg nir_intrinsic_instr *instr) 2528b8e80941Smrg{ 2529b8e80941Smrg assert(stage == MESA_SHADER_VERTEX); 2530b8e80941Smrg 2531b8e80941Smrg fs_reg dest; 2532b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2533b8e80941Smrg dest = get_nir_dest(instr->dest); 2534b8e80941Smrg 2535b8e80941Smrg switch (instr->intrinsic) { 2536b8e80941Smrg case nir_intrinsic_load_vertex_id: 2537b8e80941Smrg case nir_intrinsic_load_base_vertex: 2538b8e80941Smrg unreachable("should be lowered by nir_lower_system_values()"); 2539b8e80941Smrg 2540b8e80941Smrg case nir_intrinsic_load_input: { 2541b8e80941Smrg fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2542b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 2543b8e80941Smrg unsigned num_components = instr->num_components; 2544b8e80941Smrg 2545b8e80941Smrg src = offset(src, bld, nir_src_as_uint(instr->src[0])); 2546b8e80941Smrg 2547b8e80941Smrg if (type_sz(dest.type) == 8) 2548b8e80941Smrg first_component /= 2; 2549b8e80941Smrg 2550b8e80941Smrg /* For 16-bit support maybe a temporary will be needed to copy from 2551b8e80941Smrg * the ATTR file. 2552b8e80941Smrg */ 2553b8e80941Smrg shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D), 2554b8e80941Smrg first_component, num_components); 2555b8e80941Smrg break; 2556b8e80941Smrg } 2557b8e80941Smrg 2558b8e80941Smrg case nir_intrinsic_load_vertex_id_zero_base: 2559b8e80941Smrg case nir_intrinsic_load_instance_id: 2560b8e80941Smrg case nir_intrinsic_load_base_instance: 2561b8e80941Smrg case nir_intrinsic_load_draw_id: 2562b8e80941Smrg case nir_intrinsic_load_first_vertex: 2563b8e80941Smrg case nir_intrinsic_load_is_indexed_draw: 2564b8e80941Smrg unreachable("lowered by brw_nir_lower_vs_inputs"); 2565b8e80941Smrg 2566b8e80941Smrg default: 2567b8e80941Smrg nir_emit_intrinsic(bld, instr); 2568b8e80941Smrg break; 2569b8e80941Smrg } 2570b8e80941Smrg} 2571b8e80941Smrg 2572b8e80941Smrgvoid 2573b8e80941Smrgfs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2574b8e80941Smrg nir_intrinsic_instr *instr) 2575b8e80941Smrg{ 2576b8e80941Smrg assert(stage == MESA_SHADER_TESS_CTRL); 2577b8e80941Smrg struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2578b8e80941Smrg struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2579b8e80941Smrg 2580b8e80941Smrg fs_reg dst; 2581b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2582b8e80941Smrg dst = get_nir_dest(instr->dest); 2583b8e80941Smrg 2584b8e80941Smrg switch (instr->intrinsic) { 2585b8e80941Smrg case nir_intrinsic_load_primitive_id: 2586b8e80941Smrg bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); 2587b8e80941Smrg break; 2588b8e80941Smrg case nir_intrinsic_load_invocation_id: 2589b8e80941Smrg bld.MOV(retype(dst, invocation_id.type), invocation_id); 2590b8e80941Smrg break; 2591b8e80941Smrg case nir_intrinsic_load_patch_vertices_in: 2592b8e80941Smrg bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2593b8e80941Smrg brw_imm_d(tcs_key->input_vertices)); 2594b8e80941Smrg break; 2595b8e80941Smrg 2596b8e80941Smrg case nir_intrinsic_barrier: { 2597b8e80941Smrg if (tcs_prog_data->instances == 1) 2598b8e80941Smrg break; 2599b8e80941Smrg 2600b8e80941Smrg fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2601b8e80941Smrg fs_reg m0_2 = component(m0, 2); 2602b8e80941Smrg 2603b8e80941Smrg const fs_builder chanbld = bld.exec_all().group(1, 0); 2604b8e80941Smrg 2605b8e80941Smrg /* Zero the message header */ 2606b8e80941Smrg bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2607b8e80941Smrg 2608b8e80941Smrg if (devinfo->gen < 11) { 2609b8e80941Smrg /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2610b8e80941Smrg chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2611b8e80941Smrg brw_imm_ud(INTEL_MASK(16, 13))); 2612b8e80941Smrg 2613b8e80941Smrg /* Shift it up to bits 27:24. */ 2614b8e80941Smrg chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2615b8e80941Smrg } else { 2616b8e80941Smrg chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2617b8e80941Smrg brw_imm_ud(INTEL_MASK(30, 24))); 2618b8e80941Smrg } 2619b8e80941Smrg 2620b8e80941Smrg /* Set the Barrier Count and the enable bit */ 2621b8e80941Smrg if (devinfo->gen < 11) { 2622b8e80941Smrg chanbld.OR(m0_2, m0_2, 2623b8e80941Smrg brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2624b8e80941Smrg } else { 2625b8e80941Smrg chanbld.OR(m0_2, m0_2, 2626b8e80941Smrg brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); 2627b8e80941Smrg } 2628b8e80941Smrg 2629b8e80941Smrg bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2630b8e80941Smrg break; 2631b8e80941Smrg } 2632b8e80941Smrg 2633b8e80941Smrg case nir_intrinsic_load_input: 2634b8e80941Smrg unreachable("nir_lower_io should never give us these."); 2635b8e80941Smrg break; 2636b8e80941Smrg 2637b8e80941Smrg case nir_intrinsic_load_per_vertex_input: { 2638b8e80941Smrg fs_reg indirect_offset = get_indirect_offset(instr); 2639b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 2640b8e80941Smrg 2641b8e80941Smrg const nir_src &vertex_src = instr->src[0]; 2642b8e80941Smrg 2643b8e80941Smrg fs_inst *inst; 2644b8e80941Smrg 2645b8e80941Smrg fs_reg icp_handle; 2646b8e80941Smrg 2647b8e80941Smrg if (nir_src_is_const(vertex_src)) { 2648b8e80941Smrg /* Emit a MOV to resolve <0,1,0> regioning. */ 2649b8e80941Smrg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2650b8e80941Smrg unsigned vertex = nir_src_as_uint(vertex_src); 2651b8e80941Smrg bld.MOV(icp_handle, 2652b8e80941Smrg retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), 2653b8e80941Smrg BRW_REGISTER_TYPE_UD)); 2654b8e80941Smrg } else if (tcs_prog_data->instances == 1 && 2655b8e80941Smrg nir_src_as_intrinsic(vertex_src) != NULL && 2656b8e80941Smrg nir_src_as_intrinsic(vertex_src)->intrinsic == nir_intrinsic_load_invocation_id) { 2657b8e80941Smrg /* For the common case of only 1 instance, an array index of 2658b8e80941Smrg * gl_InvocationID means reading g1. Skip all the indirect work. 2659b8e80941Smrg */ 2660b8e80941Smrg icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2661b8e80941Smrg } else { 2662b8e80941Smrg /* The vertex index is non-constant. We need to use indirect 2663b8e80941Smrg * addressing to fetch the proper URB handle. 2664b8e80941Smrg */ 2665b8e80941Smrg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2666b8e80941Smrg 2667b8e80941Smrg /* Each ICP handle is a single DWord (4 bytes) */ 2668b8e80941Smrg fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2669b8e80941Smrg bld.SHL(vertex_offset_bytes, 2670b8e80941Smrg retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2671b8e80941Smrg brw_imm_ud(2u)); 2672b8e80941Smrg 2673b8e80941Smrg /* Start at g1. We might read up to 4 registers. */ 2674b8e80941Smrg bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2675b8e80941Smrg retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2676b8e80941Smrg brw_imm_ud(4 * REG_SIZE)); 2677b8e80941Smrg } 2678b8e80941Smrg 2679b8e80941Smrg /* We can only read two double components with each URB read, so 2680b8e80941Smrg * we send two read messages in that case, each one loading up to 2681b8e80941Smrg * two double components. 2682b8e80941Smrg */ 2683b8e80941Smrg unsigned num_iterations = 1; 2684b8e80941Smrg unsigned num_components = instr->num_components; 2685b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 2686b8e80941Smrg fs_reg orig_dst = dst; 2687b8e80941Smrg if (type_sz(dst.type) == 8) { 2688b8e80941Smrg first_component = first_component / 2; 2689b8e80941Smrg if (instr->num_components > 2) { 2690b8e80941Smrg num_iterations = 2; 2691b8e80941Smrg num_components = 2; 2692b8e80941Smrg } 2693b8e80941Smrg 2694b8e80941Smrg fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2695b8e80941Smrg dst = tmp; 2696b8e80941Smrg } 2697b8e80941Smrg 2698b8e80941Smrg for (unsigned iter = 0; iter < num_iterations; iter++) { 2699b8e80941Smrg if (indirect_offset.file == BAD_FILE) { 2700b8e80941Smrg /* Constant indexing - use global offset. */ 2701b8e80941Smrg if (first_component != 0) { 2702b8e80941Smrg unsigned read_components = num_components + first_component; 2703b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2704b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2705b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 2706b8e80941Smrg bld.MOV(offset(dst, bld, i), 2707b8e80941Smrg offset(tmp, bld, i + first_component)); 2708b8e80941Smrg } 2709b8e80941Smrg } else { 2710b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2711b8e80941Smrg } 2712b8e80941Smrg inst->offset = imm_offset; 2713b8e80941Smrg inst->mlen = 1; 2714b8e80941Smrg } else { 2715b8e80941Smrg /* Indirect indexing - use per-slot offsets as well. */ 2716b8e80941Smrg const fs_reg srcs[] = { icp_handle, indirect_offset }; 2717b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2718b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2719b8e80941Smrg if (first_component != 0) { 2720b8e80941Smrg unsigned read_components = num_components + first_component; 2721b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2722b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2723b8e80941Smrg payload); 2724b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 2725b8e80941Smrg bld.MOV(offset(dst, bld, i), 2726b8e80941Smrg offset(tmp, bld, i + first_component)); 2727b8e80941Smrg } 2728b8e80941Smrg } else { 2729b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2730b8e80941Smrg payload); 2731b8e80941Smrg } 2732b8e80941Smrg inst->offset = imm_offset; 2733b8e80941Smrg inst->mlen = 2; 2734b8e80941Smrg } 2735b8e80941Smrg inst->size_written = (num_components + first_component) * 2736b8e80941Smrg inst->dst.component_size(inst->exec_size); 2737b8e80941Smrg 2738b8e80941Smrg /* If we are reading 64-bit data using 32-bit read messages we need 2739b8e80941Smrg * build proper 64-bit data elements by shuffling the low and high 2740b8e80941Smrg * 32-bit components around like we do for other things like UBOs 2741b8e80941Smrg * or SSBOs. 2742b8e80941Smrg */ 2743b8e80941Smrg if (type_sz(dst.type) == 8) { 2744b8e80941Smrg shuffle_from_32bit_read(bld, 2745b8e80941Smrg offset(orig_dst, bld, iter * 2), 2746b8e80941Smrg retype(dst, BRW_REGISTER_TYPE_D), 2747b8e80941Smrg 0, num_components); 2748b8e80941Smrg } 2749b8e80941Smrg 2750b8e80941Smrg /* Copy the temporary to the destination to deal with writemasking. 2751b8e80941Smrg * 2752b8e80941Smrg * Also attempt to deal with gl_PointSize being in the .w component. 2753b8e80941Smrg */ 2754b8e80941Smrg if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2755b8e80941Smrg assert(type_sz(dst.type) < 8); 2756b8e80941Smrg inst->dst = bld.vgrf(dst.type, 4); 2757b8e80941Smrg inst->size_written = 4 * REG_SIZE; 2758b8e80941Smrg bld.MOV(dst, offset(inst->dst, bld, 3)); 2759b8e80941Smrg } 2760b8e80941Smrg 2761b8e80941Smrg /* If we are loading double data and we need a second read message 2762b8e80941Smrg * adjust the write offset 2763b8e80941Smrg */ 2764b8e80941Smrg if (num_iterations > 1) { 2765b8e80941Smrg num_components = instr->num_components - 2; 2766b8e80941Smrg imm_offset++; 2767b8e80941Smrg } 2768b8e80941Smrg } 2769b8e80941Smrg break; 2770b8e80941Smrg } 2771b8e80941Smrg 2772b8e80941Smrg case nir_intrinsic_load_output: 2773b8e80941Smrg case nir_intrinsic_load_per_vertex_output: { 2774b8e80941Smrg fs_reg indirect_offset = get_indirect_offset(instr); 2775b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 2776b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 2777b8e80941Smrg 2778b8e80941Smrg fs_inst *inst; 2779b8e80941Smrg if (indirect_offset.file == BAD_FILE) { 2780b8e80941Smrg /* Replicate the patch handle to all enabled channels */ 2781b8e80941Smrg fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2782b8e80941Smrg bld.MOV(patch_handle, 2783b8e80941Smrg retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2784b8e80941Smrg 2785b8e80941Smrg { 2786b8e80941Smrg if (first_component != 0) { 2787b8e80941Smrg unsigned read_components = 2788b8e80941Smrg instr->num_components + first_component; 2789b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2790b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2791b8e80941Smrg patch_handle); 2792b8e80941Smrg inst->size_written = read_components * REG_SIZE; 2793b8e80941Smrg for (unsigned i = 0; i < instr->num_components; i++) { 2794b8e80941Smrg bld.MOV(offset(dst, bld, i), 2795b8e80941Smrg offset(tmp, bld, i + first_component)); 2796b8e80941Smrg } 2797b8e80941Smrg } else { 2798b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 2799b8e80941Smrg patch_handle); 2800b8e80941Smrg inst->size_written = instr->num_components * REG_SIZE; 2801b8e80941Smrg } 2802b8e80941Smrg inst->offset = imm_offset; 2803b8e80941Smrg inst->mlen = 1; 2804b8e80941Smrg } 2805b8e80941Smrg } else { 2806b8e80941Smrg /* Indirect indexing - use per-slot offsets as well. */ 2807b8e80941Smrg const fs_reg srcs[] = { 2808b8e80941Smrg retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2809b8e80941Smrg indirect_offset 2810b8e80941Smrg }; 2811b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2812b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2813b8e80941Smrg if (first_component != 0) { 2814b8e80941Smrg unsigned read_components = 2815b8e80941Smrg instr->num_components + first_component; 2816b8e80941Smrg fs_reg tmp = bld.vgrf(dst.type, read_components); 2817b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2818b8e80941Smrg payload); 2819b8e80941Smrg inst->size_written = read_components * REG_SIZE; 2820b8e80941Smrg for (unsigned i = 0; i < instr->num_components; i++) { 2821b8e80941Smrg bld.MOV(offset(dst, bld, i), 2822b8e80941Smrg offset(tmp, bld, i + first_component)); 2823b8e80941Smrg } 2824b8e80941Smrg } else { 2825b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2826b8e80941Smrg payload); 2827b8e80941Smrg inst->size_written = instr->num_components * REG_SIZE; 2828b8e80941Smrg } 2829b8e80941Smrg inst->offset = imm_offset; 2830b8e80941Smrg inst->mlen = 2; 2831b8e80941Smrg } 2832b8e80941Smrg break; 2833b8e80941Smrg } 2834b8e80941Smrg 2835b8e80941Smrg case nir_intrinsic_store_output: 2836b8e80941Smrg case nir_intrinsic_store_per_vertex_output: { 2837b8e80941Smrg fs_reg value = get_nir_src(instr->src[0]); 2838b8e80941Smrg bool is_64bit = (instr->src[0].is_ssa ? 2839b8e80941Smrg instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; 2840b8e80941Smrg fs_reg indirect_offset = get_indirect_offset(instr); 2841b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 2842b8e80941Smrg unsigned mask = instr->const_index[1]; 2843b8e80941Smrg unsigned header_regs = 0; 2844b8e80941Smrg fs_reg srcs[7]; 2845b8e80941Smrg srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2846b8e80941Smrg 2847b8e80941Smrg if (indirect_offset.file != BAD_FILE) { 2848b8e80941Smrg srcs[header_regs++] = indirect_offset; 2849b8e80941Smrg } 2850b8e80941Smrg 2851b8e80941Smrg if (mask == 0) 2852b8e80941Smrg break; 2853b8e80941Smrg 2854b8e80941Smrg unsigned num_components = util_last_bit(mask); 2855b8e80941Smrg enum opcode opcode; 2856b8e80941Smrg 2857b8e80941Smrg /* We can only pack two 64-bit components in a single message, so send 2858b8e80941Smrg * 2 messages if we have more components 2859b8e80941Smrg */ 2860b8e80941Smrg unsigned num_iterations = 1; 2861b8e80941Smrg unsigned iter_components = num_components; 2862b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 2863b8e80941Smrg if (is_64bit) { 2864b8e80941Smrg first_component = first_component / 2; 2865b8e80941Smrg if (instr->num_components > 2) { 2866b8e80941Smrg num_iterations = 2; 2867b8e80941Smrg iter_components = 2; 2868b8e80941Smrg } 2869b8e80941Smrg } 2870b8e80941Smrg 2871b8e80941Smrg mask = mask << first_component; 2872b8e80941Smrg 2873b8e80941Smrg for (unsigned iter = 0; iter < num_iterations; iter++) { 2874b8e80941Smrg if (!is_64bit && mask != WRITEMASK_XYZW) { 2875b8e80941Smrg srcs[header_regs++] = brw_imm_ud(mask << 16); 2876b8e80941Smrg opcode = indirect_offset.file != BAD_FILE ? 2877b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2878b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2879b8e80941Smrg } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { 2880b8e80941Smrg /* Expand the 64-bit mask to 32-bit channels. We only handle 2881b8e80941Smrg * two channels in each iteration, so we only care about X/Y. 2882b8e80941Smrg */ 2883b8e80941Smrg unsigned mask32 = 0; 2884b8e80941Smrg if (mask & WRITEMASK_X) 2885b8e80941Smrg mask32 |= WRITEMASK_XY; 2886b8e80941Smrg if (mask & WRITEMASK_Y) 2887b8e80941Smrg mask32 |= WRITEMASK_ZW; 2888b8e80941Smrg 2889b8e80941Smrg /* If the mask does not include any of the channels X or Y there 2890b8e80941Smrg * is nothing to do in this iteration. Move on to the next couple 2891b8e80941Smrg * of 64-bit channels. 2892b8e80941Smrg */ 2893b8e80941Smrg if (!mask32) { 2894b8e80941Smrg mask >>= 2; 2895b8e80941Smrg imm_offset++; 2896b8e80941Smrg continue; 2897b8e80941Smrg } 2898b8e80941Smrg 2899b8e80941Smrg srcs[header_regs++] = brw_imm_ud(mask32 << 16); 2900b8e80941Smrg opcode = indirect_offset.file != BAD_FILE ? 2901b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2902b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2903b8e80941Smrg } else { 2904b8e80941Smrg opcode = indirect_offset.file != BAD_FILE ? 2905b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : 2906b8e80941Smrg SHADER_OPCODE_URB_WRITE_SIMD8; 2907b8e80941Smrg } 2908b8e80941Smrg 2909b8e80941Smrg for (unsigned i = 0; i < iter_components; i++) { 2910b8e80941Smrg if (!(mask & (1 << (i + first_component)))) 2911b8e80941Smrg continue; 2912b8e80941Smrg 2913b8e80941Smrg if (!is_64bit) { 2914b8e80941Smrg srcs[header_regs + i + first_component] = offset(value, bld, i); 2915b8e80941Smrg } else { 2916b8e80941Smrg /* We need to shuffle the 64-bit data to match the layout 2917b8e80941Smrg * expected by our 32-bit URB write messages. We use a temporary 2918b8e80941Smrg * for that. 2919b8e80941Smrg */ 2920b8e80941Smrg unsigned channel = iter * 2 + i; 2921b8e80941Smrg fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1); 2922b8e80941Smrg 2923b8e80941Smrg srcs[header_regs + (i + first_component) * 2] = dest; 2924b8e80941Smrg srcs[header_regs + (i + first_component) * 2 + 1] = 2925b8e80941Smrg offset(dest, bld, 1); 2926b8e80941Smrg } 2927b8e80941Smrg } 2928b8e80941Smrg 2929b8e80941Smrg unsigned mlen = 2930b8e80941Smrg header_regs + (is_64bit ? 2 * iter_components : iter_components) + 2931b8e80941Smrg (is_64bit ? 2 * first_component : first_component); 2932b8e80941Smrg fs_reg payload = 2933b8e80941Smrg bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2934b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); 2935b8e80941Smrg 2936b8e80941Smrg fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); 2937b8e80941Smrg inst->offset = imm_offset; 2938b8e80941Smrg inst->mlen = mlen; 2939b8e80941Smrg 2940b8e80941Smrg /* If this is a 64-bit attribute, select the next two 64-bit channels 2941b8e80941Smrg * to be handled in the next iteration. 2942b8e80941Smrg */ 2943b8e80941Smrg if (is_64bit) { 2944b8e80941Smrg mask >>= 2; 2945b8e80941Smrg imm_offset++; 2946b8e80941Smrg } 2947b8e80941Smrg } 2948b8e80941Smrg break; 2949b8e80941Smrg } 2950b8e80941Smrg 2951b8e80941Smrg default: 2952b8e80941Smrg nir_emit_intrinsic(bld, instr); 2953b8e80941Smrg break; 2954b8e80941Smrg } 2955b8e80941Smrg} 2956b8e80941Smrg 2957b8e80941Smrgvoid 2958b8e80941Smrgfs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 2959b8e80941Smrg nir_intrinsic_instr *instr) 2960b8e80941Smrg{ 2961b8e80941Smrg assert(stage == MESA_SHADER_TESS_EVAL); 2962b8e80941Smrg struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 2963b8e80941Smrg 2964b8e80941Smrg fs_reg dest; 2965b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2966b8e80941Smrg dest = get_nir_dest(instr->dest); 2967b8e80941Smrg 2968b8e80941Smrg switch (instr->intrinsic) { 2969b8e80941Smrg case nir_intrinsic_load_primitive_id: 2970b8e80941Smrg bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 2971b8e80941Smrg break; 2972b8e80941Smrg case nir_intrinsic_load_tess_coord: 2973b8e80941Smrg /* gl_TessCoord is part of the payload in g1-3 */ 2974b8e80941Smrg for (unsigned i = 0; i < 3; i++) { 2975b8e80941Smrg bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 2976b8e80941Smrg } 2977b8e80941Smrg break; 2978b8e80941Smrg 2979b8e80941Smrg case nir_intrinsic_load_input: 2980b8e80941Smrg case nir_intrinsic_load_per_vertex_input: { 2981b8e80941Smrg fs_reg indirect_offset = get_indirect_offset(instr); 2982b8e80941Smrg unsigned imm_offset = instr->const_index[0]; 2983b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 2984b8e80941Smrg 2985b8e80941Smrg if (type_sz(dest.type) == 8) { 2986b8e80941Smrg first_component = first_component / 2; 2987b8e80941Smrg } 2988b8e80941Smrg 2989b8e80941Smrg fs_inst *inst; 2990b8e80941Smrg if (indirect_offset.file == BAD_FILE) { 2991b8e80941Smrg /* Arbitrarily only push up to 32 vec4 slots worth of data, 2992b8e80941Smrg * which is 16 registers (since each holds 2 vec4 slots). 2993b8e80941Smrg */ 2994b8e80941Smrg unsigned slot_count = 1; 2995b8e80941Smrg if (type_sz(dest.type) == 8 && instr->num_components > 2) 2996b8e80941Smrg slot_count++; 2997b8e80941Smrg 2998b8e80941Smrg const unsigned max_push_slots = 32; 2999b8e80941Smrg if (imm_offset + slot_count <= max_push_slots) { 3000b8e80941Smrg fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 3001b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 3002b8e80941Smrg unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + 3003b8e80941Smrg i + first_component; 3004b8e80941Smrg bld.MOV(offset(dest, bld, i), component(src, comp)); 3005b8e80941Smrg } 3006b8e80941Smrg 3007b8e80941Smrg tes_prog_data->base.urb_read_length = 3008b8e80941Smrg MAX2(tes_prog_data->base.urb_read_length, 3009b8e80941Smrg DIV_ROUND_UP(imm_offset + slot_count, 2)); 3010b8e80941Smrg } else { 3011b8e80941Smrg /* Replicate the patch handle to all enabled channels */ 3012b8e80941Smrg const fs_reg srcs[] = { 3013b8e80941Smrg retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) 3014b8e80941Smrg }; 3015b8e80941Smrg fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 3016b8e80941Smrg bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); 3017b8e80941Smrg 3018b8e80941Smrg if (first_component != 0) { 3019b8e80941Smrg unsigned read_components = 3020b8e80941Smrg instr->num_components + first_component; 3021b8e80941Smrg fs_reg tmp = bld.vgrf(dest.type, read_components); 3022b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 3023b8e80941Smrg patch_handle); 3024b8e80941Smrg inst->size_written = read_components * REG_SIZE; 3025b8e80941Smrg for (unsigned i = 0; i < instr->num_components; i++) { 3026b8e80941Smrg bld.MOV(offset(dest, bld, i), 3027b8e80941Smrg offset(tmp, bld, i + first_component)); 3028b8e80941Smrg } 3029b8e80941Smrg } else { 3030b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, 3031b8e80941Smrg patch_handle); 3032b8e80941Smrg inst->size_written = instr->num_components * REG_SIZE; 3033b8e80941Smrg } 3034b8e80941Smrg inst->mlen = 1; 3035b8e80941Smrg inst->offset = imm_offset; 3036b8e80941Smrg } 3037b8e80941Smrg } else { 3038b8e80941Smrg /* Indirect indexing - use per-slot offsets as well. */ 3039b8e80941Smrg 3040b8e80941Smrg /* We can only read two double components with each URB read, so 3041b8e80941Smrg * we send two read messages in that case, each one loading up to 3042b8e80941Smrg * two double components. 3043b8e80941Smrg */ 3044b8e80941Smrg unsigned num_iterations = 1; 3045b8e80941Smrg unsigned num_components = instr->num_components; 3046b8e80941Smrg fs_reg orig_dest = dest; 3047b8e80941Smrg if (type_sz(dest.type) == 8) { 3048b8e80941Smrg if (instr->num_components > 2) { 3049b8e80941Smrg num_iterations = 2; 3050b8e80941Smrg num_components = 2; 3051b8e80941Smrg } 3052b8e80941Smrg fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); 3053b8e80941Smrg dest = tmp; 3054b8e80941Smrg } 3055b8e80941Smrg 3056b8e80941Smrg for (unsigned iter = 0; iter < num_iterations; iter++) { 3057b8e80941Smrg const fs_reg srcs[] = { 3058b8e80941Smrg retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 3059b8e80941Smrg indirect_offset 3060b8e80941Smrg }; 3061b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 3062b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 3063b8e80941Smrg 3064b8e80941Smrg if (first_component != 0) { 3065b8e80941Smrg unsigned read_components = 3066b8e80941Smrg num_components + first_component; 3067b8e80941Smrg fs_reg tmp = bld.vgrf(dest.type, read_components); 3068b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 3069b8e80941Smrg payload); 3070b8e80941Smrg for (unsigned i = 0; i < num_components; i++) { 3071b8e80941Smrg bld.MOV(offset(dest, bld, i), 3072b8e80941Smrg offset(tmp, bld, i + first_component)); 3073b8e80941Smrg } 3074b8e80941Smrg } else { 3075b8e80941Smrg inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, 3076b8e80941Smrg payload); 3077b8e80941Smrg } 3078b8e80941Smrg inst->mlen = 2; 3079b8e80941Smrg inst->offset = imm_offset; 3080b8e80941Smrg inst->size_written = (num_components + first_component) * 3081b8e80941Smrg inst->dst.component_size(inst->exec_size); 3082b8e80941Smrg 3083b8e80941Smrg /* If we are reading 64-bit data using 32-bit read messages we need 3084b8e80941Smrg * build proper 64-bit data elements by shuffling the low and high 3085b8e80941Smrg * 32-bit components around like we do for other things like UBOs 3086b8e80941Smrg * or SSBOs. 3087b8e80941Smrg */ 3088b8e80941Smrg if (type_sz(dest.type) == 8) { 3089b8e80941Smrg shuffle_from_32bit_read(bld, 3090b8e80941Smrg offset(orig_dest, bld, iter * 2), 3091b8e80941Smrg retype(dest, BRW_REGISTER_TYPE_D), 3092b8e80941Smrg 0, num_components); 3093b8e80941Smrg } 3094b8e80941Smrg 3095b8e80941Smrg /* If we are loading double data and we need a second read message 3096b8e80941Smrg * adjust the offset 3097b8e80941Smrg */ 3098b8e80941Smrg if (num_iterations > 1) { 3099b8e80941Smrg num_components = instr->num_components - 2; 3100b8e80941Smrg imm_offset++; 3101b8e80941Smrg } 3102b8e80941Smrg } 3103b8e80941Smrg } 3104b8e80941Smrg break; 3105b8e80941Smrg } 3106b8e80941Smrg default: 3107b8e80941Smrg nir_emit_intrinsic(bld, instr); 3108b8e80941Smrg break; 3109b8e80941Smrg } 3110b8e80941Smrg} 3111b8e80941Smrg 3112b8e80941Smrgvoid 3113b8e80941Smrgfs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 3114b8e80941Smrg nir_intrinsic_instr *instr) 3115b8e80941Smrg{ 3116b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 3117b8e80941Smrg fs_reg indirect_offset; 3118b8e80941Smrg 3119b8e80941Smrg fs_reg dest; 3120b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3121b8e80941Smrg dest = get_nir_dest(instr->dest); 3122b8e80941Smrg 3123b8e80941Smrg switch (instr->intrinsic) { 3124b8e80941Smrg case nir_intrinsic_load_primitive_id: 3125b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 3126b8e80941Smrg assert(brw_gs_prog_data(prog_data)->include_primitive_id); 3127b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 3128b8e80941Smrg retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 3129b8e80941Smrg break; 3130b8e80941Smrg 3131b8e80941Smrg case nir_intrinsic_load_input: 3132b8e80941Smrg unreachable("load_input intrinsics are invalid for the GS stage"); 3133b8e80941Smrg 3134b8e80941Smrg case nir_intrinsic_load_per_vertex_input: 3135b8e80941Smrg emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 3136b8e80941Smrg instr->src[1], instr->num_components, 3137b8e80941Smrg nir_intrinsic_component(instr)); 3138b8e80941Smrg break; 3139b8e80941Smrg 3140b8e80941Smrg case nir_intrinsic_emit_vertex_with_counter: 3141b8e80941Smrg emit_gs_vertex(instr->src[0], instr->const_index[0]); 3142b8e80941Smrg break; 3143b8e80941Smrg 3144b8e80941Smrg case nir_intrinsic_end_primitive_with_counter: 3145b8e80941Smrg emit_gs_end_primitive(instr->src[0]); 3146b8e80941Smrg break; 3147b8e80941Smrg 3148b8e80941Smrg case nir_intrinsic_set_vertex_count: 3149b8e80941Smrg bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 3150b8e80941Smrg break; 3151b8e80941Smrg 3152b8e80941Smrg case nir_intrinsic_load_invocation_id: { 3153b8e80941Smrg fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 3154b8e80941Smrg assert(val.file != BAD_FILE); 3155b8e80941Smrg dest.type = val.type; 3156b8e80941Smrg bld.MOV(dest, val); 3157b8e80941Smrg break; 3158b8e80941Smrg } 3159b8e80941Smrg 3160b8e80941Smrg default: 3161b8e80941Smrg nir_emit_intrinsic(bld, instr); 3162b8e80941Smrg break; 3163b8e80941Smrg } 3164b8e80941Smrg} 3165b8e80941Smrg 3166b8e80941Smrg/** 3167b8e80941Smrg * Fetch the current render target layer index. 3168b8e80941Smrg */ 3169b8e80941Smrgstatic fs_reg 3170b8e80941Smrgfetch_render_target_array_index(const fs_builder &bld) 3171b8e80941Smrg{ 3172b8e80941Smrg if (bld.shader->devinfo->gen >= 6) { 3173b8e80941Smrg /* The render target array index is provided in the thread payload as 3174b8e80941Smrg * bits 26:16 of r0.0. 3175b8e80941Smrg */ 3176b8e80941Smrg const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3177b8e80941Smrg bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3178b8e80941Smrg brw_imm_uw(0x7ff)); 3179b8e80941Smrg return idx; 3180b8e80941Smrg } else { 3181b8e80941Smrg /* Pre-SNB we only ever render into the first layer of the framebuffer 3182b8e80941Smrg * since layered rendering is not implemented. 3183b8e80941Smrg */ 3184b8e80941Smrg return brw_imm_ud(0); 3185b8e80941Smrg } 3186b8e80941Smrg} 3187b8e80941Smrg 3188b8e80941Smrg/** 3189b8e80941Smrg * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3190b8e80941Smrg * framebuffer at the current fragment coordinates and sample index. 3191b8e80941Smrg */ 3192b8e80941Smrgfs_inst * 3193b8e80941Smrgfs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3194b8e80941Smrg unsigned target) 3195b8e80941Smrg{ 3196b8e80941Smrg const struct gen_device_info *devinfo = bld.shader->devinfo; 3197b8e80941Smrg 3198b8e80941Smrg assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3199b8e80941Smrg const brw_wm_prog_key *wm_key = 3200b8e80941Smrg reinterpret_cast<const brw_wm_prog_key *>(key); 3201b8e80941Smrg assert(!wm_key->coherent_fb_fetch); 3202b8e80941Smrg const struct brw_wm_prog_data *wm_prog_data = 3203b8e80941Smrg brw_wm_prog_data(stage_prog_data); 3204b8e80941Smrg 3205b8e80941Smrg /* Calculate the surface index relative to the start of the texture binding 3206b8e80941Smrg * table block, since that's what the texturing messages expect. 3207b8e80941Smrg */ 3208b8e80941Smrg const unsigned surface = target + 3209b8e80941Smrg wm_prog_data->binding_table.render_target_read_start - 3210b8e80941Smrg wm_prog_data->base.binding_table.texture_start; 3211b8e80941Smrg 3212b8e80941Smrg /* Calculate the fragment coordinates. */ 3213b8e80941Smrg const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3214b8e80941Smrg bld.MOV(offset(coords, bld, 0), pixel_x); 3215b8e80941Smrg bld.MOV(offset(coords, bld, 1), pixel_y); 3216b8e80941Smrg bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3217b8e80941Smrg 3218b8e80941Smrg /* Calculate the sample index and MCS payload when multisampling. Luckily 3219b8e80941Smrg * the MCS fetch message behaves deterministically for UMS surfaces, so it 3220b8e80941Smrg * shouldn't be necessary to recompile based on whether the framebuffer is 3221b8e80941Smrg * CMS or UMS. 3222b8e80941Smrg */ 3223b8e80941Smrg if (wm_key->multisample_fbo && 3224b8e80941Smrg nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3225b8e80941Smrg nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 3226b8e80941Smrg 3227b8e80941Smrg const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3228b8e80941Smrg const fs_reg mcs = wm_key->multisample_fbo ? 3229b8e80941Smrg emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg(); 3230b8e80941Smrg 3231b8e80941Smrg /* Use either a normal or a CMS texel fetch message depending on whether 3232b8e80941Smrg * the framebuffer is single or multisample. On SKL+ use the wide CMS 3233b8e80941Smrg * message just in case the framebuffer uses 16x multisampling, it should 3234b8e80941Smrg * be equivalent to the normal CMS fetch for lower multisampling modes. 3235b8e80941Smrg */ 3236b8e80941Smrg const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : 3237b8e80941Smrg devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : 3238b8e80941Smrg SHADER_OPCODE_TXF_CMS_LOGICAL; 3239b8e80941Smrg 3240b8e80941Smrg /* Emit the instruction. */ 3241b8e80941Smrg fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 3242b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; 3243b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); 3244b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; 3245b8e80941Smrg srcs[TEX_LOGICAL_SRC_MCS] = mcs; 3246b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3247b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 3248b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); 3249b8e80941Smrg srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); 3250b8e80941Smrg 3251b8e80941Smrg fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3252b8e80941Smrg inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3253b8e80941Smrg 3254b8e80941Smrg return inst; 3255b8e80941Smrg} 3256b8e80941Smrg 3257b8e80941Smrg/** 3258b8e80941Smrg * Actual coherent framebuffer read implemented using the native render target 3259b8e80941Smrg * read message. Requires SKL+. 3260b8e80941Smrg */ 3261b8e80941Smrgstatic fs_inst * 3262b8e80941Smrgemit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3263b8e80941Smrg{ 3264b8e80941Smrg assert(bld.shader->devinfo->gen >= 9); 3265b8e80941Smrg fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3266b8e80941Smrg inst->target = target; 3267b8e80941Smrg inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3268b8e80941Smrg 3269b8e80941Smrg return inst; 3270b8e80941Smrg} 3271b8e80941Smrg 3272b8e80941Smrgstatic fs_reg 3273b8e80941Smrgalloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3274b8e80941Smrg{ 3275b8e80941Smrg if (n && regs[0].file != BAD_FILE) { 3276b8e80941Smrg return regs[0]; 3277b8e80941Smrg 3278b8e80941Smrg } else { 3279b8e80941Smrg const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3280b8e80941Smrg 3281b8e80941Smrg for (unsigned i = 0; i < n; i++) 3282b8e80941Smrg regs[i] = tmp; 3283b8e80941Smrg 3284b8e80941Smrg return tmp; 3285b8e80941Smrg } 3286b8e80941Smrg} 3287b8e80941Smrg 3288b8e80941Smrgstatic fs_reg 3289b8e80941Smrgalloc_frag_output(fs_visitor *v, unsigned location) 3290b8e80941Smrg{ 3291b8e80941Smrg assert(v->stage == MESA_SHADER_FRAGMENT); 3292b8e80941Smrg const brw_wm_prog_key *const key = 3293b8e80941Smrg reinterpret_cast<const brw_wm_prog_key *>(v->key); 3294b8e80941Smrg const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3295b8e80941Smrg const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3296b8e80941Smrg 3297b8e80941Smrg if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3298b8e80941Smrg return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3299b8e80941Smrg 3300b8e80941Smrg else if (l == FRAG_RESULT_COLOR) 3301b8e80941Smrg return alloc_temporary(v->bld, 4, v->outputs, 3302b8e80941Smrg MAX2(key->nr_color_regions, 1)); 3303b8e80941Smrg 3304b8e80941Smrg else if (l == FRAG_RESULT_DEPTH) 3305b8e80941Smrg return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3306b8e80941Smrg 3307b8e80941Smrg else if (l == FRAG_RESULT_STENCIL) 3308b8e80941Smrg return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3309b8e80941Smrg 3310b8e80941Smrg else if (l == FRAG_RESULT_SAMPLE_MASK) 3311b8e80941Smrg return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3312b8e80941Smrg 3313b8e80941Smrg else if (l >= FRAG_RESULT_DATA0 && 3314b8e80941Smrg l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3315b8e80941Smrg return alloc_temporary(v->bld, 4, 3316b8e80941Smrg &v->outputs[l - FRAG_RESULT_DATA0], 1); 3317b8e80941Smrg 3318b8e80941Smrg else 3319b8e80941Smrg unreachable("Invalid location"); 3320b8e80941Smrg} 3321b8e80941Smrg 3322b8e80941Smrgvoid 3323b8e80941Smrgfs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3324b8e80941Smrg nir_intrinsic_instr *instr) 3325b8e80941Smrg{ 3326b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 3327b8e80941Smrg 3328b8e80941Smrg fs_reg dest; 3329b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3330b8e80941Smrg dest = get_nir_dest(instr->dest); 3331b8e80941Smrg 3332b8e80941Smrg switch (instr->intrinsic) { 3333b8e80941Smrg case nir_intrinsic_load_front_face: 3334b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3335b8e80941Smrg *emit_frontfacing_interpolation()); 3336b8e80941Smrg break; 3337b8e80941Smrg 3338b8e80941Smrg case nir_intrinsic_load_sample_pos: { 3339b8e80941Smrg fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3340b8e80941Smrg assert(sample_pos.file != BAD_FILE); 3341b8e80941Smrg dest.type = sample_pos.type; 3342b8e80941Smrg bld.MOV(dest, sample_pos); 3343b8e80941Smrg bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3344b8e80941Smrg break; 3345b8e80941Smrg } 3346b8e80941Smrg 3347b8e80941Smrg case nir_intrinsic_load_layer_id: 3348b8e80941Smrg dest.type = BRW_REGISTER_TYPE_UD; 3349b8e80941Smrg bld.MOV(dest, fetch_render_target_array_index(bld)); 3350b8e80941Smrg break; 3351b8e80941Smrg 3352b8e80941Smrg case nir_intrinsic_load_helper_invocation: 3353b8e80941Smrg case nir_intrinsic_load_sample_mask_in: 3354b8e80941Smrg case nir_intrinsic_load_sample_id: { 3355b8e80941Smrg gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3356b8e80941Smrg fs_reg val = nir_system_values[sv]; 3357b8e80941Smrg assert(val.file != BAD_FILE); 3358b8e80941Smrg dest.type = val.type; 3359b8e80941Smrg bld.MOV(dest, val); 3360b8e80941Smrg break; 3361b8e80941Smrg } 3362b8e80941Smrg 3363b8e80941Smrg case nir_intrinsic_store_output: { 3364b8e80941Smrg const fs_reg src = get_nir_src(instr->src[0]); 3365b8e80941Smrg const unsigned store_offset = nir_src_as_uint(instr->src[1]); 3366b8e80941Smrg const unsigned location = nir_intrinsic_base(instr) + 3367b8e80941Smrg SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); 3368b8e80941Smrg const fs_reg new_dest = retype(alloc_frag_output(this, location), 3369b8e80941Smrg src.type); 3370b8e80941Smrg 3371b8e80941Smrg for (unsigned j = 0; j < instr->num_components; j++) 3372b8e80941Smrg bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3373b8e80941Smrg offset(src, bld, j)); 3374b8e80941Smrg 3375b8e80941Smrg break; 3376b8e80941Smrg } 3377b8e80941Smrg 3378b8e80941Smrg case nir_intrinsic_load_output: { 3379b8e80941Smrg const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3380b8e80941Smrg BRW_NIR_FRAG_OUTPUT_LOCATION); 3381b8e80941Smrg assert(l >= FRAG_RESULT_DATA0); 3382b8e80941Smrg const unsigned load_offset = nir_src_as_uint(instr->src[0]); 3383b8e80941Smrg const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; 3384b8e80941Smrg const fs_reg tmp = bld.vgrf(dest.type, 4); 3385b8e80941Smrg 3386b8e80941Smrg if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3387b8e80941Smrg emit_coherent_fb_read(bld, tmp, target); 3388b8e80941Smrg else 3389b8e80941Smrg emit_non_coherent_fb_read(bld, tmp, target); 3390b8e80941Smrg 3391b8e80941Smrg for (unsigned j = 0; j < instr->num_components; j++) { 3392b8e80941Smrg bld.MOV(offset(dest, bld, j), 3393b8e80941Smrg offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3394b8e80941Smrg } 3395b8e80941Smrg 3396b8e80941Smrg break; 3397b8e80941Smrg } 3398b8e80941Smrg 3399b8e80941Smrg case nir_intrinsic_discard: 3400b8e80941Smrg case nir_intrinsic_discard_if: { 3401b8e80941Smrg /* We track our discarded pixels in f0.1. By predicating on it, we can 3402b8e80941Smrg * update just the flag bits that aren't yet discarded. If there's no 3403b8e80941Smrg * condition, we emit a CMP of g0 != g0, so all currently executing 3404b8e80941Smrg * channels will get turned off. 3405b8e80941Smrg */ 3406b8e80941Smrg fs_inst *cmp; 3407b8e80941Smrg if (instr->intrinsic == nir_intrinsic_discard_if) { 3408b8e80941Smrg cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3409b8e80941Smrg brw_imm_d(0), BRW_CONDITIONAL_Z); 3410b8e80941Smrg } else { 3411b8e80941Smrg fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3412b8e80941Smrg BRW_REGISTER_TYPE_UW)); 3413b8e80941Smrg cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3414b8e80941Smrg } 3415b8e80941Smrg cmp->predicate = BRW_PREDICATE_NORMAL; 3416b8e80941Smrg cmp->flag_subreg = 1; 3417b8e80941Smrg 3418b8e80941Smrg if (devinfo->gen >= 6) { 3419b8e80941Smrg emit_discard_jump(); 3420b8e80941Smrg } 3421b8e80941Smrg 3422b8e80941Smrg limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode."); 3423b8e80941Smrg break; 3424b8e80941Smrg } 3425b8e80941Smrg 3426b8e80941Smrg case nir_intrinsic_load_input: { 3427b8e80941Smrg /* load_input is only used for flat inputs */ 3428b8e80941Smrg unsigned base = nir_intrinsic_base(instr); 3429b8e80941Smrg unsigned comp = nir_intrinsic_component(instr); 3430b8e80941Smrg unsigned num_components = instr->num_components; 3431b8e80941Smrg fs_reg orig_dest = dest; 3432b8e80941Smrg enum brw_reg_type type = dest.type; 3433b8e80941Smrg 3434b8e80941Smrg /* Special case fields in the VUE header */ 3435b8e80941Smrg if (base == VARYING_SLOT_LAYER) 3436b8e80941Smrg comp = 1; 3437b8e80941Smrg else if (base == VARYING_SLOT_VIEWPORT) 3438b8e80941Smrg comp = 2; 3439b8e80941Smrg 3440b8e80941Smrg if (nir_dest_bit_size(instr->dest) == 64) { 3441b8e80941Smrg /* const_index is in 32-bit type size units that could not be aligned 3442b8e80941Smrg * with DF. We need to read the double vector as if it was a float 3443b8e80941Smrg * vector of twice the number of components to fetch the right data. 3444b8e80941Smrg */ 3445b8e80941Smrg type = BRW_REGISTER_TYPE_F; 3446b8e80941Smrg num_components *= 2; 3447b8e80941Smrg dest = bld.vgrf(type, num_components); 3448b8e80941Smrg } 3449b8e80941Smrg 3450b8e80941Smrg for (unsigned int i = 0; i < num_components; i++) { 3451b8e80941Smrg bld.MOV(offset(retype(dest, type), bld, i), 3452b8e80941Smrg retype(component(interp_reg(base, comp + i), 3), type)); 3453b8e80941Smrg } 3454b8e80941Smrg 3455b8e80941Smrg if (nir_dest_bit_size(instr->dest) == 64) { 3456b8e80941Smrg shuffle_from_32bit_read(bld, orig_dest, dest, 0, 3457b8e80941Smrg instr->num_components); 3458b8e80941Smrg } 3459b8e80941Smrg break; 3460b8e80941Smrg } 3461b8e80941Smrg 3462b8e80941Smrg case nir_intrinsic_load_barycentric_pixel: 3463b8e80941Smrg case nir_intrinsic_load_barycentric_centroid: 3464b8e80941Smrg case nir_intrinsic_load_barycentric_sample: 3465b8e80941Smrg /* Do nothing - load_interpolated_input handling will handle it later. */ 3466b8e80941Smrg break; 3467b8e80941Smrg 3468b8e80941Smrg case nir_intrinsic_load_barycentric_at_sample: { 3469b8e80941Smrg const glsl_interp_mode interpolation = 3470b8e80941Smrg (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3471b8e80941Smrg 3472b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 3473b8e80941Smrg unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; 3474b8e80941Smrg 3475b8e80941Smrg emit_pixel_interpolater_send(bld, 3476b8e80941Smrg FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3477b8e80941Smrg dest, 3478b8e80941Smrg fs_reg(), /* src */ 3479b8e80941Smrg brw_imm_ud(msg_data), 3480b8e80941Smrg interpolation); 3481b8e80941Smrg } else { 3482b8e80941Smrg const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3483b8e80941Smrg BRW_REGISTER_TYPE_UD); 3484b8e80941Smrg 3485b8e80941Smrg if (nir_src_is_dynamically_uniform(instr->src[0])) { 3486b8e80941Smrg const fs_reg sample_id = bld.emit_uniformize(sample_src); 3487b8e80941Smrg const fs_reg msg_data = vgrf(glsl_type::uint_type); 3488b8e80941Smrg bld.exec_all().group(1, 0) 3489b8e80941Smrg .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3490b8e80941Smrg emit_pixel_interpolater_send(bld, 3491b8e80941Smrg FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3492b8e80941Smrg dest, 3493b8e80941Smrg fs_reg(), /* src */ 3494b8e80941Smrg msg_data, 3495b8e80941Smrg interpolation); 3496b8e80941Smrg } else { 3497b8e80941Smrg /* Make a loop that sends a message to the pixel interpolater 3498b8e80941Smrg * for the sample number in each live channel. If there are 3499b8e80941Smrg * multiple channels with the same sample number then these 3500b8e80941Smrg * will be handled simultaneously with a single interation of 3501b8e80941Smrg * the loop. 3502b8e80941Smrg */ 3503b8e80941Smrg bld.emit(BRW_OPCODE_DO); 3504b8e80941Smrg 3505b8e80941Smrg /* Get the next live sample number into sample_id_reg */ 3506b8e80941Smrg const fs_reg sample_id = bld.emit_uniformize(sample_src); 3507b8e80941Smrg 3508b8e80941Smrg /* Set the flag register so that we can perform the send 3509b8e80941Smrg * message on all channels that have the same sample number 3510b8e80941Smrg */ 3511b8e80941Smrg bld.CMP(bld.null_reg_ud(), 3512b8e80941Smrg sample_src, sample_id, 3513b8e80941Smrg BRW_CONDITIONAL_EQ); 3514b8e80941Smrg const fs_reg msg_data = vgrf(glsl_type::uint_type); 3515b8e80941Smrg bld.exec_all().group(1, 0) 3516b8e80941Smrg .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3517b8e80941Smrg fs_inst *inst = 3518b8e80941Smrg emit_pixel_interpolater_send(bld, 3519b8e80941Smrg FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3520b8e80941Smrg dest, 3521b8e80941Smrg fs_reg(), /* src */ 3522b8e80941Smrg component(msg_data, 0), 3523b8e80941Smrg interpolation); 3524b8e80941Smrg set_predicate(BRW_PREDICATE_NORMAL, inst); 3525b8e80941Smrg 3526b8e80941Smrg /* Continue the loop if there are any live channels left */ 3527b8e80941Smrg set_predicate_inv(BRW_PREDICATE_NORMAL, 3528b8e80941Smrg true, /* inverse */ 3529b8e80941Smrg bld.emit(BRW_OPCODE_WHILE)); 3530b8e80941Smrg } 3531b8e80941Smrg } 3532b8e80941Smrg break; 3533b8e80941Smrg } 3534b8e80941Smrg 3535b8e80941Smrg case nir_intrinsic_load_barycentric_at_offset: { 3536b8e80941Smrg const glsl_interp_mode interpolation = 3537b8e80941Smrg (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3538b8e80941Smrg 3539b8e80941Smrg nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3540b8e80941Smrg 3541b8e80941Smrg if (const_offset) { 3542b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) == 32); 3543b8e80941Smrg unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; 3544b8e80941Smrg unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf; 3545b8e80941Smrg 3546b8e80941Smrg emit_pixel_interpolater_send(bld, 3547b8e80941Smrg FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3548b8e80941Smrg dest, 3549b8e80941Smrg fs_reg(), /* src */ 3550b8e80941Smrg brw_imm_ud(off_x | (off_y << 4)), 3551b8e80941Smrg interpolation); 3552b8e80941Smrg } else { 3553b8e80941Smrg fs_reg src = vgrf(glsl_type::ivec2_type); 3554b8e80941Smrg fs_reg offset_src = retype(get_nir_src(instr->src[0]), 3555b8e80941Smrg BRW_REGISTER_TYPE_F); 3556b8e80941Smrg for (int i = 0; i < 2; i++) { 3557b8e80941Smrg fs_reg temp = vgrf(glsl_type::float_type); 3558b8e80941Smrg bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); 3559b8e80941Smrg fs_reg itemp = vgrf(glsl_type::int_type); 3560b8e80941Smrg /* float to int */ 3561b8e80941Smrg bld.MOV(itemp, temp); 3562b8e80941Smrg 3563b8e80941Smrg /* Clamp the upper end of the range to +7/16. 3564b8e80941Smrg * ARB_gpu_shader5 requires that we support a maximum offset 3565b8e80941Smrg * of +0.5, which isn't representable in a S0.4 value -- if 3566b8e80941Smrg * we didn't clamp it, we'd end up with -8/16, which is the 3567b8e80941Smrg * opposite of what the shader author wanted. 3568b8e80941Smrg * 3569b8e80941Smrg * This is legal due to ARB_gpu_shader5's quantization 3570b8e80941Smrg * rules: 3571b8e80941Smrg * 3572b8e80941Smrg * "Not all values of <offset> may be supported; x and y 3573b8e80941Smrg * offsets may be rounded to fixed-point values with the 3574b8e80941Smrg * number of fraction bits given by the 3575b8e80941Smrg * implementation-dependent constant 3576b8e80941Smrg * FRAGMENT_INTERPOLATION_OFFSET_BITS" 3577b8e80941Smrg */ 3578b8e80941Smrg set_condmod(BRW_CONDITIONAL_L, 3579b8e80941Smrg bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); 3580b8e80941Smrg } 3581b8e80941Smrg 3582b8e80941Smrg const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3583b8e80941Smrg emit_pixel_interpolater_send(bld, 3584b8e80941Smrg opcode, 3585b8e80941Smrg dest, 3586b8e80941Smrg src, 3587b8e80941Smrg brw_imm_ud(0u), 3588b8e80941Smrg interpolation); 3589b8e80941Smrg } 3590b8e80941Smrg break; 3591b8e80941Smrg } 3592b8e80941Smrg 3593b8e80941Smrg case nir_intrinsic_load_interpolated_input: { 3594b8e80941Smrg if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { 3595b8e80941Smrg emit_fragcoord_interpolation(dest); 3596b8e80941Smrg break; 3597b8e80941Smrg } 3598b8e80941Smrg 3599b8e80941Smrg assert(instr->src[0].ssa && 3600b8e80941Smrg instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3601b8e80941Smrg nir_intrinsic_instr *bary_intrinsic = 3602b8e80941Smrg nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3603b8e80941Smrg nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3604b8e80941Smrg enum glsl_interp_mode interp_mode = 3605b8e80941Smrg (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3606b8e80941Smrg fs_reg dst_xy; 3607b8e80941Smrg 3608b8e80941Smrg if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3609b8e80941Smrg bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3610b8e80941Smrg /* Use the result of the PI message */ 3611b8e80941Smrg dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3612b8e80941Smrg } else { 3613b8e80941Smrg /* Use the delta_xy values computed from the payload */ 3614b8e80941Smrg enum brw_barycentric_mode bary = 3615b8e80941Smrg brw_barycentric_mode(interp_mode, bary_intrin); 3616b8e80941Smrg 3617b8e80941Smrg dst_xy = this->delta_xy[bary]; 3618b8e80941Smrg } 3619b8e80941Smrg 3620b8e80941Smrg for (unsigned int i = 0; i < instr->num_components; i++) { 3621b8e80941Smrg fs_reg interp = 3622b8e80941Smrg component(interp_reg(nir_intrinsic_base(instr), 3623b8e80941Smrg nir_intrinsic_component(instr) + i), 0); 3624b8e80941Smrg interp.type = BRW_REGISTER_TYPE_F; 3625b8e80941Smrg dest.type = BRW_REGISTER_TYPE_F; 3626b8e80941Smrg 3627b8e80941Smrg if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3628b8e80941Smrg fs_reg tmp = vgrf(glsl_type::float_type); 3629b8e80941Smrg bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3630b8e80941Smrg bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3631b8e80941Smrg } else { 3632b8e80941Smrg bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3633b8e80941Smrg } 3634b8e80941Smrg } 3635b8e80941Smrg break; 3636b8e80941Smrg } 3637b8e80941Smrg 3638b8e80941Smrg default: 3639b8e80941Smrg nir_emit_intrinsic(bld, instr); 3640b8e80941Smrg break; 3641b8e80941Smrg } 3642b8e80941Smrg} 3643b8e80941Smrg 3644b8e80941Smrgstatic int 3645b8e80941Smrgget_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) 3646b8e80941Smrg{ 3647b8e80941Smrg if (nir_src_is_const(instr->src[src])) { 3648b8e80941Smrg int64_t add_val = nir_src_as_int(instr->src[src]); 3649b8e80941Smrg if (add_val == 1) 3650b8e80941Smrg return BRW_AOP_INC; 3651b8e80941Smrg else if (add_val == -1) 3652b8e80941Smrg return BRW_AOP_DEC; 3653b8e80941Smrg } 3654b8e80941Smrg 3655b8e80941Smrg return BRW_AOP_ADD; 3656b8e80941Smrg} 3657b8e80941Smrg 3658b8e80941Smrgvoid 3659b8e80941Smrgfs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3660b8e80941Smrg nir_intrinsic_instr *instr) 3661b8e80941Smrg{ 3662b8e80941Smrg assert(stage == MESA_SHADER_COMPUTE); 3663b8e80941Smrg struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3664b8e80941Smrg 3665b8e80941Smrg fs_reg dest; 3666b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3667b8e80941Smrg dest = get_nir_dest(instr->dest); 3668b8e80941Smrg 3669b8e80941Smrg switch (instr->intrinsic) { 3670b8e80941Smrg case nir_intrinsic_barrier: 3671b8e80941Smrg emit_barrier(); 3672b8e80941Smrg cs_prog_data->uses_barrier = true; 3673b8e80941Smrg break; 3674b8e80941Smrg 3675b8e80941Smrg case nir_intrinsic_load_subgroup_id: 3676b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3677b8e80941Smrg break; 3678b8e80941Smrg 3679b8e80941Smrg case nir_intrinsic_load_local_invocation_id: 3680b8e80941Smrg case nir_intrinsic_load_work_group_id: { 3681b8e80941Smrg gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3682b8e80941Smrg fs_reg val = nir_system_values[sv]; 3683b8e80941Smrg assert(val.file != BAD_FILE); 3684b8e80941Smrg dest.type = val.type; 3685b8e80941Smrg for (unsigned i = 0; i < 3; i++) 3686b8e80941Smrg bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3687b8e80941Smrg break; 3688b8e80941Smrg } 3689b8e80941Smrg 3690b8e80941Smrg case nir_intrinsic_load_num_work_groups: { 3691b8e80941Smrg const unsigned surface = 3692b8e80941Smrg cs_prog_data->binding_table.work_groups_start; 3693b8e80941Smrg 3694b8e80941Smrg cs_prog_data->uses_num_work_groups = true; 3695b8e80941Smrg 3696b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3697b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3698b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3699b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */ 3700b8e80941Smrg 3701b8e80941Smrg /* Read the 3 GLuint components of gl_NumWorkGroups */ 3702b8e80941Smrg for (unsigned i = 0; i < 3; i++) { 3703b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2); 3704b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3705b8e80941Smrg offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS); 3706b8e80941Smrg } 3707b8e80941Smrg break; 3708b8e80941Smrg } 3709b8e80941Smrg 3710b8e80941Smrg case nir_intrinsic_shared_atomic_add: 3711b8e80941Smrg nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr); 3712b8e80941Smrg break; 3713b8e80941Smrg case nir_intrinsic_shared_atomic_imin: 3714b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); 3715b8e80941Smrg break; 3716b8e80941Smrg case nir_intrinsic_shared_atomic_umin: 3717b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); 3718b8e80941Smrg break; 3719b8e80941Smrg case nir_intrinsic_shared_atomic_imax: 3720b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); 3721b8e80941Smrg break; 3722b8e80941Smrg case nir_intrinsic_shared_atomic_umax: 3723b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); 3724b8e80941Smrg break; 3725b8e80941Smrg case nir_intrinsic_shared_atomic_and: 3726b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); 3727b8e80941Smrg break; 3728b8e80941Smrg case nir_intrinsic_shared_atomic_or: 3729b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); 3730b8e80941Smrg break; 3731b8e80941Smrg case nir_intrinsic_shared_atomic_xor: 3732b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); 3733b8e80941Smrg break; 3734b8e80941Smrg case nir_intrinsic_shared_atomic_exchange: 3735b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); 3736b8e80941Smrg break; 3737b8e80941Smrg case nir_intrinsic_shared_atomic_comp_swap: 3738b8e80941Smrg nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); 3739b8e80941Smrg break; 3740b8e80941Smrg case nir_intrinsic_shared_atomic_fmin: 3741b8e80941Smrg nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr); 3742b8e80941Smrg break; 3743b8e80941Smrg case nir_intrinsic_shared_atomic_fmax: 3744b8e80941Smrg nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr); 3745b8e80941Smrg break; 3746b8e80941Smrg case nir_intrinsic_shared_atomic_fcomp_swap: 3747b8e80941Smrg nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr); 3748b8e80941Smrg break; 3749b8e80941Smrg 3750b8e80941Smrg case nir_intrinsic_load_shared: { 3751b8e80941Smrg assert(devinfo->gen >= 7); 3752b8e80941Smrg assert(stage == MESA_SHADER_COMPUTE); 3753b8e80941Smrg 3754b8e80941Smrg const unsigned bit_size = nir_dest_bit_size(instr->dest); 3755b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3756b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 3757b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); 3758b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3759b8e80941Smrg 3760b8e80941Smrg /* Make dest unsigned because that's what the temporary will be */ 3761b8e80941Smrg dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3762b8e80941Smrg 3763b8e80941Smrg /* Read the vector */ 3764b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 3765b8e80941Smrg assert(nir_dest_bit_size(instr->dest) == 32); 3766b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3767b8e80941Smrg fs_inst *inst = 3768b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3769b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3770b8e80941Smrg inst->size_written = instr->num_components * dispatch_width * 4; 3771b8e80941Smrg } else { 3772b8e80941Smrg assert(nir_dest_bit_size(instr->dest) <= 32); 3773b8e80941Smrg assert(nir_dest_num_components(instr->dest) == 1); 3774b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3775b8e80941Smrg 3776b8e80941Smrg fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 3777b8e80941Smrg bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 3778b8e80941Smrg read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 3779b8e80941Smrg bld.MOV(dest, read_result); 3780b8e80941Smrg } 3781b8e80941Smrg break; 3782b8e80941Smrg } 3783b8e80941Smrg 3784b8e80941Smrg case nir_intrinsic_store_shared: { 3785b8e80941Smrg assert(devinfo->gen >= 7); 3786b8e80941Smrg assert(stage == MESA_SHADER_COMPUTE); 3787b8e80941Smrg 3788b8e80941Smrg const unsigned bit_size = nir_src_bit_size(instr->src[0]); 3789b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3790b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 3791b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 3792b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3793b8e80941Smrg 3794b8e80941Smrg fs_reg data = get_nir_src(instr->src[0]); 3795b8e80941Smrg data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3796b8e80941Smrg 3797b8e80941Smrg assert(nir_intrinsic_write_mask(instr) == 3798b8e80941Smrg (1u << instr->num_components) - 1); 3799b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 3800b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) == 32); 3801b8e80941Smrg assert(nir_src_num_components(instr->src[0]) <= 4); 3802b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 3803b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3804b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 3805b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3806b8e80941Smrg } else { 3807b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) <= 32); 3808b8e80941Smrg assert(nir_src_num_components(instr->src[0]) == 1); 3809b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3810b8e80941Smrg 3811b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 3812b8e80941Smrg bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 3813b8e80941Smrg 3814b8e80941Smrg bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 3815b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3816b8e80941Smrg } 3817b8e80941Smrg break; 3818b8e80941Smrg } 3819b8e80941Smrg 3820b8e80941Smrg default: 3821b8e80941Smrg nir_emit_intrinsic(bld, instr); 3822b8e80941Smrg break; 3823b8e80941Smrg } 3824b8e80941Smrg} 3825b8e80941Smrg 3826b8e80941Smrgstatic fs_reg 3827b8e80941Smrgbrw_nir_reduction_op_identity(const fs_builder &bld, 3828b8e80941Smrg nir_op op, brw_reg_type type) 3829b8e80941Smrg{ 3830b8e80941Smrg nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 3831b8e80941Smrg switch (type_sz(type)) { 3832b8e80941Smrg case 2: 3833b8e80941Smrg assert(type != BRW_REGISTER_TYPE_HF); 3834b8e80941Smrg return retype(brw_imm_uw(value.u16), type); 3835b8e80941Smrg case 4: 3836b8e80941Smrg return retype(brw_imm_ud(value.u32), type); 3837b8e80941Smrg case 8: 3838b8e80941Smrg if (type == BRW_REGISTER_TYPE_DF) 3839b8e80941Smrg return setup_imm_df(bld, value.f64); 3840b8e80941Smrg else 3841b8e80941Smrg return retype(brw_imm_u64(value.u64), type); 3842b8e80941Smrg default: 3843b8e80941Smrg unreachable("Invalid type size"); 3844b8e80941Smrg } 3845b8e80941Smrg} 3846b8e80941Smrg 3847b8e80941Smrgstatic opcode 3848b8e80941Smrgbrw_op_for_nir_reduction_op(nir_op op) 3849b8e80941Smrg{ 3850b8e80941Smrg switch (op) { 3851b8e80941Smrg case nir_op_iadd: return BRW_OPCODE_ADD; 3852b8e80941Smrg case nir_op_fadd: return BRW_OPCODE_ADD; 3853b8e80941Smrg case nir_op_imul: return BRW_OPCODE_MUL; 3854b8e80941Smrg case nir_op_fmul: return BRW_OPCODE_MUL; 3855b8e80941Smrg case nir_op_imin: return BRW_OPCODE_SEL; 3856b8e80941Smrg case nir_op_umin: return BRW_OPCODE_SEL; 3857b8e80941Smrg case nir_op_fmin: return BRW_OPCODE_SEL; 3858b8e80941Smrg case nir_op_imax: return BRW_OPCODE_SEL; 3859b8e80941Smrg case nir_op_umax: return BRW_OPCODE_SEL; 3860b8e80941Smrg case nir_op_fmax: return BRW_OPCODE_SEL; 3861b8e80941Smrg case nir_op_iand: return BRW_OPCODE_AND; 3862b8e80941Smrg case nir_op_ior: return BRW_OPCODE_OR; 3863b8e80941Smrg case nir_op_ixor: return BRW_OPCODE_XOR; 3864b8e80941Smrg default: 3865b8e80941Smrg unreachable("Invalid reduction operation"); 3866b8e80941Smrg } 3867b8e80941Smrg} 3868b8e80941Smrg 3869b8e80941Smrgstatic brw_conditional_mod 3870b8e80941Smrgbrw_cond_mod_for_nir_reduction_op(nir_op op) 3871b8e80941Smrg{ 3872b8e80941Smrg switch (op) { 3873b8e80941Smrg case nir_op_iadd: return BRW_CONDITIONAL_NONE; 3874b8e80941Smrg case nir_op_fadd: return BRW_CONDITIONAL_NONE; 3875b8e80941Smrg case nir_op_imul: return BRW_CONDITIONAL_NONE; 3876b8e80941Smrg case nir_op_fmul: return BRW_CONDITIONAL_NONE; 3877b8e80941Smrg case nir_op_imin: return BRW_CONDITIONAL_L; 3878b8e80941Smrg case nir_op_umin: return BRW_CONDITIONAL_L; 3879b8e80941Smrg case nir_op_fmin: return BRW_CONDITIONAL_L; 3880b8e80941Smrg case nir_op_imax: return BRW_CONDITIONAL_GE; 3881b8e80941Smrg case nir_op_umax: return BRW_CONDITIONAL_GE; 3882b8e80941Smrg case nir_op_fmax: return BRW_CONDITIONAL_GE; 3883b8e80941Smrg case nir_op_iand: return BRW_CONDITIONAL_NONE; 3884b8e80941Smrg case nir_op_ior: return BRW_CONDITIONAL_NONE; 3885b8e80941Smrg case nir_op_ixor: return BRW_CONDITIONAL_NONE; 3886b8e80941Smrg default: 3887b8e80941Smrg unreachable("Invalid reduction operation"); 3888b8e80941Smrg } 3889b8e80941Smrg} 3890b8e80941Smrg 3891b8e80941Smrgfs_reg 3892b8e80941Smrgfs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 3893b8e80941Smrg nir_intrinsic_instr *instr) 3894b8e80941Smrg{ 3895b8e80941Smrg fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 3896b8e80941Smrg 3897b8e80941Smrg if (stage_prog_data->binding_table.image_start > 0) { 3898b8e80941Smrg if (image.file == BRW_IMMEDIATE_VALUE) { 3899b8e80941Smrg image.d += stage_prog_data->binding_table.image_start; 3900b8e80941Smrg } else { 3901b8e80941Smrg bld.ADD(image, image, 3902b8e80941Smrg brw_imm_d(stage_prog_data->binding_table.image_start)); 3903b8e80941Smrg } 3904b8e80941Smrg } 3905b8e80941Smrg 3906b8e80941Smrg return bld.emit_uniformize(image); 3907b8e80941Smrg} 3908b8e80941Smrg 3909b8e80941Smrgfs_reg 3910b8e80941Smrgfs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, 3911b8e80941Smrg nir_intrinsic_instr *instr) 3912b8e80941Smrg{ 3913b8e80941Smrg /* SSBO stores are weird in that their index is in src[1] */ 3914b8e80941Smrg const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0; 3915b8e80941Smrg 3916b8e80941Smrg fs_reg surf_index; 3917b8e80941Smrg if (nir_src_is_const(instr->src[src])) { 3918b8e80941Smrg unsigned index = stage_prog_data->binding_table.ssbo_start + 3919b8e80941Smrg nir_src_as_uint(instr->src[src]); 3920b8e80941Smrg surf_index = brw_imm_ud(index); 3921b8e80941Smrg } else { 3922b8e80941Smrg surf_index = vgrf(glsl_type::uint_type); 3923b8e80941Smrg bld.ADD(surf_index, get_nir_src(instr->src[src]), 3924b8e80941Smrg brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); 3925b8e80941Smrg } 3926b8e80941Smrg 3927b8e80941Smrg return bld.emit_uniformize(surf_index); 3928b8e80941Smrg} 3929b8e80941Smrg 3930b8e80941Smrgstatic unsigned 3931b8e80941Smrgimage_intrinsic_coord_components(nir_intrinsic_instr *instr) 3932b8e80941Smrg{ 3933b8e80941Smrg switch (nir_intrinsic_image_dim(instr)) { 3934b8e80941Smrg case GLSL_SAMPLER_DIM_1D: 3935b8e80941Smrg return 1 + nir_intrinsic_image_array(instr); 3936b8e80941Smrg case GLSL_SAMPLER_DIM_2D: 3937b8e80941Smrg case GLSL_SAMPLER_DIM_RECT: 3938b8e80941Smrg return 2 + nir_intrinsic_image_array(instr); 3939b8e80941Smrg case GLSL_SAMPLER_DIM_3D: 3940b8e80941Smrg case GLSL_SAMPLER_DIM_CUBE: 3941b8e80941Smrg return 3; 3942b8e80941Smrg case GLSL_SAMPLER_DIM_BUF: 3943b8e80941Smrg return 1; 3944b8e80941Smrg case GLSL_SAMPLER_DIM_MS: 3945b8e80941Smrg return 2 + nir_intrinsic_image_array(instr); 3946b8e80941Smrg default: 3947b8e80941Smrg unreachable("Invalid image dimension"); 3948b8e80941Smrg } 3949b8e80941Smrg} 3950b8e80941Smrg 3951b8e80941Smrgvoid 3952b8e80941Smrgfs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 3953b8e80941Smrg{ 3954b8e80941Smrg fs_reg dest; 3955b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3956b8e80941Smrg dest = get_nir_dest(instr->dest); 3957b8e80941Smrg 3958b8e80941Smrg switch (instr->intrinsic) { 3959b8e80941Smrg case nir_intrinsic_image_load: 3960b8e80941Smrg case nir_intrinsic_image_store: 3961b8e80941Smrg case nir_intrinsic_image_atomic_add: 3962b8e80941Smrg case nir_intrinsic_image_atomic_min: 3963b8e80941Smrg case nir_intrinsic_image_atomic_max: 3964b8e80941Smrg case nir_intrinsic_image_atomic_and: 3965b8e80941Smrg case nir_intrinsic_image_atomic_or: 3966b8e80941Smrg case nir_intrinsic_image_atomic_xor: 3967b8e80941Smrg case nir_intrinsic_image_atomic_exchange: 3968b8e80941Smrg case nir_intrinsic_image_atomic_comp_swap: 3969b8e80941Smrg case nir_intrinsic_bindless_image_load: 3970b8e80941Smrg case nir_intrinsic_bindless_image_store: 3971b8e80941Smrg case nir_intrinsic_bindless_image_atomic_add: 3972b8e80941Smrg case nir_intrinsic_bindless_image_atomic_min: 3973b8e80941Smrg case nir_intrinsic_bindless_image_atomic_max: 3974b8e80941Smrg case nir_intrinsic_bindless_image_atomic_and: 3975b8e80941Smrg case nir_intrinsic_bindless_image_atomic_or: 3976b8e80941Smrg case nir_intrinsic_bindless_image_atomic_xor: 3977b8e80941Smrg case nir_intrinsic_bindless_image_atomic_exchange: 3978b8e80941Smrg case nir_intrinsic_bindless_image_atomic_comp_swap: { 3979b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT && 3980b8e80941Smrg instr->intrinsic != nir_intrinsic_image_load) 3981b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 3982b8e80941Smrg 3983b8e80941Smrg /* Get some metadata from the image intrinsic. */ 3984b8e80941Smrg const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 3985b8e80941Smrg const GLenum format = nir_intrinsic_format(instr); 3986b8e80941Smrg 3987b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3988b8e80941Smrg 3989b8e80941Smrg switch (instr->intrinsic) { 3990b8e80941Smrg case nir_intrinsic_image_load: 3991b8e80941Smrg case nir_intrinsic_image_store: 3992b8e80941Smrg case nir_intrinsic_image_atomic_add: 3993b8e80941Smrg case nir_intrinsic_image_atomic_min: 3994b8e80941Smrg case nir_intrinsic_image_atomic_max: 3995b8e80941Smrg case nir_intrinsic_image_atomic_and: 3996b8e80941Smrg case nir_intrinsic_image_atomic_or: 3997b8e80941Smrg case nir_intrinsic_image_atomic_xor: 3998b8e80941Smrg case nir_intrinsic_image_atomic_exchange: 3999b8e80941Smrg case nir_intrinsic_image_atomic_comp_swap: 4000b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4001b8e80941Smrg get_nir_image_intrinsic_image(bld, instr); 4002b8e80941Smrg break; 4003b8e80941Smrg 4004b8e80941Smrg default: 4005b8e80941Smrg /* Bindless */ 4006b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = 4007b8e80941Smrg bld.emit_uniformize(get_nir_src(instr->src[0])); 4008b8e80941Smrg break; 4009b8e80941Smrg } 4010b8e80941Smrg 4011b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4012b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 4013b8e80941Smrg brw_imm_ud(image_intrinsic_coord_components(instr)); 4014b8e80941Smrg 4015b8e80941Smrg /* Emit an image load, store or atomic op. */ 4016b8e80941Smrg if (instr->intrinsic == nir_intrinsic_image_load || 4017b8e80941Smrg instr->intrinsic == nir_intrinsic_bindless_image_load) { 4018b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4019b8e80941Smrg fs_inst *inst = 4020b8e80941Smrg bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 4021b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4022b8e80941Smrg inst->size_written = instr->num_components * dispatch_width * 4; 4023b8e80941Smrg } else if (instr->intrinsic == nir_intrinsic_image_store || 4024b8e80941Smrg instr->intrinsic == nir_intrinsic_bindless_image_store) { 4025b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4026b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); 4027b8e80941Smrg bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 4028b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4029b8e80941Smrg } else { 4030b8e80941Smrg int op; 4031b8e80941Smrg unsigned num_srcs = info->num_srcs; 4032b8e80941Smrg 4033b8e80941Smrg switch (instr->intrinsic) { 4034b8e80941Smrg case nir_intrinsic_image_atomic_add: 4035b8e80941Smrg case nir_intrinsic_bindless_image_atomic_add: 4036b8e80941Smrg assert(num_srcs == 4); 4037b8e80941Smrg 4038b8e80941Smrg op = get_op_for_atomic_add(instr, 3); 4039b8e80941Smrg 4040b8e80941Smrg if (op != BRW_AOP_ADD) 4041b8e80941Smrg num_srcs = 3; 4042b8e80941Smrg break; 4043b8e80941Smrg case nir_intrinsic_image_atomic_min: 4044b8e80941Smrg case nir_intrinsic_bindless_image_atomic_min: 4045b8e80941Smrg assert(format == GL_R32UI || format == GL_R32I); 4046b8e80941Smrg op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN; 4047b8e80941Smrg break; 4048b8e80941Smrg case nir_intrinsic_image_atomic_max: 4049b8e80941Smrg case nir_intrinsic_bindless_image_atomic_max: 4050b8e80941Smrg assert(format == GL_R32UI || format == GL_R32I); 4051b8e80941Smrg op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX; 4052b8e80941Smrg break; 4053b8e80941Smrg case nir_intrinsic_image_atomic_and: 4054b8e80941Smrg case nir_intrinsic_bindless_image_atomic_and: 4055b8e80941Smrg op = BRW_AOP_AND; 4056b8e80941Smrg break; 4057b8e80941Smrg case nir_intrinsic_image_atomic_or: 4058b8e80941Smrg case nir_intrinsic_bindless_image_atomic_or: 4059b8e80941Smrg op = BRW_AOP_OR; 4060b8e80941Smrg break; 4061b8e80941Smrg case nir_intrinsic_image_atomic_xor: 4062b8e80941Smrg case nir_intrinsic_bindless_image_atomic_xor: 4063b8e80941Smrg op = BRW_AOP_XOR; 4064b8e80941Smrg break; 4065b8e80941Smrg case nir_intrinsic_image_atomic_exchange: 4066b8e80941Smrg case nir_intrinsic_bindless_image_atomic_exchange: 4067b8e80941Smrg op = BRW_AOP_MOV; 4068b8e80941Smrg break; 4069b8e80941Smrg case nir_intrinsic_image_atomic_comp_swap: 4070b8e80941Smrg case nir_intrinsic_bindless_image_atomic_comp_swap: 4071b8e80941Smrg op = BRW_AOP_CMPWR; 4072b8e80941Smrg break; 4073b8e80941Smrg default: 4074b8e80941Smrg unreachable("Not reachable."); 4075b8e80941Smrg } 4076b8e80941Smrg 4077b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 4078b8e80941Smrg 4079b8e80941Smrg fs_reg data; 4080b8e80941Smrg if (num_srcs >= 4) 4081b8e80941Smrg data = get_nir_src(instr->src[3]); 4082b8e80941Smrg if (num_srcs >= 5) { 4083b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 4084b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[4]) }; 4085b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 4086b8e80941Smrg data = tmp; 4087b8e80941Smrg } 4088b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4089b8e80941Smrg 4090b8e80941Smrg bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, 4091b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4092b8e80941Smrg } 4093b8e80941Smrg break; 4094b8e80941Smrg } 4095b8e80941Smrg 4096b8e80941Smrg case nir_intrinsic_image_size: 4097b8e80941Smrg case nir_intrinsic_bindless_image_size: { 4098b8e80941Smrg /* Unlike the [un]typed load and store opcodes, the TXS that this turns 4099b8e80941Smrg * into will handle the binding table index for us in the geneerator. 4100b8e80941Smrg * Incidentally, this means that we can handle bindless with exactly the 4101b8e80941Smrg * same code. 4102b8e80941Smrg */ 4103b8e80941Smrg fs_reg image = retype(get_nir_src_imm(instr->src[0]), 4104b8e80941Smrg BRW_REGISTER_TYPE_UD); 4105b8e80941Smrg image = bld.emit_uniformize(image); 4106b8e80941Smrg 4107b8e80941Smrg fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 4108b8e80941Smrg if (instr->intrinsic == nir_intrinsic_image_size) 4109b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = image; 4110b8e80941Smrg else 4111b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; 4112b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); 4113b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); 4114b8e80941Smrg srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); 4115b8e80941Smrg 4116b8e80941Smrg /* Since the image size is always uniform, we can just emit a SIMD8 4117b8e80941Smrg * query instruction and splat the result out. 4118b8e80941Smrg */ 4119b8e80941Smrg const fs_builder ubld = bld.exec_all().group(8, 0); 4120b8e80941Smrg 4121b8e80941Smrg fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4122b8e80941Smrg fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, 4123b8e80941Smrg tmp, srcs, ARRAY_SIZE(srcs)); 4124b8e80941Smrg inst->size_written = 4 * REG_SIZE; 4125b8e80941Smrg 4126b8e80941Smrg for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { 4127b8e80941Smrg if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) { 4128b8e80941Smrg bld.emit(SHADER_OPCODE_INT_QUOTIENT, 4129b8e80941Smrg offset(retype(dest, tmp.type), bld, c), 4130b8e80941Smrg component(offset(tmp, ubld, c), 0), brw_imm_ud(6)); 4131b8e80941Smrg } else { 4132b8e80941Smrg bld.MOV(offset(retype(dest, tmp.type), bld, c), 4133b8e80941Smrg component(offset(tmp, ubld, c), 0)); 4134b8e80941Smrg } 4135b8e80941Smrg } 4136b8e80941Smrg break; 4137b8e80941Smrg } 4138b8e80941Smrg 4139b8e80941Smrg case nir_intrinsic_image_load_raw_intel: { 4140b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4141b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4142b8e80941Smrg get_nir_image_intrinsic_image(bld, instr); 4143b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4144b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4145b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4146b8e80941Smrg 4147b8e80941Smrg fs_inst *inst = 4148b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4149b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4150b8e80941Smrg inst->size_written = instr->num_components * dispatch_width * 4; 4151b8e80941Smrg break; 4152b8e80941Smrg } 4153b8e80941Smrg 4154b8e80941Smrg case nir_intrinsic_image_store_raw_intel: { 4155b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 4156b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 4157b8e80941Smrg 4158b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4159b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4160b8e80941Smrg get_nir_image_intrinsic_image(bld, instr); 4161b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4162b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); 4163b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4164b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4165b8e80941Smrg 4166b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4167b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4168b8e80941Smrg break; 4169b8e80941Smrg } 4170b8e80941Smrg 4171b8e80941Smrg case nir_intrinsic_group_memory_barrier: 4172b8e80941Smrg case nir_intrinsic_memory_barrier_shared: 4173b8e80941Smrg case nir_intrinsic_memory_barrier_atomic_counter: 4174b8e80941Smrg case nir_intrinsic_memory_barrier_buffer: 4175b8e80941Smrg case nir_intrinsic_memory_barrier_image: 4176b8e80941Smrg case nir_intrinsic_memory_barrier: { 4177b8e80941Smrg const fs_builder ubld = bld.group(8, 0); 4178b8e80941Smrg const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4179b8e80941Smrg ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, 4180b8e80941Smrg brw_vec8_grf(0, 0), brw_imm_ud(0)) 4181b8e80941Smrg ->size_written = 2 * REG_SIZE; 4182b8e80941Smrg break; 4183b8e80941Smrg } 4184b8e80941Smrg 4185b8e80941Smrg case nir_intrinsic_shader_clock: { 4186b8e80941Smrg /* We cannot do anything if there is an event, so ignore it for now */ 4187b8e80941Smrg const fs_reg shader_clock = get_timestamp(bld); 4188b8e80941Smrg const fs_reg srcs[] = { component(shader_clock, 0), 4189b8e80941Smrg component(shader_clock, 1) }; 4190b8e80941Smrg bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 4191b8e80941Smrg break; 4192b8e80941Smrg } 4193b8e80941Smrg 4194b8e80941Smrg case nir_intrinsic_image_samples: 4195b8e80941Smrg /* The driver does not support multi-sampled images. */ 4196b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); 4197b8e80941Smrg break; 4198b8e80941Smrg 4199b8e80941Smrg case nir_intrinsic_load_uniform: { 4200b8e80941Smrg /* Offsets are in bytes but they should always aligned to 4201b8e80941Smrg * the type size 4202b8e80941Smrg */ 4203b8e80941Smrg assert(instr->const_index[0] % 4 == 0 || 4204b8e80941Smrg instr->const_index[0] % type_sz(dest.type) == 0); 4205b8e80941Smrg 4206b8e80941Smrg fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); 4207b8e80941Smrg 4208b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 4209b8e80941Smrg unsigned load_offset = nir_src_as_uint(instr->src[0]); 4210b8e80941Smrg assert(load_offset % type_sz(dest.type) == 0); 4211b8e80941Smrg /* For 16-bit types we add the module of the const_index[0] 4212b8e80941Smrg * offset to access to not 32-bit aligned element 4213b8e80941Smrg */ 4214b8e80941Smrg src.offset = load_offset + instr->const_index[0] % 4; 4215b8e80941Smrg 4216b8e80941Smrg for (unsigned j = 0; j < instr->num_components; j++) { 4217b8e80941Smrg bld.MOV(offset(dest, bld, j), offset(src, bld, j)); 4218b8e80941Smrg } 4219b8e80941Smrg } else { 4220b8e80941Smrg fs_reg indirect = retype(get_nir_src(instr->src[0]), 4221b8e80941Smrg BRW_REGISTER_TYPE_UD); 4222b8e80941Smrg 4223b8e80941Smrg /* We need to pass a size to the MOV_INDIRECT but we don't want it to 4224b8e80941Smrg * go past the end of the uniform. In order to keep the n'th 4225b8e80941Smrg * component from running past, we subtract off the size of all but 4226b8e80941Smrg * one component of the vector. 4227b8e80941Smrg */ 4228b8e80941Smrg assert(instr->const_index[1] >= 4229b8e80941Smrg instr->num_components * (int) type_sz(dest.type)); 4230b8e80941Smrg unsigned read_size = instr->const_index[1] - 4231b8e80941Smrg (instr->num_components - 1) * type_sz(dest.type); 4232b8e80941Smrg 4233b8e80941Smrg bool supports_64bit_indirects = 4234b8e80941Smrg !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo); 4235b8e80941Smrg 4236b8e80941Smrg if (type_sz(dest.type) != 8 || supports_64bit_indirects) { 4237b8e80941Smrg for (unsigned j = 0; j < instr->num_components; j++) { 4238b8e80941Smrg bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4239b8e80941Smrg offset(dest, bld, j), offset(src, bld, j), 4240b8e80941Smrg indirect, brw_imm_ud(read_size)); 4241b8e80941Smrg } 4242b8e80941Smrg } else { 4243b8e80941Smrg const unsigned num_mov_indirects = 4244b8e80941Smrg type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); 4245b8e80941Smrg /* We read a little bit less per MOV INDIRECT, as they are now 4246b8e80941Smrg * 32-bits ones instead of 64-bit. Fix read_size then. 4247b8e80941Smrg */ 4248b8e80941Smrg const unsigned read_size_32bit = read_size - 4249b8e80941Smrg (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); 4250b8e80941Smrg for (unsigned j = 0; j < instr->num_components; j++) { 4251b8e80941Smrg for (unsigned i = 0; i < num_mov_indirects; i++) { 4252b8e80941Smrg bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4253b8e80941Smrg subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), 4254b8e80941Smrg subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), 4255b8e80941Smrg indirect, brw_imm_ud(read_size_32bit)); 4256b8e80941Smrg } 4257b8e80941Smrg } 4258b8e80941Smrg } 4259b8e80941Smrg } 4260b8e80941Smrg break; 4261b8e80941Smrg } 4262b8e80941Smrg 4263b8e80941Smrg case nir_intrinsic_load_ubo: { 4264b8e80941Smrg fs_reg surf_index; 4265b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 4266b8e80941Smrg const unsigned index = stage_prog_data->binding_table.ubo_start + 4267b8e80941Smrg nir_src_as_uint(instr->src[0]); 4268b8e80941Smrg surf_index = brw_imm_ud(index); 4269b8e80941Smrg } else { 4270b8e80941Smrg /* The block index is not a constant. Evaluate the index expression 4271b8e80941Smrg * per-channel and add the base UBO index; we have to select a value 4272b8e80941Smrg * from any live channel. 4273b8e80941Smrg */ 4274b8e80941Smrg surf_index = vgrf(glsl_type::uint_type); 4275b8e80941Smrg bld.ADD(surf_index, get_nir_src(instr->src[0]), 4276b8e80941Smrg brw_imm_ud(stage_prog_data->binding_table.ubo_start)); 4277b8e80941Smrg surf_index = bld.emit_uniformize(surf_index); 4278b8e80941Smrg } 4279b8e80941Smrg 4280b8e80941Smrg if (!nir_src_is_const(instr->src[1])) { 4281b8e80941Smrg fs_reg base_offset = retype(get_nir_src(instr->src[1]), 4282b8e80941Smrg BRW_REGISTER_TYPE_UD); 4283b8e80941Smrg 4284b8e80941Smrg for (int i = 0; i < instr->num_components; i++) 4285b8e80941Smrg VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, 4286b8e80941Smrg base_offset, i * type_sz(dest.type)); 4287b8e80941Smrg } else { 4288b8e80941Smrg /* Even if we are loading doubles, a pull constant load will load 4289b8e80941Smrg * a 32-bit vec4, so should only reserve vgrf space for that. If we 4290b8e80941Smrg * need to load a full dvec4 we will have to emit 2 loads. This is 4291b8e80941Smrg * similar to demote_pull_constants(), except that in that case we 4292b8e80941Smrg * see individual accesses to each component of the vector and then 4293b8e80941Smrg * we let CSE deal with duplicate loads. Here we see a vector access 4294b8e80941Smrg * and we have to split it if necessary. 4295b8e80941Smrg */ 4296b8e80941Smrg const unsigned type_size = type_sz(dest.type); 4297b8e80941Smrg const unsigned load_offset = nir_src_as_uint(instr->src[1]); 4298b8e80941Smrg 4299b8e80941Smrg /* See if we've selected this as a push constant candidate */ 4300b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 4301b8e80941Smrg const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 4302b8e80941Smrg const unsigned offset_256b = load_offset / 32; 4303b8e80941Smrg 4304b8e80941Smrg fs_reg push_reg; 4305b8e80941Smrg for (int i = 0; i < 4; i++) { 4306b8e80941Smrg const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 4307b8e80941Smrg if (range->block == ubo_block && 4308b8e80941Smrg offset_256b >= range->start && 4309b8e80941Smrg offset_256b < range->start + range->length) { 4310b8e80941Smrg 4311b8e80941Smrg push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); 4312b8e80941Smrg push_reg.offset = load_offset - 32 * range->start; 4313b8e80941Smrg break; 4314b8e80941Smrg } 4315b8e80941Smrg } 4316b8e80941Smrg 4317b8e80941Smrg if (push_reg.file != BAD_FILE) { 4318b8e80941Smrg for (unsigned i = 0; i < instr->num_components; i++) { 4319b8e80941Smrg bld.MOV(offset(dest, bld, i), 4320b8e80941Smrg byte_offset(push_reg, i * type_size)); 4321b8e80941Smrg } 4322b8e80941Smrg break; 4323b8e80941Smrg } 4324b8e80941Smrg } 4325b8e80941Smrg 4326b8e80941Smrg const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 4327b8e80941Smrg const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); 4328b8e80941Smrg const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4329b8e80941Smrg 4330b8e80941Smrg for (unsigned c = 0; c < instr->num_components;) { 4331b8e80941Smrg const unsigned base = load_offset + c * type_size; 4332b8e80941Smrg /* Number of usable components in the next block-aligned load. */ 4333b8e80941Smrg const unsigned count = MIN2(instr->num_components - c, 4334b8e80941Smrg (block_sz - base % block_sz) / type_size); 4335b8e80941Smrg 4336b8e80941Smrg ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 4337b8e80941Smrg packed_consts, surf_index, 4338b8e80941Smrg brw_imm_ud(base & ~(block_sz - 1))); 4339b8e80941Smrg 4340b8e80941Smrg const fs_reg consts = 4341b8e80941Smrg retype(byte_offset(packed_consts, base & (block_sz - 1)), 4342b8e80941Smrg dest.type); 4343b8e80941Smrg 4344b8e80941Smrg for (unsigned d = 0; d < count; d++) 4345b8e80941Smrg bld.MOV(offset(dest, bld, c + d), component(consts, d)); 4346b8e80941Smrg 4347b8e80941Smrg c += count; 4348b8e80941Smrg } 4349b8e80941Smrg } 4350b8e80941Smrg break; 4351b8e80941Smrg } 4352b8e80941Smrg 4353b8e80941Smrg case nir_intrinsic_load_global: { 4354b8e80941Smrg assert(devinfo->gen >= 8); 4355b8e80941Smrg 4356b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 4357b8e80941Smrg assert(nir_dest_bit_size(instr->dest) == 32); 4358b8e80941Smrg fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, 4359b8e80941Smrg dest, 4360b8e80941Smrg get_nir_src(instr->src[0]), /* Address */ 4361b8e80941Smrg fs_reg(), /* No source data */ 4362b8e80941Smrg brw_imm_ud(instr->num_components)); 4363b8e80941Smrg inst->size_written = instr->num_components * 4364b8e80941Smrg inst->dst.component_size(inst->exec_size); 4365b8e80941Smrg } else { 4366b8e80941Smrg const unsigned bit_size = nir_dest_bit_size(instr->dest); 4367b8e80941Smrg assert(bit_size <= 32); 4368b8e80941Smrg assert(nir_dest_num_components(instr->dest) == 1); 4369b8e80941Smrg brw_reg_type data_type = 4370b8e80941Smrg brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4371b8e80941Smrg fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4372b8e80941Smrg bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, 4373b8e80941Smrg tmp, 4374b8e80941Smrg get_nir_src(instr->src[0]), /* Address */ 4375b8e80941Smrg fs_reg(), /* No source data */ 4376b8e80941Smrg brw_imm_ud(bit_size)); 4377b8e80941Smrg bld.MOV(retype(dest, data_type), tmp); 4378b8e80941Smrg } 4379b8e80941Smrg break; 4380b8e80941Smrg } 4381b8e80941Smrg 4382b8e80941Smrg case nir_intrinsic_store_global: 4383b8e80941Smrg assert(devinfo->gen >= 8); 4384b8e80941Smrg 4385b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 4386b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 4387b8e80941Smrg 4388b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 4389b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) == 32); 4390b8e80941Smrg bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, 4391b8e80941Smrg fs_reg(), 4392b8e80941Smrg get_nir_src(instr->src[1]), /* Address */ 4393b8e80941Smrg get_nir_src(instr->src[0]), /* Data */ 4394b8e80941Smrg brw_imm_ud(instr->num_components)); 4395b8e80941Smrg } else { 4396b8e80941Smrg const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4397b8e80941Smrg assert(bit_size <= 32); 4398b8e80941Smrg assert(nir_src_num_components(instr->src[0]) == 1); 4399b8e80941Smrg brw_reg_type data_type = 4400b8e80941Smrg brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4401b8e80941Smrg fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4402b8e80941Smrg bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); 4403b8e80941Smrg bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, 4404b8e80941Smrg fs_reg(), 4405b8e80941Smrg get_nir_src(instr->src[1]), /* Address */ 4406b8e80941Smrg tmp, /* Data */ 4407b8e80941Smrg brw_imm_ud(nir_src_bit_size(instr->src[0]))); 4408b8e80941Smrg } 4409b8e80941Smrg break; 4410b8e80941Smrg 4411b8e80941Smrg case nir_intrinsic_global_atomic_add: 4412b8e80941Smrg nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr); 4413b8e80941Smrg break; 4414b8e80941Smrg case nir_intrinsic_global_atomic_imin: 4415b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr); 4416b8e80941Smrg break; 4417b8e80941Smrg case nir_intrinsic_global_atomic_umin: 4418b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr); 4419b8e80941Smrg break; 4420b8e80941Smrg case nir_intrinsic_global_atomic_imax: 4421b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr); 4422b8e80941Smrg break; 4423b8e80941Smrg case nir_intrinsic_global_atomic_umax: 4424b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr); 4425b8e80941Smrg break; 4426b8e80941Smrg case nir_intrinsic_global_atomic_and: 4427b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_AND, instr); 4428b8e80941Smrg break; 4429b8e80941Smrg case nir_intrinsic_global_atomic_or: 4430b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_OR, instr); 4431b8e80941Smrg break; 4432b8e80941Smrg case nir_intrinsic_global_atomic_xor: 4433b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_XOR, instr); 4434b8e80941Smrg break; 4435b8e80941Smrg case nir_intrinsic_global_atomic_exchange: 4436b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_MOV, instr); 4437b8e80941Smrg break; 4438b8e80941Smrg case nir_intrinsic_global_atomic_comp_swap: 4439b8e80941Smrg nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr); 4440b8e80941Smrg break; 4441b8e80941Smrg case nir_intrinsic_global_atomic_fmin: 4442b8e80941Smrg nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr); 4443b8e80941Smrg break; 4444b8e80941Smrg case nir_intrinsic_global_atomic_fmax: 4445b8e80941Smrg nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr); 4446b8e80941Smrg break; 4447b8e80941Smrg case nir_intrinsic_global_atomic_fcomp_swap: 4448b8e80941Smrg nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr); 4449b8e80941Smrg break; 4450b8e80941Smrg 4451b8e80941Smrg case nir_intrinsic_load_ssbo: { 4452b8e80941Smrg assert(devinfo->gen >= 7); 4453b8e80941Smrg 4454b8e80941Smrg const unsigned bit_size = nir_dest_bit_size(instr->dest); 4455b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4456b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4457b8e80941Smrg get_nir_ssbo_intrinsic_index(bld, instr); 4458b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4459b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4460b8e80941Smrg 4461b8e80941Smrg /* Make dest unsigned because that's what the temporary will be */ 4462b8e80941Smrg dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4463b8e80941Smrg 4464b8e80941Smrg /* Read the vector */ 4465b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 4466b8e80941Smrg assert(nir_dest_bit_size(instr->dest) == 32); 4467b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4468b8e80941Smrg fs_inst *inst = 4469b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4470b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4471b8e80941Smrg inst->size_written = instr->num_components * dispatch_width * 4; 4472b8e80941Smrg } else { 4473b8e80941Smrg assert(nir_dest_bit_size(instr->dest) <= 32); 4474b8e80941Smrg assert(nir_dest_num_components(instr->dest) == 1); 4475b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4476b8e80941Smrg 4477b8e80941Smrg fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 4478b8e80941Smrg bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 4479b8e80941Smrg read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 4480b8e80941Smrg bld.MOV(dest, read_result); 4481b8e80941Smrg } 4482b8e80941Smrg break; 4483b8e80941Smrg } 4484b8e80941Smrg 4485b8e80941Smrg case nir_intrinsic_store_ssbo: { 4486b8e80941Smrg assert(devinfo->gen >= 7); 4487b8e80941Smrg 4488b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 4489b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 4490b8e80941Smrg 4491b8e80941Smrg const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4492b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4493b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4494b8e80941Smrg get_nir_ssbo_intrinsic_index(bld, instr); 4495b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); 4496b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4497b8e80941Smrg 4498b8e80941Smrg fs_reg data = get_nir_src(instr->src[0]); 4499b8e80941Smrg data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4500b8e80941Smrg 4501b8e80941Smrg assert(nir_intrinsic_write_mask(instr) == 4502b8e80941Smrg (1u << instr->num_components) - 1); 4503b8e80941Smrg if (nir_intrinsic_align(instr) >= 4) { 4504b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) == 32); 4505b8e80941Smrg assert(nir_src_num_components(instr->src[0]) <= 4); 4506b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4507b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4508b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4509b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4510b8e80941Smrg } else { 4511b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) <= 32); 4512b8e80941Smrg assert(nir_src_num_components(instr->src[0]) == 1); 4513b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4514b8e80941Smrg 4515b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 4516b8e80941Smrg bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 4517b8e80941Smrg 4518b8e80941Smrg bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 4519b8e80941Smrg fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4520b8e80941Smrg } 4521b8e80941Smrg break; 4522b8e80941Smrg } 4523b8e80941Smrg 4524b8e80941Smrg case nir_intrinsic_store_output: { 4525b8e80941Smrg fs_reg src = get_nir_src(instr->src[0]); 4526b8e80941Smrg 4527b8e80941Smrg unsigned store_offset = nir_src_as_uint(instr->src[1]); 4528b8e80941Smrg unsigned num_components = instr->num_components; 4529b8e80941Smrg unsigned first_component = nir_intrinsic_component(instr); 4530b8e80941Smrg if (nir_src_bit_size(instr->src[0]) == 64) { 4531b8e80941Smrg src = shuffle_for_32bit_write(bld, src, 0, num_components); 4532b8e80941Smrg num_components *= 2; 4533b8e80941Smrg } 4534b8e80941Smrg 4535b8e80941Smrg fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 4536b8e80941Smrg 4 * store_offset), src.type); 4537b8e80941Smrg for (unsigned j = 0; j < num_components; j++) { 4538b8e80941Smrg bld.MOV(offset(new_dest, bld, j + first_component), 4539b8e80941Smrg offset(src, bld, j)); 4540b8e80941Smrg } 4541b8e80941Smrg break; 4542b8e80941Smrg } 4543b8e80941Smrg 4544b8e80941Smrg case nir_intrinsic_ssbo_atomic_add: 4545b8e80941Smrg nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr); 4546b8e80941Smrg break; 4547b8e80941Smrg case nir_intrinsic_ssbo_atomic_imin: 4548b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); 4549b8e80941Smrg break; 4550b8e80941Smrg case nir_intrinsic_ssbo_atomic_umin: 4551b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); 4552b8e80941Smrg break; 4553b8e80941Smrg case nir_intrinsic_ssbo_atomic_imax: 4554b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); 4555b8e80941Smrg break; 4556b8e80941Smrg case nir_intrinsic_ssbo_atomic_umax: 4557b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); 4558b8e80941Smrg break; 4559b8e80941Smrg case nir_intrinsic_ssbo_atomic_and: 4560b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); 4561b8e80941Smrg break; 4562b8e80941Smrg case nir_intrinsic_ssbo_atomic_or: 4563b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr); 4564b8e80941Smrg break; 4565b8e80941Smrg case nir_intrinsic_ssbo_atomic_xor: 4566b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr); 4567b8e80941Smrg break; 4568b8e80941Smrg case nir_intrinsic_ssbo_atomic_exchange: 4569b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr); 4570b8e80941Smrg break; 4571b8e80941Smrg case nir_intrinsic_ssbo_atomic_comp_swap: 4572b8e80941Smrg nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr); 4573b8e80941Smrg break; 4574b8e80941Smrg case nir_intrinsic_ssbo_atomic_fmin: 4575b8e80941Smrg nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr); 4576b8e80941Smrg break; 4577b8e80941Smrg case nir_intrinsic_ssbo_atomic_fmax: 4578b8e80941Smrg nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr); 4579b8e80941Smrg break; 4580b8e80941Smrg case nir_intrinsic_ssbo_atomic_fcomp_swap: 4581b8e80941Smrg nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr); 4582b8e80941Smrg break; 4583b8e80941Smrg 4584b8e80941Smrg case nir_intrinsic_get_buffer_size: { 4585b8e80941Smrg assert(nir_src_num_components(instr->src[0]) == 1); 4586b8e80941Smrg unsigned ssbo_index = nir_src_is_const(instr->src[0]) ? 4587b8e80941Smrg nir_src_as_uint(instr->src[0]) : 0; 4588b8e80941Smrg 4589b8e80941Smrg /* A resinfo's sampler message is used to get the buffer size. The 4590b8e80941Smrg * SIMD8's writeback message consists of four registers and SIMD16's 4591b8e80941Smrg * writeback message consists of 8 destination registers (two per each 4592b8e80941Smrg * component). Because we are only interested on the first channel of 4593b8e80941Smrg * the first returned component, where resinfo returns the buffer size 4594b8e80941Smrg * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of 4595b8e80941Smrg * the dispatch width. 4596b8e80941Smrg */ 4597b8e80941Smrg const fs_builder ubld = bld.exec_all().group(8, 0); 4598b8e80941Smrg fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4599b8e80941Smrg fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4600b8e80941Smrg 4601b8e80941Smrg /* Set LOD = 0 */ 4602b8e80941Smrg ubld.MOV(src_payload, brw_imm_d(0)); 4603b8e80941Smrg 4604b8e80941Smrg const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index; 4605b8e80941Smrg fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, 4606b8e80941Smrg src_payload, brw_imm_ud(index)); 4607b8e80941Smrg inst->header_size = 0; 4608b8e80941Smrg inst->mlen = 1; 4609b8e80941Smrg inst->size_written = 4 * REG_SIZE; 4610b8e80941Smrg 4611b8e80941Smrg /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: 4612b8e80941Smrg * 4613b8e80941Smrg * "Out-of-bounds checking is always performed at a DWord granularity. If 4614b8e80941Smrg * any part of the DWord is out-of-bounds then the whole DWord is 4615b8e80941Smrg * considered out-of-bounds." 4616b8e80941Smrg * 4617b8e80941Smrg * This implies that types with size smaller than 4-bytes need to be 4618b8e80941Smrg * padded if they don't complete the last dword of the buffer. But as we 4619b8e80941Smrg * need to maintain the original size we need to reverse the padding 4620b8e80941Smrg * calculation to return the correct size to know the number of elements 4621b8e80941Smrg * of an unsized array. As we stored in the last two bits of the surface 4622b8e80941Smrg * size the needed padding for the buffer, we calculate here the 4623b8e80941Smrg * original buffer_size reversing the surface_size calculation: 4624b8e80941Smrg * 4625b8e80941Smrg * surface_size = isl_align(buffer_size, 4) + 4626b8e80941Smrg * (isl_align(buffer_size) - buffer_size) 4627b8e80941Smrg * 4628b8e80941Smrg * buffer_size = surface_size & ~3 - surface_size & 3 4629b8e80941Smrg */ 4630b8e80941Smrg 4631b8e80941Smrg fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4632b8e80941Smrg fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4633b8e80941Smrg fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4634b8e80941Smrg 4635b8e80941Smrg ubld.AND(size_padding, ret_payload, brw_imm_ud(3)); 4636b8e80941Smrg ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3)); 4637b8e80941Smrg ubld.ADD(buffer_size, size_aligned4, negate(size_padding)); 4638b8e80941Smrg 4639b8e80941Smrg bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0)); 4640b8e80941Smrg break; 4641b8e80941Smrg } 4642b8e80941Smrg 4643b8e80941Smrg case nir_intrinsic_load_subgroup_invocation: 4644b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 4645b8e80941Smrg nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]); 4646b8e80941Smrg break; 4647b8e80941Smrg 4648b8e80941Smrg case nir_intrinsic_load_subgroup_eq_mask: 4649b8e80941Smrg case nir_intrinsic_load_subgroup_ge_mask: 4650b8e80941Smrg case nir_intrinsic_load_subgroup_gt_mask: 4651b8e80941Smrg case nir_intrinsic_load_subgroup_le_mask: 4652b8e80941Smrg case nir_intrinsic_load_subgroup_lt_mask: 4653b8e80941Smrg unreachable("not reached"); 4654b8e80941Smrg 4655b8e80941Smrg case nir_intrinsic_vote_any: { 4656b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 4657b8e80941Smrg 4658b8e80941Smrg /* The any/all predicates do not consider channel enables. To prevent 4659b8e80941Smrg * dead channels from affecting the result, we initialize the flag with 4660b8e80941Smrg * with the identity value for the logical operation. 4661b8e80941Smrg */ 4662b8e80941Smrg if (dispatch_width == 32) { 4663b8e80941Smrg /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 4664b8e80941Smrg ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 4665b8e80941Smrg brw_imm_ud(0)); 4666b8e80941Smrg } else { 4667b8e80941Smrg ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); 4668b8e80941Smrg } 4669b8e80941Smrg bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 4670b8e80941Smrg 4671b8e80941Smrg /* For some reason, the any/all predicates don't work properly with 4672b8e80941Smrg * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 4673b8e80941Smrg * doesn't read the correct subset of the flag register and you end up 4674b8e80941Smrg * getting garbage in the second half. Work around this by using a pair 4675b8e80941Smrg * of 1-wide MOVs and scattering the result. 4676b8e80941Smrg */ 4677b8e80941Smrg fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 4678b8e80941Smrg ubld.MOV(res1, brw_imm_d(0)); 4679b8e80941Smrg set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H : 4680b8e80941Smrg dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H : 4681b8e80941Smrg BRW_PREDICATE_ALIGN1_ANY32H, 4682b8e80941Smrg ubld.MOV(res1, brw_imm_d(-1))); 4683b8e80941Smrg 4684b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 4685b8e80941Smrg break; 4686b8e80941Smrg } 4687b8e80941Smrg case nir_intrinsic_vote_all: { 4688b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 4689b8e80941Smrg 4690b8e80941Smrg /* The any/all predicates do not consider channel enables. To prevent 4691b8e80941Smrg * dead channels from affecting the result, we initialize the flag with 4692b8e80941Smrg * with the identity value for the logical operation. 4693b8e80941Smrg */ 4694b8e80941Smrg if (dispatch_width == 32) { 4695b8e80941Smrg /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 4696b8e80941Smrg ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 4697b8e80941Smrg brw_imm_ud(0xffffffff)); 4698b8e80941Smrg } else { 4699b8e80941Smrg ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 4700b8e80941Smrg } 4701b8e80941Smrg bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); 4702b8e80941Smrg 4703b8e80941Smrg /* For some reason, the any/all predicates don't work properly with 4704b8e80941Smrg * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 4705b8e80941Smrg * doesn't read the correct subset of the flag register and you end up 4706b8e80941Smrg * getting garbage in the second half. Work around this by using a pair 4707b8e80941Smrg * of 1-wide MOVs and scattering the result. 4708b8e80941Smrg */ 4709b8e80941Smrg fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 4710b8e80941Smrg ubld.MOV(res1, brw_imm_d(0)); 4711b8e80941Smrg set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 4712b8e80941Smrg dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 4713b8e80941Smrg BRW_PREDICATE_ALIGN1_ALL32H, 4714b8e80941Smrg ubld.MOV(res1, brw_imm_d(-1))); 4715b8e80941Smrg 4716b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 4717b8e80941Smrg break; 4718b8e80941Smrg } 4719b8e80941Smrg case nir_intrinsic_vote_feq: 4720b8e80941Smrg case nir_intrinsic_vote_ieq: { 4721b8e80941Smrg fs_reg value = get_nir_src(instr->src[0]); 4722b8e80941Smrg if (instr->intrinsic == nir_intrinsic_vote_feq) { 4723b8e80941Smrg const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4724b8e80941Smrg value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B : 4725b8e80941Smrg brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); 4726b8e80941Smrg } 4727b8e80941Smrg 4728b8e80941Smrg fs_reg uniformized = bld.emit_uniformize(value); 4729b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 4730b8e80941Smrg 4731b8e80941Smrg /* The any/all predicates do not consider channel enables. To prevent 4732b8e80941Smrg * dead channels from affecting the result, we initialize the flag with 4733b8e80941Smrg * with the identity value for the logical operation. 4734b8e80941Smrg */ 4735b8e80941Smrg if (dispatch_width == 32) { 4736b8e80941Smrg /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */ 4737b8e80941Smrg ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD), 4738b8e80941Smrg brw_imm_ud(0xffffffff)); 4739b8e80941Smrg } else { 4740b8e80941Smrg ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); 4741b8e80941Smrg } 4742b8e80941Smrg bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); 4743b8e80941Smrg 4744b8e80941Smrg /* For some reason, the any/all predicates don't work properly with 4745b8e80941Smrg * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H 4746b8e80941Smrg * doesn't read the correct subset of the flag register and you end up 4747b8e80941Smrg * getting garbage in the second half. Work around this by using a pair 4748b8e80941Smrg * of 1-wide MOVs and scattering the result. 4749b8e80941Smrg */ 4750b8e80941Smrg fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D); 4751b8e80941Smrg ubld.MOV(res1, brw_imm_d(0)); 4752b8e80941Smrg set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H : 4753b8e80941Smrg dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H : 4754b8e80941Smrg BRW_PREDICATE_ALIGN1_ALL32H, 4755b8e80941Smrg ubld.MOV(res1, brw_imm_d(-1))); 4756b8e80941Smrg 4757b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 4758b8e80941Smrg break; 4759b8e80941Smrg } 4760b8e80941Smrg 4761b8e80941Smrg case nir_intrinsic_ballot: { 4762b8e80941Smrg const fs_reg value = retype(get_nir_src(instr->src[0]), 4763b8e80941Smrg BRW_REGISTER_TYPE_UD); 4764b8e80941Smrg struct brw_reg flag = brw_flag_reg(0, 0); 4765b8e80941Smrg /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well 4766b8e80941Smrg * as f0.0. This is a problem for fragment programs as we currently use 4767b8e80941Smrg * f0.1 for discards. Fortunately, we don't support SIMD32 fragment 4768b8e80941Smrg * programs yet so this isn't a problem. When we do, something will 4769b8e80941Smrg * have to change. 4770b8e80941Smrg */ 4771b8e80941Smrg if (dispatch_width == 32) 4772b8e80941Smrg flag.type = BRW_REGISTER_TYPE_UD; 4773b8e80941Smrg 4774b8e80941Smrg bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 4775b8e80941Smrg bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 4776b8e80941Smrg 4777b8e80941Smrg if (instr->dest.ssa.bit_size > 32) { 4778b8e80941Smrg dest.type = BRW_REGISTER_TYPE_UQ; 4779b8e80941Smrg } else { 4780b8e80941Smrg dest.type = BRW_REGISTER_TYPE_UD; 4781b8e80941Smrg } 4782b8e80941Smrg bld.MOV(dest, flag); 4783b8e80941Smrg break; 4784b8e80941Smrg } 4785b8e80941Smrg 4786b8e80941Smrg case nir_intrinsic_read_invocation: { 4787b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4788b8e80941Smrg const fs_reg invocation = get_nir_src(instr->src[1]); 4789b8e80941Smrg fs_reg tmp = bld.vgrf(value.type); 4790b8e80941Smrg 4791b8e80941Smrg bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 4792b8e80941Smrg bld.emit_uniformize(invocation)); 4793b8e80941Smrg 4794b8e80941Smrg bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 4795b8e80941Smrg break; 4796b8e80941Smrg } 4797b8e80941Smrg 4798b8e80941Smrg case nir_intrinsic_read_first_invocation: { 4799b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4800b8e80941Smrg bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 4801b8e80941Smrg break; 4802b8e80941Smrg } 4803b8e80941Smrg 4804b8e80941Smrg case nir_intrinsic_shuffle: { 4805b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4806b8e80941Smrg const fs_reg index = get_nir_src(instr->src[1]); 4807b8e80941Smrg 4808b8e80941Smrg bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 4809b8e80941Smrg break; 4810b8e80941Smrg } 4811b8e80941Smrg 4812b8e80941Smrg case nir_intrinsic_first_invocation: { 4813b8e80941Smrg fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4814b8e80941Smrg bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 4815b8e80941Smrg bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 4816b8e80941Smrg fs_reg(component(tmp, 0))); 4817b8e80941Smrg break; 4818b8e80941Smrg } 4819b8e80941Smrg 4820b8e80941Smrg case nir_intrinsic_quad_broadcast: { 4821b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4822b8e80941Smrg const unsigned index = nir_src_as_uint(instr->src[1]); 4823b8e80941Smrg 4824b8e80941Smrg bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 4825b8e80941Smrg value, brw_imm_ud(index), brw_imm_ud(4)); 4826b8e80941Smrg break; 4827b8e80941Smrg } 4828b8e80941Smrg 4829b8e80941Smrg case nir_intrinsic_quad_swap_horizontal: { 4830b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4831b8e80941Smrg const fs_reg tmp = bld.vgrf(value.type); 4832b8e80941Smrg if (devinfo->gen <= 7) { 4833b8e80941Smrg /* The hardware doesn't seem to support these crazy regions with 4834b8e80941Smrg * compressed instructions on gen7 and earlier so we fall back to 4835b8e80941Smrg * using quad swizzles. Fortunately, we don't support 64-bit 4836b8e80941Smrg * anything in Vulkan on gen7. 4837b8e80941Smrg */ 4838b8e80941Smrg assert(nir_src_bit_size(instr->src[0]) == 32); 4839b8e80941Smrg const fs_builder ubld = bld.exec_all(); 4840b8e80941Smrg ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4841b8e80941Smrg brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); 4842b8e80941Smrg bld.MOV(retype(dest, value.type), tmp); 4843b8e80941Smrg } else { 4844b8e80941Smrg const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); 4845b8e80941Smrg 4846b8e80941Smrg const fs_reg src_left = horiz_stride(value, 2); 4847b8e80941Smrg const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); 4848b8e80941Smrg const fs_reg tmp_left = horiz_stride(tmp, 2); 4849b8e80941Smrg const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); 4850b8e80941Smrg 4851b8e80941Smrg ubld.MOV(tmp_left, src_right); 4852b8e80941Smrg ubld.MOV(tmp_right, src_left); 4853b8e80941Smrg 4854b8e80941Smrg } 4855b8e80941Smrg bld.MOV(retype(dest, value.type), tmp); 4856b8e80941Smrg break; 4857b8e80941Smrg } 4858b8e80941Smrg 4859b8e80941Smrg case nir_intrinsic_quad_swap_vertical: { 4860b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4861b8e80941Smrg if (nir_src_bit_size(instr->src[0]) == 32) { 4862b8e80941Smrg /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4863b8e80941Smrg const fs_reg tmp = bld.vgrf(value.type); 4864b8e80941Smrg const fs_builder ubld = bld.exec_all(); 4865b8e80941Smrg ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4866b8e80941Smrg brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); 4867b8e80941Smrg bld.MOV(retype(dest, value.type), tmp); 4868b8e80941Smrg } else { 4869b8e80941Smrg /* For larger data types, we have to either emit dispatch_width many 4870b8e80941Smrg * MOVs or else fall back to doing indirects. 4871b8e80941Smrg */ 4872b8e80941Smrg fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4873b8e80941Smrg bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4874b8e80941Smrg brw_imm_w(0x2)); 4875b8e80941Smrg bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4876b8e80941Smrg } 4877b8e80941Smrg break; 4878b8e80941Smrg } 4879b8e80941Smrg 4880b8e80941Smrg case nir_intrinsic_quad_swap_diagonal: { 4881b8e80941Smrg const fs_reg value = get_nir_src(instr->src[0]); 4882b8e80941Smrg if (nir_src_bit_size(instr->src[0]) == 32) { 4883b8e80941Smrg /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4884b8e80941Smrg const fs_reg tmp = bld.vgrf(value.type); 4885b8e80941Smrg const fs_builder ubld = bld.exec_all(); 4886b8e80941Smrg ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4887b8e80941Smrg brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); 4888b8e80941Smrg bld.MOV(retype(dest, value.type), tmp); 4889b8e80941Smrg } else { 4890b8e80941Smrg /* For larger data types, we have to either emit dispatch_width many 4891b8e80941Smrg * MOVs or else fall back to doing indirects. 4892b8e80941Smrg */ 4893b8e80941Smrg fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4894b8e80941Smrg bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4895b8e80941Smrg brw_imm_w(0x3)); 4896b8e80941Smrg bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4897b8e80941Smrg } 4898b8e80941Smrg break; 4899b8e80941Smrg } 4900b8e80941Smrg 4901b8e80941Smrg case nir_intrinsic_reduce: { 4902b8e80941Smrg fs_reg src = get_nir_src(instr->src[0]); 4903b8e80941Smrg nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4904b8e80941Smrg unsigned cluster_size = nir_intrinsic_cluster_size(instr); 4905b8e80941Smrg if (cluster_size == 0 || cluster_size > dispatch_width) 4906b8e80941Smrg cluster_size = dispatch_width; 4907b8e80941Smrg 4908b8e80941Smrg /* Figure out the source type */ 4909b8e80941Smrg src.type = brw_type_for_nir_type(devinfo, 4910b8e80941Smrg (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4911b8e80941Smrg nir_src_bit_size(instr->src[0]))); 4912b8e80941Smrg 4913b8e80941Smrg fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4914b8e80941Smrg opcode brw_op = brw_op_for_nir_reduction_op(redop); 4915b8e80941Smrg brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4916b8e80941Smrg 4917b8e80941Smrg /* Set up a register for all of our scratching around and initialize it 4918b8e80941Smrg * to reduction operation's identity value. 4919b8e80941Smrg */ 4920b8e80941Smrg fs_reg scan = bld.vgrf(src.type); 4921b8e80941Smrg bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4922b8e80941Smrg 4923b8e80941Smrg bld.emit_scan(brw_op, scan, cluster_size, cond_mod); 4924b8e80941Smrg 4925b8e80941Smrg dest.type = src.type; 4926b8e80941Smrg if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { 4927b8e80941Smrg /* In this case, CLUSTER_BROADCAST instruction isn't needed because 4928b8e80941Smrg * the distance between clusters is at least 2 GRFs. In this case, 4929b8e80941Smrg * we don't need the weird striding of the CLUSTER_BROADCAST 4930b8e80941Smrg * instruction and can just do regular MOVs. 4931b8e80941Smrg */ 4932b8e80941Smrg assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); 4933b8e80941Smrg const unsigned groups = 4934b8e80941Smrg (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); 4935b8e80941Smrg const unsigned group_size = dispatch_width / groups; 4936b8e80941Smrg for (unsigned i = 0; i < groups; i++) { 4937b8e80941Smrg const unsigned cluster = (i * group_size) / cluster_size; 4938b8e80941Smrg const unsigned comp = cluster * cluster_size + (cluster_size - 1); 4939b8e80941Smrg bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), 4940b8e80941Smrg component(scan, comp)); 4941b8e80941Smrg } 4942b8e80941Smrg } else { 4943b8e80941Smrg bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, 4944b8e80941Smrg brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); 4945b8e80941Smrg } 4946b8e80941Smrg break; 4947b8e80941Smrg } 4948b8e80941Smrg 4949b8e80941Smrg case nir_intrinsic_inclusive_scan: 4950b8e80941Smrg case nir_intrinsic_exclusive_scan: { 4951b8e80941Smrg fs_reg src = get_nir_src(instr->src[0]); 4952b8e80941Smrg nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4953b8e80941Smrg 4954b8e80941Smrg /* Figure out the source type */ 4955b8e80941Smrg src.type = brw_type_for_nir_type(devinfo, 4956b8e80941Smrg (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4957b8e80941Smrg nir_src_bit_size(instr->src[0]))); 4958b8e80941Smrg 4959b8e80941Smrg fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4960b8e80941Smrg opcode brw_op = brw_op_for_nir_reduction_op(redop); 4961b8e80941Smrg brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4962b8e80941Smrg 4963b8e80941Smrg /* Set up a register for all of our scratching around and initialize it 4964b8e80941Smrg * to reduction operation's identity value. 4965b8e80941Smrg */ 4966b8e80941Smrg fs_reg scan = bld.vgrf(src.type); 4967b8e80941Smrg const fs_builder allbld = bld.exec_all(); 4968b8e80941Smrg allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4969b8e80941Smrg 4970b8e80941Smrg if (instr->intrinsic == nir_intrinsic_exclusive_scan) { 4971b8e80941Smrg /* Exclusive scan is a bit harder because we have to do an annoying 4972b8e80941Smrg * shift of the contents before we can begin. To make things worse, 4973b8e80941Smrg * we can't do this with a normal stride; we have to use indirects. 4974b8e80941Smrg */ 4975b8e80941Smrg fs_reg shifted = bld.vgrf(src.type); 4976b8e80941Smrg fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4977b8e80941Smrg allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4978b8e80941Smrg brw_imm_w(-1)); 4979b8e80941Smrg allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); 4980b8e80941Smrg allbld.group(1, 0).MOV(component(shifted, 0), identity); 4981b8e80941Smrg scan = shifted; 4982b8e80941Smrg } 4983b8e80941Smrg 4984b8e80941Smrg bld.emit_scan(brw_op, scan, dispatch_width, cond_mod); 4985b8e80941Smrg 4986b8e80941Smrg bld.MOV(retype(dest, src.type), scan); 4987b8e80941Smrg break; 4988b8e80941Smrg } 4989b8e80941Smrg 4990b8e80941Smrg case nir_intrinsic_begin_invocation_interlock: { 4991b8e80941Smrg const fs_builder ubld = bld.group(8, 0); 4992b8e80941Smrg const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4993b8e80941Smrg 4994b8e80941Smrg ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0)) 4995b8e80941Smrg ->size_written = 2 * REG_SIZE; 4996b8e80941Smrg break; 4997b8e80941Smrg } 4998b8e80941Smrg 4999b8e80941Smrg case nir_intrinsic_end_invocation_interlock: { 5000b8e80941Smrg /* For endInvocationInterlock(), we need to insert a memory fence which 5001b8e80941Smrg * stalls in the shader until the memory transactions prior to that 5002b8e80941Smrg * fence are complete. This ensures that the shader does not end before 5003b8e80941Smrg * any writes from its critical section have landed. Otherwise, you can 5004b8e80941Smrg * end up with a case where the next invocation on that pixel properly 5005b8e80941Smrg * stalls for previous FS invocation on its pixel to complete but 5006b8e80941Smrg * doesn't actually wait for the dataport memory transactions from that 5007b8e80941Smrg * thread to land before submitting its own. 5008b8e80941Smrg */ 5009b8e80941Smrg const fs_builder ubld = bld.group(8, 0); 5010b8e80941Smrg const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 5011b8e80941Smrg ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, 5012b8e80941Smrg brw_vec8_grf(0, 0), brw_imm_ud(1)) 5013b8e80941Smrg ->size_written = 2 * REG_SIZE; 5014b8e80941Smrg break; 5015b8e80941Smrg } 5016b8e80941Smrg 5017b8e80941Smrg default: 5018b8e80941Smrg unreachable("unknown intrinsic"); 5019b8e80941Smrg } 5020b8e80941Smrg} 5021b8e80941Smrg 5022b8e80941Smrgvoid 5023b8e80941Smrgfs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 5024b8e80941Smrg int op, nir_intrinsic_instr *instr) 5025b8e80941Smrg{ 5026b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 5027b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 5028b8e80941Smrg 5029b8e80941Smrg /* The BTI untyped atomic messages only support 32-bit atomics. If you 5030b8e80941Smrg * just look at the big table of messages in the Vol 7 of the SKL PRM, they 5031b8e80941Smrg * appear to exist. However, if you look at Vol 2a, there are no message 5032b8e80941Smrg * descriptors provided for Qword atomic ops except for A64 messages. 5033b8e80941Smrg */ 5034b8e80941Smrg assert(nir_dest_bit_size(instr->dest) == 32); 5035b8e80941Smrg 5036b8e80941Smrg fs_reg dest; 5037b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5038b8e80941Smrg dest = get_nir_dest(instr->dest); 5039b8e80941Smrg 5040b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5041b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5042b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5043b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5044b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5045b8e80941Smrg 5046b8e80941Smrg fs_reg data; 5047b8e80941Smrg if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5048b8e80941Smrg data = get_nir_src(instr->src[2]); 5049b8e80941Smrg 5050b8e80941Smrg if (op == BRW_AOP_CMPWR) { 5051b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5052b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5053b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5054b8e80941Smrg data = tmp; 5055b8e80941Smrg } 5056b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5057b8e80941Smrg 5058b8e80941Smrg /* Emit the actual atomic operation */ 5059b8e80941Smrg 5060b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 5061b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5062b8e80941Smrg} 5063b8e80941Smrg 5064b8e80941Smrgvoid 5065b8e80941Smrgfs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 5066b8e80941Smrg int op, nir_intrinsic_instr *instr) 5067b8e80941Smrg{ 5068b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 5069b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 5070b8e80941Smrg 5071b8e80941Smrg fs_reg dest; 5072b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5073b8e80941Smrg dest = get_nir_dest(instr->dest); 5074b8e80941Smrg 5075b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5076b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5077b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5078b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5079b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5080b8e80941Smrg 5081b8e80941Smrg fs_reg data = get_nir_src(instr->src[2]); 5082b8e80941Smrg if (op == BRW_AOP_FCMPWR) { 5083b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5084b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5085b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5086b8e80941Smrg data = tmp; 5087b8e80941Smrg } 5088b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5089b8e80941Smrg 5090b8e80941Smrg /* Emit the actual atomic operation */ 5091b8e80941Smrg 5092b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 5093b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5094b8e80941Smrg} 5095b8e80941Smrg 5096b8e80941Smrgvoid 5097b8e80941Smrgfs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 5098b8e80941Smrg int op, nir_intrinsic_instr *instr) 5099b8e80941Smrg{ 5100b8e80941Smrg fs_reg dest; 5101b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5102b8e80941Smrg dest = get_nir_dest(instr->dest); 5103b8e80941Smrg 5104b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5105b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 5106b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5107b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5108b8e80941Smrg 5109b8e80941Smrg fs_reg data; 5110b8e80941Smrg if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5111b8e80941Smrg data = get_nir_src(instr->src[1]); 5112b8e80941Smrg if (op == BRW_AOP_CMPWR) { 5113b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5114b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 5115b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5116b8e80941Smrg data = tmp; 5117b8e80941Smrg } 5118b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5119b8e80941Smrg 5120b8e80941Smrg /* Get the offset */ 5121b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 5122b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5123b8e80941Smrg brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 5124b8e80941Smrg } else { 5125b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 5126b8e80941Smrg bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 5127b8e80941Smrg retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 5128b8e80941Smrg brw_imm_ud(instr->const_index[0])); 5129b8e80941Smrg } 5130b8e80941Smrg 5131b8e80941Smrg /* Emit the actual atomic operation operation */ 5132b8e80941Smrg 5133b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 5134b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5135b8e80941Smrg} 5136b8e80941Smrg 5137b8e80941Smrgvoid 5138b8e80941Smrgfs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, 5139b8e80941Smrg int op, nir_intrinsic_instr *instr) 5140b8e80941Smrg{ 5141b8e80941Smrg fs_reg dest; 5142b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5143b8e80941Smrg dest = get_nir_dest(instr->dest); 5144b8e80941Smrg 5145b8e80941Smrg fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5146b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 5147b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5148b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5149b8e80941Smrg 5150b8e80941Smrg fs_reg data = get_nir_src(instr->src[1]); 5151b8e80941Smrg if (op == BRW_AOP_FCMPWR) { 5152b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5153b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 5154b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5155b8e80941Smrg data = tmp; 5156b8e80941Smrg } 5157b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5158b8e80941Smrg 5159b8e80941Smrg /* Get the offset */ 5160b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 5161b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5162b8e80941Smrg brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 5163b8e80941Smrg } else { 5164b8e80941Smrg srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 5165b8e80941Smrg bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 5166b8e80941Smrg retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 5167b8e80941Smrg brw_imm_ud(instr->const_index[0])); 5168b8e80941Smrg } 5169b8e80941Smrg 5170b8e80941Smrg /* Emit the actual atomic operation operation */ 5171b8e80941Smrg 5172b8e80941Smrg bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 5173b8e80941Smrg dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5174b8e80941Smrg} 5175b8e80941Smrg 5176b8e80941Smrgvoid 5177b8e80941Smrgfs_visitor::nir_emit_global_atomic(const fs_builder &bld, 5178b8e80941Smrg int op, nir_intrinsic_instr *instr) 5179b8e80941Smrg{ 5180b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 5181b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 5182b8e80941Smrg 5183b8e80941Smrg fs_reg dest; 5184b8e80941Smrg if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5185b8e80941Smrg dest = get_nir_dest(instr->dest); 5186b8e80941Smrg 5187b8e80941Smrg fs_reg addr = get_nir_src(instr->src[0]); 5188b8e80941Smrg 5189b8e80941Smrg fs_reg data; 5190b8e80941Smrg if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5191b8e80941Smrg data = get_nir_src(instr->src[1]); 5192b8e80941Smrg 5193b8e80941Smrg if (op == BRW_AOP_CMPWR) { 5194b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5195b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 5196b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5197b8e80941Smrg data = tmp; 5198b8e80941Smrg } 5199b8e80941Smrg 5200b8e80941Smrg if (nir_dest_bit_size(instr->dest) == 64) { 5201b8e80941Smrg bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL, 5202b8e80941Smrg dest, addr, data, brw_imm_ud(op)); 5203b8e80941Smrg } else { 5204b8e80941Smrg assert(nir_dest_bit_size(instr->dest) == 32); 5205b8e80941Smrg bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, 5206b8e80941Smrg dest, addr, data, brw_imm_ud(op)); 5207b8e80941Smrg } 5208b8e80941Smrg} 5209b8e80941Smrg 5210b8e80941Smrgvoid 5211b8e80941Smrgfs_visitor::nir_emit_global_atomic_float(const fs_builder &bld, 5212b8e80941Smrg int op, nir_intrinsic_instr *instr) 5213b8e80941Smrg{ 5214b8e80941Smrg if (stage == MESA_SHADER_FRAGMENT) 5215b8e80941Smrg brw_wm_prog_data(prog_data)->has_side_effects = true; 5216b8e80941Smrg 5217b8e80941Smrg assert(nir_intrinsic_infos[instr->intrinsic].has_dest); 5218b8e80941Smrg fs_reg dest = get_nir_dest(instr->dest); 5219b8e80941Smrg 5220b8e80941Smrg fs_reg addr = get_nir_src(instr->src[0]); 5221b8e80941Smrg 5222b8e80941Smrg assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC); 5223b8e80941Smrg fs_reg data = get_nir_src(instr->src[1]); 5224b8e80941Smrg 5225b8e80941Smrg if (op == BRW_AOP_FCMPWR) { 5226b8e80941Smrg fs_reg tmp = bld.vgrf(data.type, 2); 5227b8e80941Smrg fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 5228b8e80941Smrg bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5229b8e80941Smrg data = tmp; 5230b8e80941Smrg } 5231b8e80941Smrg 5232b8e80941Smrg bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, 5233b8e80941Smrg dest, addr, data, brw_imm_ud(op)); 5234b8e80941Smrg} 5235b8e80941Smrg 5236b8e80941Smrgvoid 5237b8e80941Smrgfs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 5238b8e80941Smrg{ 5239b8e80941Smrg unsigned texture = instr->texture_index; 5240b8e80941Smrg unsigned sampler = instr->sampler_index; 5241b8e80941Smrg 5242b8e80941Smrg fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 5243b8e80941Smrg 5244b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 5245b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 5246b8e80941Smrg 5247b8e80941Smrg int lod_components = 0; 5248b8e80941Smrg 5249b8e80941Smrg /* The hardware requires a LOD for buffer textures */ 5250b8e80941Smrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 5251b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 5252b8e80941Smrg 5253b8e80941Smrg uint32_t header_bits = 0; 5254b8e80941Smrg for (unsigned i = 0; i < instr->num_srcs; i++) { 5255b8e80941Smrg fs_reg src = get_nir_src(instr->src[i].src); 5256b8e80941Smrg switch (instr->src[i].src_type) { 5257b8e80941Smrg case nir_tex_src_bias: 5258b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = 5259b8e80941Smrg retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 5260b8e80941Smrg break; 5261b8e80941Smrg case nir_tex_src_comparator: 5262b8e80941Smrg srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 5263b8e80941Smrg break; 5264b8e80941Smrg case nir_tex_src_coord: 5265b8e80941Smrg switch (instr->op) { 5266b8e80941Smrg case nir_texop_txf: 5267b8e80941Smrg case nir_texop_txf_ms: 5268b8e80941Smrg case nir_texop_txf_ms_mcs: 5269b8e80941Smrg case nir_texop_samples_identical: 5270b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 5271b8e80941Smrg break; 5272b8e80941Smrg default: 5273b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 5274b8e80941Smrg break; 5275b8e80941Smrg } 5276b8e80941Smrg break; 5277b8e80941Smrg case nir_tex_src_ddx: 5278b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 5279b8e80941Smrg lod_components = nir_tex_instr_src_size(instr, i); 5280b8e80941Smrg break; 5281b8e80941Smrg case nir_tex_src_ddy: 5282b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 5283b8e80941Smrg break; 5284b8e80941Smrg case nir_tex_src_lod: 5285b8e80941Smrg switch (instr->op) { 5286b8e80941Smrg case nir_texop_txs: 5287b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = 5288b8e80941Smrg retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 5289b8e80941Smrg break; 5290b8e80941Smrg case nir_texop_txf: 5291b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = 5292b8e80941Smrg retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 5293b8e80941Smrg break; 5294b8e80941Smrg default: 5295b8e80941Smrg srcs[TEX_LOGICAL_SRC_LOD] = 5296b8e80941Smrg retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 5297b8e80941Smrg break; 5298b8e80941Smrg } 5299b8e80941Smrg break; 5300b8e80941Smrg case nir_tex_src_min_lod: 5301b8e80941Smrg srcs[TEX_LOGICAL_SRC_MIN_LOD] = 5302b8e80941Smrg retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 5303b8e80941Smrg break; 5304b8e80941Smrg case nir_tex_src_ms_index: 5305b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 5306b8e80941Smrg break; 5307b8e80941Smrg 5308b8e80941Smrg case nir_tex_src_offset: { 5309b8e80941Smrg uint32_t offset_bits = 0; 5310b8e80941Smrg if (brw_texture_offset(instr, i, &offset_bits)) { 5311b8e80941Smrg header_bits |= offset_bits; 5312b8e80941Smrg } else { 5313b8e80941Smrg srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 5314b8e80941Smrg retype(src, BRW_REGISTER_TYPE_D); 5315b8e80941Smrg } 5316b8e80941Smrg break; 5317b8e80941Smrg } 5318b8e80941Smrg 5319b8e80941Smrg case nir_tex_src_projector: 5320b8e80941Smrg unreachable("should be lowered"); 5321b8e80941Smrg 5322b8e80941Smrg case nir_tex_src_texture_offset: { 5323b8e80941Smrg /* Emit code to evaluate the actual indexing expression */ 5324b8e80941Smrg fs_reg tmp = vgrf(glsl_type::uint_type); 5325b8e80941Smrg bld.ADD(tmp, src, brw_imm_ud(texture)); 5326b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 5327b8e80941Smrg break; 5328b8e80941Smrg } 5329b8e80941Smrg 5330b8e80941Smrg case nir_tex_src_sampler_offset: { 5331b8e80941Smrg /* Emit code to evaluate the actual indexing expression */ 5332b8e80941Smrg fs_reg tmp = vgrf(glsl_type::uint_type); 5333b8e80941Smrg bld.ADD(tmp, src, brw_imm_ud(sampler)); 5334b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 5335b8e80941Smrg break; 5336b8e80941Smrg } 5337b8e80941Smrg 5338b8e80941Smrg case nir_tex_src_texture_handle: 5339b8e80941Smrg assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); 5340b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); 5341b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); 5342b8e80941Smrg break; 5343b8e80941Smrg 5344b8e80941Smrg case nir_tex_src_sampler_handle: 5345b8e80941Smrg assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); 5346b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); 5347b8e80941Smrg srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); 5348b8e80941Smrg break; 5349b8e80941Smrg 5350b8e80941Smrg case nir_tex_src_ms_mcs: 5351b8e80941Smrg assert(instr->op == nir_texop_txf_ms); 5352b8e80941Smrg srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 5353b8e80941Smrg break; 5354b8e80941Smrg 5355b8e80941Smrg case nir_tex_src_plane: { 5356b8e80941Smrg const uint32_t plane = nir_src_as_uint(instr->src[i].src); 5357b8e80941Smrg const uint32_t texture_index = 5358b8e80941Smrg instr->texture_index + 5359b8e80941Smrg stage_prog_data->binding_table.plane_start[plane] - 5360b8e80941Smrg stage_prog_data->binding_table.texture_start; 5361b8e80941Smrg 5362b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index); 5363b8e80941Smrg break; 5364b8e80941Smrg } 5365b8e80941Smrg 5366b8e80941Smrg default: 5367b8e80941Smrg unreachable("unknown texture source"); 5368b8e80941Smrg } 5369b8e80941Smrg } 5370b8e80941Smrg 5371b8e80941Smrg if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && 5372b8e80941Smrg (instr->op == nir_texop_txf_ms || 5373b8e80941Smrg instr->op == nir_texop_samples_identical)) { 5374b8e80941Smrg if (devinfo->gen >= 7 && 5375b8e80941Smrg key_tex->compressed_multisample_layout_mask & (1 << texture)) { 5376b8e80941Smrg srcs[TEX_LOGICAL_SRC_MCS] = 5377b8e80941Smrg emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], 5378b8e80941Smrg instr->coord_components, 5379b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE], 5380b8e80941Smrg srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); 5381b8e80941Smrg } else { 5382b8e80941Smrg srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); 5383b8e80941Smrg } 5384b8e80941Smrg } 5385b8e80941Smrg 5386b8e80941Smrg srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); 5387b8e80941Smrg srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); 5388b8e80941Smrg 5389b8e80941Smrg enum opcode opcode; 5390b8e80941Smrg switch (instr->op) { 5391b8e80941Smrg case nir_texop_tex: 5392b8e80941Smrg opcode = SHADER_OPCODE_TEX_LOGICAL; 5393b8e80941Smrg break; 5394b8e80941Smrg case nir_texop_txb: 5395b8e80941Smrg opcode = FS_OPCODE_TXB_LOGICAL; 5396b8e80941Smrg break; 5397b8e80941Smrg case nir_texop_txl: 5398b8e80941Smrg opcode = SHADER_OPCODE_TXL_LOGICAL; 5399b8e80941Smrg break; 5400b8e80941Smrg case nir_texop_txd: 5401b8e80941Smrg opcode = SHADER_OPCODE_TXD_LOGICAL; 5402b8e80941Smrg break; 5403b8e80941Smrg case nir_texop_txf: 5404b8e80941Smrg opcode = SHADER_OPCODE_TXF_LOGICAL; 5405b8e80941Smrg break; 5406b8e80941Smrg case nir_texop_txf_ms: 5407b8e80941Smrg if ((key_tex->msaa_16 & (1 << sampler))) 5408b8e80941Smrg opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 5409b8e80941Smrg else 5410b8e80941Smrg opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; 5411b8e80941Smrg break; 5412b8e80941Smrg case nir_texop_txf_ms_mcs: 5413b8e80941Smrg opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; 5414b8e80941Smrg break; 5415b8e80941Smrg case nir_texop_query_levels: 5416b8e80941Smrg case nir_texop_txs: 5417b8e80941Smrg opcode = SHADER_OPCODE_TXS_LOGICAL; 5418b8e80941Smrg break; 5419b8e80941Smrg case nir_texop_lod: 5420b8e80941Smrg opcode = SHADER_OPCODE_LOD_LOGICAL; 5421b8e80941Smrg break; 5422b8e80941Smrg case nir_texop_tg4: 5423b8e80941Smrg if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) 5424b8e80941Smrg opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; 5425b8e80941Smrg else 5426b8e80941Smrg opcode = SHADER_OPCODE_TG4_LOGICAL; 5427b8e80941Smrg break; 5428b8e80941Smrg case nir_texop_texture_samples: 5429b8e80941Smrg opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; 5430b8e80941Smrg break; 5431b8e80941Smrg case nir_texop_samples_identical: { 5432b8e80941Smrg fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); 5433b8e80941Smrg 5434b8e80941Smrg /* If mcs is an immediate value, it means there is no MCS. In that case 5435b8e80941Smrg * just return false. 5436b8e80941Smrg */ 5437b8e80941Smrg if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { 5438b8e80941Smrg bld.MOV(dst, brw_imm_ud(0u)); 5439b8e80941Smrg } else if ((key_tex->msaa_16 & (1 << sampler))) { 5440b8e80941Smrg fs_reg tmp = vgrf(glsl_type::uint_type); 5441b8e80941Smrg bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], 5442b8e80941Smrg offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); 5443b8e80941Smrg bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); 5444b8e80941Smrg } else { 5445b8e80941Smrg bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), 5446b8e80941Smrg BRW_CONDITIONAL_EQ); 5447b8e80941Smrg } 5448b8e80941Smrg return; 5449b8e80941Smrg } 5450b8e80941Smrg default: 5451b8e80941Smrg unreachable("unknown texture opcode"); 5452b8e80941Smrg } 5453b8e80941Smrg 5454b8e80941Smrg if (instr->op == nir_texop_tg4) { 5455b8e80941Smrg if (instr->component == 1 && 5456b8e80941Smrg key_tex->gather_channel_quirk_mask & (1 << texture)) { 5457b8e80941Smrg /* gather4 sampler is broken for green channel on RG32F -- 5458b8e80941Smrg * we must ask for blue instead. 5459b8e80941Smrg */ 5460b8e80941Smrg header_bits |= 2 << 16; 5461b8e80941Smrg } else { 5462b8e80941Smrg header_bits |= instr->component << 16; 5463b8e80941Smrg } 5464b8e80941Smrg } 5465b8e80941Smrg 5466b8e80941Smrg fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); 5467b8e80941Smrg fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 5468b8e80941Smrg inst->offset = header_bits; 5469b8e80941Smrg 5470b8e80941Smrg const unsigned dest_size = nir_tex_instr_dest_size(instr); 5471b8e80941Smrg if (devinfo->gen >= 9 && 5472b8e80941Smrg instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 5473b8e80941Smrg unsigned write_mask = instr->dest.is_ssa ? 5474b8e80941Smrg nir_ssa_def_components_read(&instr->dest.ssa): 5475b8e80941Smrg (1 << dest_size) - 1; 5476b8e80941Smrg assert(write_mask != 0); /* dead code should have been eliminated */ 5477b8e80941Smrg inst->size_written = util_last_bit(write_mask) * 5478b8e80941Smrg inst->dst.component_size(inst->exec_size); 5479b8e80941Smrg } else { 5480b8e80941Smrg inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 5481b8e80941Smrg } 5482b8e80941Smrg 5483b8e80941Smrg if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) 5484b8e80941Smrg inst->shadow_compare = true; 5485b8e80941Smrg 5486b8e80941Smrg if (instr->op == nir_texop_tg4 && devinfo->gen == 6) 5487b8e80941Smrg emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst); 5488b8e80941Smrg 5489b8e80941Smrg fs_reg nir_dest[4]; 5490b8e80941Smrg for (unsigned i = 0; i < dest_size; i++) 5491b8e80941Smrg nir_dest[i] = offset(dst, bld, i); 5492b8e80941Smrg 5493b8e80941Smrg if (instr->op == nir_texop_query_levels) { 5494b8e80941Smrg /* # levels is in .w */ 5495b8e80941Smrg nir_dest[0] = offset(dst, bld, 3); 5496b8e80941Smrg } else if (instr->op == nir_texop_txs && 5497b8e80941Smrg dest_size >= 3 && devinfo->gen < 7) { 5498b8e80941Smrg /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ 5499b8e80941Smrg fs_reg depth = offset(dst, bld, 2); 5500b8e80941Smrg nir_dest[2] = vgrf(glsl_type::int_type); 5501b8e80941Smrg bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); 5502b8e80941Smrg } 5503b8e80941Smrg 5504b8e80941Smrg bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); 5505b8e80941Smrg} 5506b8e80941Smrg 5507b8e80941Smrgvoid 5508b8e80941Smrgfs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) 5509b8e80941Smrg{ 5510b8e80941Smrg switch (instr->type) { 5511b8e80941Smrg case nir_jump_break: 5512b8e80941Smrg bld.emit(BRW_OPCODE_BREAK); 5513b8e80941Smrg break; 5514b8e80941Smrg case nir_jump_continue: 5515b8e80941Smrg bld.emit(BRW_OPCODE_CONTINUE); 5516b8e80941Smrg break; 5517b8e80941Smrg case nir_jump_return: 5518b8e80941Smrg default: 5519b8e80941Smrg unreachable("unknown jump"); 5520b8e80941Smrg } 5521b8e80941Smrg} 5522b8e80941Smrg 5523b8e80941Smrg/* 5524b8e80941Smrg * This helper takes a source register and un/shuffles it into the destination 5525b8e80941Smrg * register. 5526b8e80941Smrg * 5527b8e80941Smrg * If source type size is smaller than destination type size the operation 5528b8e80941Smrg * needed is a component shuffle. The opposite case would be an unshuffle. If 5529b8e80941Smrg * source/destination type size is equal a shuffle is done that would be 5530b8e80941Smrg * equivalent to a simple MOV. 5531b8e80941Smrg * 5532b8e80941Smrg * For example, if source is a 16-bit type and destination is 32-bit. A 3 5533b8e80941Smrg * components .xyz 16-bit vector on SIMD8 would be. 5534b8e80941Smrg * 5535b8e80941Smrg * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| 5536b8e80941Smrg * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | 5537b8e80941Smrg * 5538b8e80941Smrg * This helper will return the following 2 32-bit components with the 16-bit 5539b8e80941Smrg * values shuffled: 5540b8e80941Smrg * 5541b8e80941Smrg * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| 5542b8e80941Smrg * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | 5543b8e80941Smrg * 5544b8e80941Smrg * For unshuffle, the example would be the opposite, a 64-bit type source 5545b8e80941Smrg * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8 5546b8e80941Smrg * would be: 5547b8e80941Smrg * 5548b8e80941Smrg * | x1l x1h | x2l x2h | x3l x3h | x4l x4h | 5549b8e80941Smrg * | x5l x5h | x6l x6h | x7l x7h | x8l x8h | 5550b8e80941Smrg * | y1l y1h | y2l y2h | y3l y3h | y4l y4h | 5551b8e80941Smrg * | y5l y5h | y6l y6h | y7l y7h | y8l y8h | 5552b8e80941Smrg * 5553b8e80941Smrg * The returned result would be the following 4 32-bit components unshuffled: 5554b8e80941Smrg * 5555b8e80941Smrg * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | 5556b8e80941Smrg * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | 5557b8e80941Smrg * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | 5558b8e80941Smrg * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | 5559b8e80941Smrg * 5560b8e80941Smrg * - Source and destination register must not be overlapped. 5561b8e80941Smrg * - components units are measured in terms of the smaller type between 5562b8e80941Smrg * source and destination because we are un/shuffling the smaller 5563b8e80941Smrg * components from/into the bigger ones. 5564b8e80941Smrg * - first_component parameter allows skipping source components. 5565b8e80941Smrg */ 5566b8e80941Smrgvoid 5567b8e80941Smrgshuffle_src_to_dst(const fs_builder &bld, 5568b8e80941Smrg const fs_reg &dst, 5569b8e80941Smrg const fs_reg &src, 5570b8e80941Smrg uint32_t first_component, 5571b8e80941Smrg uint32_t components) 5572b8e80941Smrg{ 5573b8e80941Smrg if (type_sz(src.type) == type_sz(dst.type)) { 5574b8e80941Smrg assert(!regions_overlap(dst, 5575b8e80941Smrg type_sz(dst.type) * bld.dispatch_width() * components, 5576b8e80941Smrg offset(src, bld, first_component), 5577b8e80941Smrg type_sz(src.type) * bld.dispatch_width() * components)); 5578b8e80941Smrg for (unsigned i = 0; i < components; i++) { 5579b8e80941Smrg bld.MOV(retype(offset(dst, bld, i), src.type), 5580b8e80941Smrg offset(src, bld, i + first_component)); 5581b8e80941Smrg } 5582b8e80941Smrg } else if (type_sz(src.type) < type_sz(dst.type)) { 5583b8e80941Smrg /* Source is shuffled into destination */ 5584b8e80941Smrg unsigned size_ratio = type_sz(dst.type) / type_sz(src.type); 5585b8e80941Smrg assert(!regions_overlap(dst, 5586b8e80941Smrg type_sz(dst.type) * bld.dispatch_width() * 5587b8e80941Smrg DIV_ROUND_UP(components, size_ratio), 5588b8e80941Smrg offset(src, bld, first_component), 5589b8e80941Smrg type_sz(src.type) * bld.dispatch_width() * components)); 5590b8e80941Smrg 5591b8e80941Smrg brw_reg_type shuffle_type = 5592b8e80941Smrg brw_reg_type_from_bit_size(8 * type_sz(src.type), 5593b8e80941Smrg BRW_REGISTER_TYPE_D); 5594b8e80941Smrg for (unsigned i = 0; i < components; i++) { 5595b8e80941Smrg fs_reg shuffle_component_i = 5596b8e80941Smrg subscript(offset(dst, bld, i / size_ratio), 5597b8e80941Smrg shuffle_type, i % size_ratio); 5598b8e80941Smrg bld.MOV(shuffle_component_i, 5599b8e80941Smrg retype(offset(src, bld, i + first_component), shuffle_type)); 5600b8e80941Smrg } 5601b8e80941Smrg } else { 5602b8e80941Smrg /* Source is unshuffled into destination */ 5603b8e80941Smrg unsigned size_ratio = type_sz(src.type) / type_sz(dst.type); 5604b8e80941Smrg assert(!regions_overlap(dst, 5605b8e80941Smrg type_sz(dst.type) * bld.dispatch_width() * components, 5606b8e80941Smrg offset(src, bld, first_component / size_ratio), 5607b8e80941Smrg type_sz(src.type) * bld.dispatch_width() * 5608b8e80941Smrg DIV_ROUND_UP(components + (first_component % size_ratio), 5609b8e80941Smrg size_ratio))); 5610b8e80941Smrg 5611b8e80941Smrg brw_reg_type shuffle_type = 5612b8e80941Smrg brw_reg_type_from_bit_size(8 * type_sz(dst.type), 5613b8e80941Smrg BRW_REGISTER_TYPE_D); 5614b8e80941Smrg for (unsigned i = 0; i < components; i++) { 5615b8e80941Smrg fs_reg shuffle_component_i = 5616b8e80941Smrg subscript(offset(src, bld, (first_component + i) / size_ratio), 5617b8e80941Smrg shuffle_type, (first_component + i) % size_ratio); 5618b8e80941Smrg bld.MOV(retype(offset(dst, bld, i), shuffle_type), 5619b8e80941Smrg shuffle_component_i); 5620b8e80941Smrg } 5621b8e80941Smrg } 5622b8e80941Smrg} 5623b8e80941Smrg 5624b8e80941Smrgvoid 5625b8e80941Smrgshuffle_from_32bit_read(const fs_builder &bld, 5626b8e80941Smrg const fs_reg &dst, 5627b8e80941Smrg const fs_reg &src, 5628b8e80941Smrg uint32_t first_component, 5629b8e80941Smrg uint32_t components) 5630b8e80941Smrg{ 5631b8e80941Smrg assert(type_sz(src.type) == 4); 5632b8e80941Smrg 5633b8e80941Smrg /* This function takes components in units of the destination type while 5634b8e80941Smrg * shuffle_src_to_dst takes components in units of the smallest type 5635b8e80941Smrg */ 5636b8e80941Smrg if (type_sz(dst.type) > 4) { 5637b8e80941Smrg assert(type_sz(dst.type) == 8); 5638b8e80941Smrg first_component *= 2; 5639b8e80941Smrg components *= 2; 5640b8e80941Smrg } 5641b8e80941Smrg 5642b8e80941Smrg shuffle_src_to_dst(bld, dst, src, first_component, components); 5643b8e80941Smrg} 5644b8e80941Smrg 5645b8e80941Smrgfs_reg 5646b8e80941Smrgshuffle_for_32bit_write(const fs_builder &bld, 5647b8e80941Smrg const fs_reg &src, 5648b8e80941Smrg uint32_t first_component, 5649b8e80941Smrg uint32_t components) 5650b8e80941Smrg{ 5651b8e80941Smrg fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 5652b8e80941Smrg DIV_ROUND_UP (components * type_sz(src.type), 4)); 5653b8e80941Smrg /* This function takes components in units of the source type while 5654b8e80941Smrg * shuffle_src_to_dst takes components in units of the smallest type 5655b8e80941Smrg */ 5656b8e80941Smrg if (type_sz(src.type) > 4) { 5657b8e80941Smrg assert(type_sz(src.type) == 8); 5658b8e80941Smrg first_component *= 2; 5659b8e80941Smrg components *= 2; 5660b8e80941Smrg } 5661b8e80941Smrg 5662b8e80941Smrg shuffle_src_to_dst(bld, dst, src, first_component, components); 5663b8e80941Smrg 5664b8e80941Smrg return dst; 5665b8e80941Smrg} 5666b8e80941Smrg 5667b8e80941Smrgfs_reg 5668b8e80941Smrgsetup_imm_df(const fs_builder &bld, double v) 5669b8e80941Smrg{ 5670b8e80941Smrg const struct gen_device_info *devinfo = bld.shader->devinfo; 5671b8e80941Smrg assert(devinfo->gen >= 7); 5672b8e80941Smrg 5673b8e80941Smrg if (devinfo->gen >= 8) 5674b8e80941Smrg return brw_imm_df(v); 5675b8e80941Smrg 5676b8e80941Smrg /* gen7.5 does not support DF immediates straighforward but the DIM 5677b8e80941Smrg * instruction allows to set the 64-bit immediate value. 5678b8e80941Smrg */ 5679b8e80941Smrg if (devinfo->is_haswell) { 5680b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 5681b8e80941Smrg fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1); 5682b8e80941Smrg ubld.DIM(dst, brw_imm_df(v)); 5683b8e80941Smrg return component(dst, 0); 5684b8e80941Smrg } 5685b8e80941Smrg 5686b8e80941Smrg /* gen7 does not support DF immediates, so we generate a 64-bit constant by 5687b8e80941Smrg * writing the low 32-bit of the constant to suboffset 0 of a VGRF and 5688b8e80941Smrg * the high 32-bit to suboffset 4 and then applying a stride of 0. 5689b8e80941Smrg * 5690b8e80941Smrg * Alternatively, we could also produce a normal VGRF (without stride 0) 5691b8e80941Smrg * by writing to all the channels in the VGRF, however, that would hit the 5692b8e80941Smrg * gen7 bug where we have to split writes that span more than 1 register 5693b8e80941Smrg * into instructions with a width of 4 (otherwise the write to the second 5694b8e80941Smrg * register written runs into an execmask hardware bug) which isn't very 5695b8e80941Smrg * nice. 5696b8e80941Smrg */ 5697b8e80941Smrg union { 5698b8e80941Smrg double d; 5699b8e80941Smrg struct { 5700b8e80941Smrg uint32_t i1; 5701b8e80941Smrg uint32_t i2; 5702b8e80941Smrg }; 5703b8e80941Smrg } di; 5704b8e80941Smrg 5705b8e80941Smrg di.d = v; 5706b8e80941Smrg 5707b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 5708b8e80941Smrg const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 5709b8e80941Smrg ubld.MOV(tmp, brw_imm_ud(di.i1)); 5710b8e80941Smrg ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2)); 5711b8e80941Smrg 5712b8e80941Smrg return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0); 5713b8e80941Smrg} 5714b8e80941Smrg 5715b8e80941Smrgfs_reg 5716b8e80941Smrgsetup_imm_b(const fs_builder &bld, int8_t v) 5717b8e80941Smrg{ 5718b8e80941Smrg const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B); 5719b8e80941Smrg bld.MOV(tmp, brw_imm_w(v)); 5720b8e80941Smrg return tmp; 5721b8e80941Smrg} 5722b8e80941Smrg 5723b8e80941Smrgfs_reg 5724b8e80941Smrgsetup_imm_ub(const fs_builder &bld, uint8_t v) 5725b8e80941Smrg{ 5726b8e80941Smrg const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB); 5727b8e80941Smrg bld.MOV(tmp, brw_imm_uw(v)); 5728b8e80941Smrg return tmp; 5729b8e80941Smrg} 5730