1/* 2 Copyright (C) Intel Corp. 2006. All Rights Reserved. 3 Intel funded Tungsten Graphics to 4 develop this 3D driver. 5 6 Permission is hereby granted, free of charge, to any person obtaining 7 a copy of this software and associated documentation files (the 8 "Software"), to deal in the Software without restriction, including 9 without limitation the rights to use, copy, modify, merge, publish, 10 distribute, sublicense, and/or sell copies of the Software, and to 11 permit persons to whom the Software is furnished to do so, subject to 12 the following conditions: 13 14 The above copyright notice and this permission notice (including the 15 next paragraph) shall be included in all copies or substantial 16 portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE 22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 26 **********************************************************************/ 27 /* 28 * Authors: 29 * Keith Whitwell <keithw@vmware.com> 30 */ 31 32 33#include "brw_eu_defines.h" 34#include "brw_eu.h" 35 36#include "util/ralloc.h" 37 38/** 39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source 40 * registers, implicitly moving the operand to a message register. 41 * 42 * On Sandybridge, this is no longer the case. This function performs the 43 * explicit move; it should be called before emitting a SEND instruction. 
44 */ 45void 46gfx6_resolve_implied_move(struct brw_codegen *p, 47 struct brw_reg *src, 48 unsigned msg_reg_nr) 49{ 50 const struct intel_device_info *devinfo = p->devinfo; 51 if (devinfo->ver < 6) 52 return; 53 54 if (src->file == BRW_MESSAGE_REGISTER_FILE) 55 return; 56 57 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { 58 assert(devinfo->ver < 12); 59 brw_push_insn_state(p); 60 brw_set_default_exec_size(p, BRW_EXECUTE_8); 61 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), 64 retype(*src, BRW_REGISTER_TYPE_UD)); 65 brw_pop_insn_state(p); 66 } 67 *src = brw_message_reg(msg_reg_nr); 68} 69 70static void 71gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) 72{ 73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): 74 * "The send with EOT should use register space R112-R127 for <src>. This is 75 * to enable loading of a new thread into the same slot while the message 76 * with EOT for current thread is pending dispatch." 77 * 78 * Since we're pretending to have 16 MRFs anyway, we may as well use the 79 * registers required for messages with EOT. 80 */ 81 const struct intel_device_info *devinfo = p->devinfo; 82 if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { 83 reg->file = BRW_GENERAL_REGISTER_FILE; 84 reg->nr += GFX7_MRF_HACK_START; 85 } 86} 87 88void 89brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) 90{ 91 const struct intel_device_info *devinfo = p->devinfo; 92 93 if (dest.file == BRW_MESSAGE_REGISTER_FILE) 94 assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); 95 else if (dest.file == BRW_GENERAL_REGISTER_FILE) 96 assert(dest.nr < 128); 97 98 /* The hardware has a restriction where a destination of size Byte with 99 * a stride of 1 is only allowed for a packed byte MOV. 
For any other 100 * instruction, the stride must be at least 2, even when the destination 101 * is the NULL register. 102 */ 103 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && 104 dest.nr == BRW_ARF_NULL && 105 type_sz(dest.type) == 1 && 106 dest.hstride == BRW_HORIZONTAL_STRIDE_1) { 107 dest.hstride = BRW_HORIZONTAL_STRIDE_2; 108 } 109 110 gfx7_convert_mrf_to_grf(p, &dest); 111 112 if (devinfo->ver >= 12 && 113 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 114 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { 115 assert(dest.file == BRW_GENERAL_REGISTER_FILE || 116 dest.file == BRW_ARCHITECTURE_REGISTER_FILE); 117 assert(dest.address_mode == BRW_ADDRESS_DIRECT); 118 assert(dest.subnr == 0); 119 assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 || 120 (dest.hstride == BRW_HORIZONTAL_STRIDE_1 && 121 dest.vstride == dest.width + 1)); 122 assert(!dest.negate && !dest.abs); 123 brw_inst_set_dst_reg_file(devinfo, inst, dest.file); 124 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); 125 126 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || 127 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { 128 assert(devinfo->ver < 12); 129 assert(dest.file == BRW_GENERAL_REGISTER_FILE || 130 dest.file == BRW_ARCHITECTURE_REGISTER_FILE); 131 assert(dest.address_mode == BRW_ADDRESS_DIRECT); 132 assert(dest.subnr % 16 == 0); 133 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 && 134 dest.vstride == dest.width + 1); 135 assert(!dest.negate && !dest.abs); 136 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); 137 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); 138 brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file); 139 } else { 140 brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type); 141 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode); 142 143 if (dest.address_mode == BRW_ADDRESS_DIRECT) { 144 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); 145 146 if (brw_inst_access_mode(devinfo, 
inst) == BRW_ALIGN_1) { 147 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr); 148 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 149 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 150 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); 151 } else { 152 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); 153 brw_inst_set_da16_writemask(devinfo, inst, dest.writemask); 154 if (dest.file == BRW_GENERAL_REGISTER_FILE || 155 dest.file == BRW_MESSAGE_REGISTER_FILE) { 156 assert(dest.writemask != 0); 157 } 158 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: 159 * Although Dst.HorzStride is a don't care for Align16, HW needs 160 * this to be programmed as "01". 161 */ 162 brw_inst_set_dst_hstride(devinfo, inst, 1); 163 } 164 } else { 165 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr); 166 167 /* These are different sizes in align1 vs align16: 168 */ 169 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 170 brw_inst_set_dst_ia1_addr_imm(devinfo, inst, 171 dest.indirect_offset); 172 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 173 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 174 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); 175 } else { 176 brw_inst_set_dst_ia16_addr_imm(devinfo, inst, 177 dest.indirect_offset); 178 /* even ignored in da16, still need to set as '01' */ 179 brw_inst_set_dst_hstride(devinfo, inst, 1); 180 } 181 } 182 } 183 184 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8) 185 * or 16 (SIMD16), as that's normally correct. However, when dealing with 186 * small registers, it can be useful for us to automatically reduce it to 187 * match the register size. 188 */ 189 if (p->automatic_exec_sizes) { 190 /* 191 * In platforms that support fp64 we can emit instructions with a width 192 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. 
In 193 * these cases we need to make sure that these instructions have their 194 * exec sizes set properly when they are emitted and we can't rely on 195 * this code to fix it. 196 */ 197 bool fix_exec_size; 198 if (devinfo->ver >= 6) 199 fix_exec_size = dest.width < BRW_EXECUTE_4; 200 else 201 fix_exec_size = dest.width < BRW_EXECUTE_8; 202 203 if (fix_exec_size) 204 brw_inst_set_exec_size(devinfo, inst, dest.width); 205 } 206} 207 208void 209brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) 210{ 211 const struct intel_device_info *devinfo = p->devinfo; 212 213 if (reg.file == BRW_MESSAGE_REGISTER_FILE) 214 assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); 215 else if (reg.file == BRW_GENERAL_REGISTER_FILE) 216 assert(reg.nr < 128); 217 218 gfx7_convert_mrf_to_grf(p, ®); 219 220 if (devinfo->ver >= 6 && 221 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 222 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC || 223 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || 224 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) { 225 /* Any source modifiers or regions will be ignored, since this just 226 * identifies the MRF/GRF to start reading the message contents from. 227 * Check for some likely failures. 
228 */ 229 assert(!reg.negate); 230 assert(!reg.abs); 231 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 232 } 233 234 if (devinfo->ver >= 12 && 235 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 236 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { 237 assert(reg.file != BRW_IMMEDIATE_VALUE); 238 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 239 assert(reg.subnr == 0); 240 assert(has_scalar_region(reg) || 241 (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && 242 reg.vstride == reg.width + 1)); 243 assert(!reg.negate && !reg.abs); 244 brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file); 245 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); 246 247 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || 248 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) { 249 assert(reg.file == BRW_GENERAL_REGISTER_FILE); 250 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 251 assert(reg.subnr % 16 == 0); 252 assert(has_scalar_region(reg) || 253 (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && 254 reg.vstride == reg.width + 1)); 255 assert(!reg.negate && !reg.abs); 256 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); 257 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); 258 } else { 259 brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type); 260 brw_inst_set_src0_abs(devinfo, inst, reg.abs); 261 brw_inst_set_src0_negate(devinfo, inst, reg.negate); 262 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); 263 264 if (reg.file == BRW_IMMEDIATE_VALUE) { 265 if (reg.type == BRW_REGISTER_TYPE_DF || 266 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM) 267 brw_inst_set_imm_df(devinfo, inst, reg.df); 268 else if (reg.type == BRW_REGISTER_TYPE_UQ || 269 reg.type == BRW_REGISTER_TYPE_Q) 270 brw_inst_set_imm_uq(devinfo, inst, reg.u64); 271 else 272 brw_inst_set_imm_ud(devinfo, inst, reg.ud); 273 274 if (devinfo->ver < 12 && type_sz(reg.type) < 8) { 275 brw_inst_set_src1_reg_file(devinfo, inst, 276 BRW_ARCHITECTURE_REGISTER_FILE); 
277 brw_inst_set_src1_reg_hw_type(devinfo, inst, 278 brw_inst_src0_reg_hw_type(devinfo, inst)); 279 } 280 } else { 281 if (reg.address_mode == BRW_ADDRESS_DIRECT) { 282 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); 283 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 284 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr); 285 } else { 286 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); 287 } 288 } else { 289 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr); 290 291 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 292 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset); 293 } else { 294 brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset); 295 } 296 } 297 298 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 299 if (reg.width == BRW_WIDTH_1 && 300 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { 301 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); 302 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1); 303 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); 304 } else { 305 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride); 306 brw_inst_set_src0_width(devinfo, inst, reg.width); 307 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); 308 } 309 } else { 310 brw_inst_set_src0_da16_swiz_x(devinfo, inst, 311 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); 312 brw_inst_set_src0_da16_swiz_y(devinfo, inst, 313 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); 314 brw_inst_set_src0_da16_swiz_z(devinfo, inst, 315 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); 316 brw_inst_set_src0_da16_swiz_w(devinfo, inst, 317 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); 318 319 if (reg.vstride == BRW_VERTICAL_STRIDE_8) { 320 /* This is an oddity of the fact we're using the same 321 * descriptions for registers in align_16 as align_1: 322 */ 323 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 324 } else if (devinfo->verx10 == 70 && 325 reg.type == 
BRW_REGISTER_TYPE_DF && 326 reg.vstride == BRW_VERTICAL_STRIDE_2) { 327 /* From SNB PRM: 328 * 329 * "For Align16 access mode, only encodings of 0000 and 0011 330 * are allowed. Other codes are reserved." 331 * 332 * Presumably the DevSNB behavior applies to IVB as well. 333 */ 334 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 335 } else { 336 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); 337 } 338 } 339 } 340 } 341} 342 343 344void 345brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) 346{ 347 const struct intel_device_info *devinfo = p->devinfo; 348 349 if (reg.file == BRW_GENERAL_REGISTER_FILE) 350 assert(reg.nr < 128); 351 352 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS || 353 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC || 354 (devinfo->ver >= 12 && 355 (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 356 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) { 357 assert(reg.file == BRW_GENERAL_REGISTER_FILE || 358 reg.file == BRW_ARCHITECTURE_REGISTER_FILE); 359 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 360 assert(reg.subnr == 0); 361 assert(has_scalar_region(reg) || 362 (reg.hstride == BRW_HORIZONTAL_STRIDE_1 && 363 reg.vstride == reg.width + 1)); 364 assert(!reg.negate && !reg.abs); 365 brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr); 366 brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file); 367 } else { 368 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: 369 * 370 * "Accumulator registers may be accessed explicitly as src0 371 * operands only." 
372 */ 373 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE || 374 reg.nr != BRW_ARF_ACCUMULATOR); 375 376 gfx7_convert_mrf_to_grf(p, ®); 377 assert(reg.file != BRW_MESSAGE_REGISTER_FILE); 378 379 brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type); 380 brw_inst_set_src1_abs(devinfo, inst, reg.abs); 381 brw_inst_set_src1_negate(devinfo, inst, reg.negate); 382 383 /* Only src1 can be immediate in two-argument instructions. 384 */ 385 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); 386 387 if (reg.file == BRW_IMMEDIATE_VALUE) { 388 /* two-argument instructions can only use 32-bit immediates */ 389 assert(type_sz(reg.type) < 8); 390 brw_inst_set_imm_ud(devinfo, inst, reg.ud); 391 } else { 392 /* This is a hardware restriction, which may or may not be lifted 393 * in the future: 394 */ 395 assert (reg.address_mode == BRW_ADDRESS_DIRECT); 396 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ 397 398 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr); 399 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 400 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr); 401 } else { 402 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16); 403 } 404 405 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 406 if (reg.width == BRW_WIDTH_1 && 407 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { 408 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); 409 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1); 410 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); 411 } else { 412 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride); 413 brw_inst_set_src1_width(devinfo, inst, reg.width); 414 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); 415 } 416 } else { 417 brw_inst_set_src1_da16_swiz_x(devinfo, inst, 418 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); 419 brw_inst_set_src1_da16_swiz_y(devinfo, inst, 420 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); 421 
brw_inst_set_src1_da16_swiz_z(devinfo, inst, 422 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); 423 brw_inst_set_src1_da16_swiz_w(devinfo, inst, 424 BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); 425 426 if (reg.vstride == BRW_VERTICAL_STRIDE_8) { 427 /* This is an oddity of the fact we're using the same 428 * descriptions for registers in align_16 as align_1: 429 */ 430 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 431 } else if (devinfo->verx10 == 70 && 432 reg.type == BRW_REGISTER_TYPE_DF && 433 reg.vstride == BRW_VERTICAL_STRIDE_2) { 434 /* From SNB PRM: 435 * 436 * "For Align16 access mode, only encodings of 0000 and 0011 437 * are allowed. Other codes are reserved." 438 * 439 * Presumably the DevSNB behavior applies to IVB as well. 440 */ 441 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 442 } else { 443 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); 444 } 445 } 446 } 447 } 448} 449 450/** 451 * Specify the descriptor and extended descriptor immediate for a SEND(C) 452 * message instruction. 
453 */ 454void 455brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst, 456 unsigned desc, unsigned ex_desc) 457{ 458 const struct intel_device_info *devinfo = p->devinfo; 459 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 460 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC); 461 if (devinfo->ver < 12) 462 brw_inst_set_src1_file_type(devinfo, inst, 463 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD); 464 brw_inst_set_send_desc(devinfo, inst, desc); 465 if (devinfo->ver >= 9) 466 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc); 467} 468 469static void brw_set_math_message( struct brw_codegen *p, 470 brw_inst *inst, 471 unsigned function, 472 unsigned integer_type, 473 bool low_precision, 474 unsigned dataType ) 475{ 476 const struct intel_device_info *devinfo = p->devinfo; 477 unsigned msg_length; 478 unsigned response_length; 479 480 /* Infer message length from the function */ 481 switch (function) { 482 case BRW_MATH_FUNCTION_POW: 483 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: 484 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: 485 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 486 msg_length = 2; 487 break; 488 default: 489 msg_length = 1; 490 break; 491 } 492 493 /* Infer response length from the function */ 494 switch (function) { 495 case BRW_MATH_FUNCTION_SINCOS: 496 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 497 response_length = 2; 498 break; 499 default: 500 response_length = 1; 501 break; 502 } 503 504 brw_set_desc(p, inst, brw_message_desc( 505 devinfo, msg_length, response_length, false)); 506 507 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH); 508 brw_inst_set_math_msg_function(devinfo, inst, function); 509 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type); 510 brw_inst_set_math_msg_precision(devinfo, inst, low_precision); 511 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst)); 512 brw_inst_set_math_msg_data_type(devinfo, inst, dataType); 513 brw_inst_set_saturate(devinfo, inst, 0); 
514} 515 516 517static void brw_set_ff_sync_message(struct brw_codegen *p, 518 brw_inst *insn, 519 bool allocate, 520 unsigned response_length, 521 bool end_of_thread) 522{ 523 const struct intel_device_info *devinfo = p->devinfo; 524 525 brw_set_desc(p, insn, brw_message_desc( 526 devinfo, 1, response_length, true)); 527 528 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); 529 brw_inst_set_eot(devinfo, insn, end_of_thread); 530 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */ 531 brw_inst_set_urb_allocate(devinfo, insn, allocate); 532 /* The following fields are not used by FF_SYNC: */ 533 brw_inst_set_urb_global_offset(devinfo, insn, 0); 534 brw_inst_set_urb_swizzle_control(devinfo, insn, 0); 535 brw_inst_set_urb_used(devinfo, insn, 0); 536 brw_inst_set_urb_complete(devinfo, insn, 0); 537} 538 539static void brw_set_urb_message( struct brw_codegen *p, 540 brw_inst *insn, 541 enum brw_urb_write_flags flags, 542 unsigned msg_length, 543 unsigned response_length, 544 unsigned offset, 545 unsigned swizzle_control ) 546{ 547 const struct intel_device_info *devinfo = p->devinfo; 548 549 assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); 550 assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE)); 551 assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); 552 553 brw_set_desc(p, insn, brw_message_desc( 554 devinfo, msg_length, response_length, true)); 555 556 brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB); 557 brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT)); 558 559 if (flags & BRW_URB_WRITE_OWORD) { 560 assert(msg_length == 2); /* header + one OWORD of data */ 561 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD); 562 } else { 563 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD); 564 } 565 566 brw_inst_set_urb_global_offset(devinfo, insn, offset); 567 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control); 568 569 if (devinfo->ver < 8) { 570 
brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE)); 571 } 572 573 if (devinfo->ver < 7) { 574 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE)); 575 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED)); 576 } else { 577 brw_inst_set_urb_per_slot_offset(devinfo, insn, 578 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); 579 } 580} 581 582static void 583gfx7_set_dp_scratch_message(struct brw_codegen *p, 584 brw_inst *inst, 585 bool write, 586 bool dword, 587 bool invalidate_after_read, 588 unsigned num_regs, 589 unsigned addr_offset, 590 unsigned mlen, 591 unsigned rlen, 592 bool header_present) 593{ 594 const struct intel_device_info *devinfo = p->devinfo; 595 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 || 596 (devinfo->ver >= 8 && num_regs == 8)); 597 const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) : 598 num_regs - 1); 599 600 brw_set_desc(p, inst, brw_message_desc( 601 devinfo, mlen, rlen, header_present)); 602 603 brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE); 604 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */ 605 brw_inst_set_scratch_read_write(devinfo, inst, write); 606 brw_inst_set_scratch_type(devinfo, inst, dword); 607 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read); 608 brw_inst_set_scratch_block_size(devinfo, inst, block_size); 609 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset); 610} 611 612static void 613brw_inst_set_state(const struct intel_device_info *devinfo, 614 brw_inst *insn, 615 const struct brw_insn_state *state) 616{ 617 brw_inst_set_exec_size(devinfo, insn, state->exec_size); 618 brw_inst_set_group(devinfo, insn, state->group); 619 brw_inst_set_compression(devinfo, insn, state->compressed); 620 brw_inst_set_access_mode(devinfo, insn, state->access_mode); 621 brw_inst_set_mask_control(devinfo, insn, state->mask_control); 622 if (devinfo->ver >= 12) 
623 brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb)); 624 brw_inst_set_saturate(devinfo, insn, state->saturate); 625 brw_inst_set_pred_control(devinfo, insn, state->predicate); 626 brw_inst_set_pred_inv(devinfo, insn, state->pred_inv); 627 628 if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) && 629 state->access_mode == BRW_ALIGN_16) { 630 brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); 631 if (devinfo->ver >= 7) 632 brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); 633 } else { 634 brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2); 635 if (devinfo->ver >= 7) 636 brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2); 637 } 638 639 if (devinfo->ver >= 6) 640 brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control); 641} 642 643static brw_inst * 644brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align) 645{ 646 assert(util_is_power_of_two_or_zero(sizeof(brw_inst))); 647 assert(util_is_power_of_two_or_zero(align)); 648 const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1); 649 const unsigned start_insn = ALIGN(p->nr_insn, align_insn); 650 const unsigned new_nr_insn = start_insn + nr_insn; 651 652 if (p->store_size < new_nr_insn) { 653 p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst)); 654 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size); 655 } 656 657 /* Memset any padding due to alignment to 0. We don't want to be hashing 658 * or caching a bunch of random bits we got from a memory allocation. 
659 */ 660 if (p->nr_insn < start_insn) { 661 memset(&p->store[p->nr_insn], 0, 662 (start_insn - p->nr_insn) * sizeof(brw_inst)); 663 } 664 665 assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst)); 666 p->nr_insn = new_nr_insn; 667 p->next_insn_offset = new_nr_insn * sizeof(brw_inst); 668 669 return &p->store[start_insn]; 670} 671 672void 673brw_realign(struct brw_codegen *p, unsigned align) 674{ 675 brw_append_insns(p, 0, align); 676} 677 678int 679brw_append_data(struct brw_codegen *p, void *data, 680 unsigned size, unsigned align) 681{ 682 unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst)); 683 void *dst = brw_append_insns(p, nr_insn, align); 684 memcpy(dst, data, size); 685 686 /* If it's not a whole number of instructions, memset the end */ 687 if (size < nr_insn * sizeof(brw_inst)) 688 memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size); 689 690 return dst - (void *)p->store; 691} 692 693#define next_insn brw_next_insn 694brw_inst * 695brw_next_insn(struct brw_codegen *p, unsigned opcode) 696{ 697 const struct intel_device_info *devinfo = p->devinfo; 698 brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst)); 699 700 memset(insn, 0, sizeof(*insn)); 701 brw_inst_set_opcode(devinfo, insn, opcode); 702 703 /* Apply the default instruction state */ 704 brw_inst_set_state(devinfo, insn, p->current); 705 706 return insn; 707} 708 709void 710brw_add_reloc(struct brw_codegen *p, uint32_t id, 711 enum brw_shader_reloc_type type, 712 uint32_t offset, uint32_t delta) 713{ 714 if (p->num_relocs + 1 > p->reloc_array_size) { 715 p->reloc_array_size = MAX2(16, p->reloc_array_size * 2); 716 p->relocs = reralloc(p->mem_ctx, p->relocs, 717 struct brw_shader_reloc, p->reloc_array_size); 718 } 719 720 p->relocs[p->num_relocs++] = (struct brw_shader_reloc) { 721 .id = id, 722 .type = type, 723 .offset = offset, 724 .delta = delta, 725 }; 726} 727 728static brw_inst * 729brw_alu1(struct brw_codegen *p, unsigned opcode, 730 struct brw_reg dest, struct brw_reg 
src) 731{ 732 brw_inst *insn = next_insn(p, opcode); 733 brw_set_dest(p, insn, dest); 734 brw_set_src0(p, insn, src); 735 return insn; 736} 737 738static brw_inst * 739brw_alu2(struct brw_codegen *p, unsigned opcode, 740 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1) 741{ 742 /* 64-bit immediates are only supported on 1-src instructions */ 743 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4); 744 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4); 745 746 brw_inst *insn = next_insn(p, opcode); 747 brw_set_dest(p, insn, dest); 748 brw_set_src0(p, insn, src0); 749 brw_set_src1(p, insn, src1); 750 return insn; 751} 752 753static int 754get_3src_subreg_nr(struct brw_reg reg) 755{ 756 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions 757 * use 32-bit units (components 0..7). Since they only support F/D/UD 758 * types, this doesn't lose any flexibility, but uses fewer bits. 759 */ 760 return reg.subnr / 4; 761} 762 763static enum gfx10_align1_3src_vertical_stride 764to_3src_align1_vstride(const struct intel_device_info *devinfo, 765 enum brw_vertical_stride vstride) 766{ 767 switch (vstride) { 768 case BRW_VERTICAL_STRIDE_0: 769 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0; 770 case BRW_VERTICAL_STRIDE_1: 771 assert(devinfo->ver >= 12); 772 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1; 773 case BRW_VERTICAL_STRIDE_2: 774 assert(devinfo->ver < 12); 775 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2; 776 case BRW_VERTICAL_STRIDE_4: 777 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4; 778 case BRW_VERTICAL_STRIDE_8: 779 case BRW_VERTICAL_STRIDE_16: 780 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8; 781 default: 782 unreachable("invalid vstride"); 783 } 784} 785 786 787static enum gfx10_align1_3src_src_horizontal_stride 788to_3src_align1_hstride(enum brw_horizontal_stride hstride) 789{ 790 switch (hstride) { 791 case BRW_HORIZONTAL_STRIDE_0: 792 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0; 793 case BRW_HORIZONTAL_STRIDE_1: 
/**
 * Emit a three-source ALU instruction and encode all of its operands.
 *
 * A new instruction with the given opcode is allocated with next_insn().  The
 * destination and the three sources are then written with either the Align1
 * or the Align16 three-source encoding, selected by the access mode of the
 * freshly allocated instruction.
 *
 * \param p       code generator state
 * \param opcode  opcode of the instruction to emit
 * \param dest    destination register (direct addressing only)
 * \param src0    first source; may be an immediate in Align1 mode
 * \param src1    second source; never an immediate
 * \param src2    third source; may be an immediate in Align1 mode
 *
 * \return the newly emitted instruction.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   /* On Gfx7+ a message-register destination is rewritten to the GRF that
    * stands in for it.
    */
   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   /* On Gfx10+ src0 and src2 must not both be immediates. */
   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   /* src1 can never be an immediate; only direct addressing is supported. */
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* ---- Align1 three-source encoding ---- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         /* Gfx12+ takes the register file value as-is. */
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         /* Earlier generations use the BRW_ALIGN1_3SRC_* file encodings; an
          * ARF destination is always encoded as the accumulator.
          */
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* The execution type is derived from the destination type only. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      /* src0: immediate value, or a register region. */
      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         /* An NF-typed src0 is encoded with the accumulator register
          * number instead of src0.nr.
          */
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }

      /* src1: always a register region (GRF or accumulator). */
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      /* src2: immediate value, or a register region without vstride. */
      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      /* Register files accepted for each source in Align1 mode. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         /* Gfx12+: immediates are flagged through dedicated is_imm fields,
          * register sources store their file directly.
          */
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         /* Pre-Gfx12: each source file field only distinguishes GRF from
          * the single alternative valid for that source.
          */
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      /* ---- Align16 three-source encoding ---- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F  ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D  ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      /* Only Gfx6 records whether the destination is a message register. */
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      /* Every Align16 source must be a GRF; rep_ctrl is set for sources
       * with a vertical stride of zero.
       */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}
1019 */ 1020#define ALU1(OP) \ 1021brw_inst *brw_##OP(struct brw_codegen *p, \ 1022 struct brw_reg dest, \ 1023 struct brw_reg src0) \ 1024{ \ 1025 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ 1026} 1027 1028#define ALU2(OP) \ 1029brw_inst *brw_##OP(struct brw_codegen *p, \ 1030 struct brw_reg dest, \ 1031 struct brw_reg src0, \ 1032 struct brw_reg src1) \ 1033{ \ 1034 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ 1035} 1036 1037#define ALU3(OP) \ 1038brw_inst *brw_##OP(struct brw_codegen *p, \ 1039 struct brw_reg dest, \ 1040 struct brw_reg src0, \ 1041 struct brw_reg src1, \ 1042 struct brw_reg src2) \ 1043{ \ 1044 if (p->current->access_mode == BRW_ALIGN_16) { \ 1045 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ 1046 src0.swizzle = BRW_SWIZZLE_XXXX; \ 1047 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ 1048 src1.swizzle = BRW_SWIZZLE_XXXX; \ 1049 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \ 1050 src2.swizzle = BRW_SWIZZLE_XXXX; \ 1051 } \ 1052 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ 1053} 1054 1055#define ALU3F(OP) \ 1056brw_inst *brw_##OP(struct brw_codegen *p, \ 1057 struct brw_reg dest, \ 1058 struct brw_reg src0, \ 1059 struct brw_reg src1, \ 1060 struct brw_reg src2) \ 1061{ \ 1062 assert(dest.type == BRW_REGISTER_TYPE_F || \ 1063 dest.type == BRW_REGISTER_TYPE_DF); \ 1064 if (dest.type == BRW_REGISTER_TYPE_F) { \ 1065 assert(src0.type == BRW_REGISTER_TYPE_F); \ 1066 assert(src1.type == BRW_REGISTER_TYPE_F); \ 1067 assert(src2.type == BRW_REGISTER_TYPE_F); \ 1068 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \ 1069 assert(src0.type == BRW_REGISTER_TYPE_DF); \ 1070 assert(src1.type == BRW_REGISTER_TYPE_DF); \ 1071 assert(src2.type == BRW_REGISTER_TYPE_DF); \ 1072 } \ 1073 \ 1074 if (p->current->access_mode == BRW_ALIGN_16) { \ 1075 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \ 1076 src0.swizzle = BRW_SWIZZLE_XXXX; \ 1077 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \ 1078 src1.swizzle = BRW_SWIZZLE_XXXX; \ 1079 if 
(src2.vstride == BRW_VERTICAL_STRIDE_0) \ 1080 src2.swizzle = BRW_SWIZZLE_XXXX; \ 1081 } \ 1082 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ 1083} 1084 1085ALU2(SEL) 1086ALU1(NOT) 1087ALU2(AND) 1088ALU2(OR) 1089ALU2(XOR) 1090ALU2(SHR) 1091ALU2(SHL) 1092ALU1(DIM) 1093ALU2(ASR) 1094ALU2(ROL) 1095ALU2(ROR) 1096ALU3(CSEL) 1097ALU1(FRC) 1098ALU1(RNDD) 1099ALU1(RNDE) 1100ALU1(RNDU) 1101ALU1(RNDZ) 1102ALU2(MAC) 1103ALU2(MACH) 1104ALU1(LZD) 1105ALU2(DP4) 1106ALU2(DPH) 1107ALU2(DP3) 1108ALU2(DP2) 1109ALU3(DP4A) 1110ALU3(MAD) 1111ALU3F(LRP) 1112ALU1(BFREV) 1113ALU3(BFE) 1114ALU2(BFI1) 1115ALU3(BFI2) 1116ALU1(FBH) 1117ALU1(FBL) 1118ALU1(CBIT) 1119ALU2(ADDC) 1120ALU2(SUBB) 1121ALU3(ADD3) 1122 1123brw_inst * 1124brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) 1125{ 1126 const struct intel_device_info *devinfo = p->devinfo; 1127 1128 /* When converting F->DF on IVB/BYT, every odd source channel is ignored. 1129 * To avoid the problems that causes, we use an <X,2,0> source region to 1130 * read each element twice. 
1131 */ 1132 if (devinfo->verx10 == 70 && 1133 brw_get_default_access_mode(p) == BRW_ALIGN_1 && 1134 dest.type == BRW_REGISTER_TYPE_DF && 1135 (src0.type == BRW_REGISTER_TYPE_F || 1136 src0.type == BRW_REGISTER_TYPE_D || 1137 src0.type == BRW_REGISTER_TYPE_UD) && 1138 !has_scalar_region(src0)) { 1139 assert(src0.vstride == src0.width + src0.hstride); 1140 src0.vstride = src0.hstride; 1141 src0.width = BRW_WIDTH_2; 1142 src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1143 } 1144 1145 return brw_alu1(p, BRW_OPCODE_MOV, dest, src0); 1146} 1147 1148brw_inst * 1149brw_ADD(struct brw_codegen *p, struct brw_reg dest, 1150 struct brw_reg src0, struct brw_reg src1) 1151{ 1152 /* 6.2.2: add */ 1153 if (src0.type == BRW_REGISTER_TYPE_F || 1154 (src0.file == BRW_IMMEDIATE_VALUE && 1155 src0.type == BRW_REGISTER_TYPE_VF)) { 1156 assert(src1.type != BRW_REGISTER_TYPE_UD); 1157 assert(src1.type != BRW_REGISTER_TYPE_D); 1158 } 1159 1160 if (src1.type == BRW_REGISTER_TYPE_F || 1161 (src1.file == BRW_IMMEDIATE_VALUE && 1162 src1.type == BRW_REGISTER_TYPE_VF)) { 1163 assert(src0.type != BRW_REGISTER_TYPE_UD); 1164 assert(src0.type != BRW_REGISTER_TYPE_D); 1165 } 1166 1167 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); 1168} 1169 1170brw_inst * 1171brw_AVG(struct brw_codegen *p, struct brw_reg dest, 1172 struct brw_reg src0, struct brw_reg src1) 1173{ 1174 assert(dest.type == src0.type); 1175 assert(src0.type == src1.type); 1176 switch (src0.type) { 1177 case BRW_REGISTER_TYPE_B: 1178 case BRW_REGISTER_TYPE_UB: 1179 case BRW_REGISTER_TYPE_W: 1180 case BRW_REGISTER_TYPE_UW: 1181 case BRW_REGISTER_TYPE_D: 1182 case BRW_REGISTER_TYPE_UD: 1183 break; 1184 default: 1185 unreachable("Bad type for brw_AVG"); 1186 } 1187 1188 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); 1189} 1190 1191brw_inst * 1192brw_MUL(struct brw_codegen *p, struct brw_reg dest, 1193 struct brw_reg src0, struct brw_reg src1) 1194{ 1195 /* 6.32.38: mul */ 1196 if (src0.type == BRW_REGISTER_TYPE_D || 1197 src0.type 
== BRW_REGISTER_TYPE_UD || 1198 src1.type == BRW_REGISTER_TYPE_D || 1199 src1.type == BRW_REGISTER_TYPE_UD) { 1200 assert(dest.type != BRW_REGISTER_TYPE_F); 1201 } 1202 1203 if (src0.type == BRW_REGISTER_TYPE_F || 1204 (src0.file == BRW_IMMEDIATE_VALUE && 1205 src0.type == BRW_REGISTER_TYPE_VF)) { 1206 assert(src1.type != BRW_REGISTER_TYPE_UD); 1207 assert(src1.type != BRW_REGISTER_TYPE_D); 1208 } 1209 1210 if (src1.type == BRW_REGISTER_TYPE_F || 1211 (src1.file == BRW_IMMEDIATE_VALUE && 1212 src1.type == BRW_REGISTER_TYPE_VF)) { 1213 assert(src0.type != BRW_REGISTER_TYPE_UD); 1214 assert(src0.type != BRW_REGISTER_TYPE_D); 1215 } 1216 1217 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || 1218 src0.nr != BRW_ARF_ACCUMULATOR); 1219 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || 1220 src1.nr != BRW_ARF_ACCUMULATOR); 1221 1222 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); 1223} 1224 1225brw_inst * 1226brw_LINE(struct brw_codegen *p, struct brw_reg dest, 1227 struct brw_reg src0, struct brw_reg src1) 1228{ 1229 src0.vstride = BRW_VERTICAL_STRIDE_0; 1230 src0.width = BRW_WIDTH_1; 1231 src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1232 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1); 1233} 1234 1235brw_inst * 1236brw_PLN(struct brw_codegen *p, struct brw_reg dest, 1237 struct brw_reg src0, struct brw_reg src1) 1238{ 1239 src0.vstride = BRW_VERTICAL_STRIDE_0; 1240 src0.width = BRW_WIDTH_1; 1241 src0.hstride = BRW_HORIZONTAL_STRIDE_0; 1242 src1.vstride = BRW_VERTICAL_STRIDE_8; 1243 src1.width = BRW_WIDTH_8; 1244 src1.hstride = BRW_HORIZONTAL_STRIDE_1; 1245 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1); 1246} 1247 1248brw_inst * 1249brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) 1250{ 1251 const struct intel_device_info *devinfo = p->devinfo; 1252 const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16; 1253 /* The F32TO16 instruction doesn't support 32-bit destination types in 1254 * Align1 mode, and neither 
does the Gfx8 implementation in terms of a 1255 * converting MOV. Gfx7 does zero out the high 16 bits in Align16 mode as 1256 * an undocumented feature. 1257 */ 1258 const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD && 1259 (!align16 || devinfo->ver >= 8)); 1260 brw_inst *inst; 1261 1262 if (align16) { 1263 assert(dst.type == BRW_REGISTER_TYPE_UD); 1264 } else { 1265 assert(dst.type == BRW_REGISTER_TYPE_UD || 1266 dst.type == BRW_REGISTER_TYPE_W || 1267 dst.type == BRW_REGISTER_TYPE_UW || 1268 dst.type == BRW_REGISTER_TYPE_HF); 1269 } 1270 1271 brw_push_insn_state(p); 1272 1273 if (needs_zero_fill) { 1274 brw_set_default_access_mode(p, BRW_ALIGN_1); 1275 dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); 1276 } 1277 1278 if (devinfo->ver >= 8) { 1279 inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src); 1280 } else { 1281 assert(devinfo->ver == 7); 1282 inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src); 1283 } 1284 1285 if (needs_zero_fill) { 1286 if (devinfo->ver < 12) 1287 brw_inst_set_no_dd_clear(devinfo, inst, true); 1288 brw_set_default_swsb(p, tgl_swsb_null()); 1289 inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0)); 1290 if (devinfo->ver < 12) 1291 brw_inst_set_no_dd_check(devinfo, inst, true); 1292 } 1293 1294 brw_pop_insn_state(p); 1295 return inst; 1296} 1297 1298brw_inst * 1299brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src) 1300{ 1301 const struct intel_device_info *devinfo = p->devinfo; 1302 bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16; 1303 1304 if (align16) { 1305 assert(src.type == BRW_REGISTER_TYPE_UD); 1306 } else { 1307 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: 1308 * 1309 * Because this instruction does not have a 16-bit floating-point 1310 * type, the source data type must be Word (W). The destination type 1311 * must be F (Float). 
1312 */ 1313 if (src.type == BRW_REGISTER_TYPE_UD) 1314 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2); 1315 1316 assert(src.type == BRW_REGISTER_TYPE_W || 1317 src.type == BRW_REGISTER_TYPE_UW || 1318 src.type == BRW_REGISTER_TYPE_HF); 1319 } 1320 1321 if (devinfo->ver >= 8) { 1322 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF)); 1323 } else { 1324 assert(devinfo->ver == 7); 1325 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src); 1326 } 1327} 1328 1329 1330void brw_NOP(struct brw_codegen *p) 1331{ 1332 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP); 1333 memset(insn, 0, sizeof(*insn)); 1334 brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP); 1335} 1336 1337void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func) 1338{ 1339 brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC); 1340 brw_inst_set_cond_modifier(p->devinfo, insn, func); 1341} 1342 1343/*********************************************************************** 1344 * Comparisons, if/else/endif 1345 */ 1346 1347brw_inst * 1348brw_JMPI(struct brw_codegen *p, struct brw_reg index, 1349 unsigned predicate_control) 1350{ 1351 const struct intel_device_info *devinfo = p->devinfo; 1352 struct brw_reg ip = brw_ip_reg(); 1353 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); 1354 1355 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1); 1356 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); 1357 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); 1358 brw_inst_set_pred_control(devinfo, inst, predicate_control); 1359 1360 return inst; 1361} 1362 1363static void 1364push_if_stack(struct brw_codegen *p, brw_inst *inst) 1365{ 1366 p->if_stack[p->if_stack_depth] = inst - p->store; 1367 1368 p->if_stack_depth++; 1369 if (p->if_stack_array_size <= p->if_stack_depth) { 1370 p->if_stack_array_size *= 2; 1371 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, 1372 p->if_stack_array_size); 1373 } 1374} 1375 1376static brw_inst * 1377pop_if_stack(struct brw_codegen *p) 1378{ 
1379 p->if_stack_depth--; 1380 return &p->store[p->if_stack[p->if_stack_depth]]; 1381} 1382 1383static void 1384push_loop_stack(struct brw_codegen *p, brw_inst *inst) 1385{ 1386 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) { 1387 p->loop_stack_array_size *= 2; 1388 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, 1389 p->loop_stack_array_size); 1390 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, 1391 p->loop_stack_array_size); 1392 } 1393 1394 p->loop_stack[p->loop_stack_depth] = inst - p->store; 1395 p->loop_stack_depth++; 1396 p->if_depth_in_loop[p->loop_stack_depth] = 0; 1397} 1398 1399static brw_inst * 1400get_inner_do_insn(struct brw_codegen *p) 1401{ 1402 return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; 1403} 1404 1405/* EU takes the value from the flag register and pushes it onto some 1406 * sort of a stack (presumably merging with any flag value already on 1407 * the stack). Within an if block, the flags at the top of the stack 1408 * control execution on each channel of the unit, eg. on each of the 1409 * 16 pixel values in our wm programs. 1410 * 1411 * When the matching 'else' instruction is reached (presumably by 1412 * countdown of the instruction count patched in by our ELSE/ENDIF 1413 * functions), the relevant flags are inverted. 1414 * 1415 * When the matching 'endif' instruction is reached, the flags are 1416 * popped off. If the stack is now empty, normal execution resumes. 
1417 */ 1418brw_inst * 1419brw_IF(struct brw_codegen *p, unsigned execute_size) 1420{ 1421 const struct intel_device_info *devinfo = p->devinfo; 1422 brw_inst *insn; 1423 1424 insn = next_insn(p, BRW_OPCODE_IF); 1425 1426 /* Override the defaults for this instruction: 1427 */ 1428 if (devinfo->ver < 6) { 1429 brw_set_dest(p, insn, brw_ip_reg()); 1430 brw_set_src0(p, insn, brw_ip_reg()); 1431 brw_set_src1(p, insn, brw_imm_d(0x0)); 1432 } else if (devinfo->ver == 6) { 1433 brw_set_dest(p, insn, brw_imm_w(0)); 1434 brw_inst_set_gfx6_jump_count(devinfo, insn, 0); 1435 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1436 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1437 } else if (devinfo->ver == 7) { 1438 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1439 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1440 brw_set_src1(p, insn, brw_imm_w(0)); 1441 brw_inst_set_jip(devinfo, insn, 0); 1442 brw_inst_set_uip(devinfo, insn, 0); 1443 } else { 1444 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1445 if (devinfo->ver < 12) 1446 brw_set_src0(p, insn, brw_imm_d(0)); 1447 brw_inst_set_jip(devinfo, insn, 0); 1448 brw_inst_set_uip(devinfo, insn, 0); 1449 } 1450 1451 brw_inst_set_exec_size(devinfo, insn, execute_size); 1452 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1453 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL); 1454 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE); 1455 if (!p->single_program_flow && devinfo->ver < 6) 1456 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); 1457 1458 push_if_stack(p, insn); 1459 p->if_depth_in_loop[p->loop_stack_depth]++; 1460 return insn; 1461} 1462 1463/* This function is only used for gfx6-style IF instructions with an 1464 * embedded comparison (conditional modifier). It is not used on gfx7. 
1465 */ 1466brw_inst * 1467gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, 1468 struct brw_reg src0, struct brw_reg src1) 1469{ 1470 const struct intel_device_info *devinfo = p->devinfo; 1471 brw_inst *insn; 1472 1473 insn = next_insn(p, BRW_OPCODE_IF); 1474 1475 brw_set_dest(p, insn, brw_imm_w(0)); 1476 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); 1477 brw_inst_set_gfx6_jump_count(devinfo, insn, 0); 1478 brw_set_src0(p, insn, src0); 1479 brw_set_src1(p, insn, src1); 1480 1481 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE); 1482 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); 1483 brw_inst_set_cond_modifier(devinfo, insn, conditional); 1484 1485 push_if_stack(p, insn); 1486 return insn; 1487} 1488 1489/** 1490 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs. 1491 */ 1492static void 1493convert_IF_ELSE_to_ADD(struct brw_codegen *p, 1494 brw_inst *if_inst, brw_inst *else_inst) 1495{ 1496 const struct intel_device_info *devinfo = p->devinfo; 1497 1498 /* The next instruction (where the ENDIF would be, if it existed) */ 1499 brw_inst *next_inst = &p->store[p->nr_insn]; 1500 1501 assert(p->single_program_flow); 1502 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF); 1503 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE); 1504 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1); 1505 1506 /* Convert IF to an ADD instruction that moves the instruction pointer 1507 * to the first instruction of the ELSE block. If there is no ELSE 1508 * block, point to where ENDIF would be. Reverse the predicate. 1509 * 1510 * There's no need to execute an ENDIF since we don't need to do any 1511 * stack operations, and if we're currently executing, we just want to 1512 * continue normally. 
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * All distances are instruction-pointer differences scaled by
 * brw_jump_scale().  The ENDIF (and the ELSE, when present) also inherit
 * the execution size of the IF.
 *
 * \param p           code generator state
 * \param if_inst     the IF instruction to patch; must not be NULL
 * \param else_inst   the matching ELSE, or NULL when there is none
 * \param endif_inst  the matching ENDIF; must not be NULL
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Scale factor applied to every instruction-count distance below. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gfx7+: with no ELSE, both UIP and JIP point at the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }
      /* On Gfx7+ the IF is patched together with the ELSE below. */

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
         /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
         /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
/**
 * Close the innermost open IF (and optional ELSE).
 *
 * The IF/ELSE instructions are popped from the IF stack.  Normally an ENDIF
 * is emitted and the IF/ELSE are patched to jump to it.  Before Gfx6 in
 * single-program-flow mode no ENDIF is emitted; the IF/ELSE are converted
 * into ADDs on the instruction pointer instead.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before turning the stacked
    * indices back into instruction pointers.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Generation-specific placeholder operands for the ENDIF. */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

/**
 * Emit a BREAK with placeholder operands.
 *
 * Before Gfx6 the pop count is set to the number of IFs currently open
 * inside the innermost loop.
 *
 * \return the emitted BREAK instruction.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}

/**
 * Emit a CONTINUE with placeholder operands.
 *
 * Before Gfx6 the pop count is set to the number of IFs currently open
 * inside the innermost loop.
 *
 * \return the emitted CONTINUE instruction.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->ver >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
*devinfo = p->devinfo; 1796 brw_inst *insn; 1797 1798 insn = next_insn(p, BRW_OPCODE_HALT); 1799 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1800 if (devinfo->ver < 6) { 1801 /* From the Gfx4 PRM: 1802 * 1803 * "IP register must be put (for example, by the assembler) at <dst> 1804 * and <src0> locations. 1805 */ 1806 brw_set_dest(p, insn, brw_ip_reg()); 1807 brw_set_src0(p, insn, brw_ip_reg()); 1808 brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */ 1809 } else if (devinfo->ver < 8) { 1810 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1811 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ 1812 } else if (devinfo->ver < 12) { 1813 brw_set_src0(p, insn, brw_imm_d(0x0)); 1814 } 1815 1816 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1817 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); 1818 return insn; 1819} 1820 1821/* DO/WHILE loop: 1822 * 1823 * The DO/WHILE is just an unterminated loop -- break or continue are 1824 * used for control within the loop. We have a few ways they can be 1825 * done. 1826 * 1827 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip, 1828 * jip and no DO instruction. 1829 * 1830 * For non-uniform control flow pre-gfx6, there's a DO instruction to 1831 * push the mask, and a WHILE to jump back, and BREAK to get out and 1832 * pop the mask. 1833 * 1834 * For gfx6, there's no more mask stack, so no need for DO. WHILE 1835 * just points back to the first instruction of the loop. 
1836 */ 1837brw_inst * 1838brw_DO(struct brw_codegen *p, unsigned execute_size) 1839{ 1840 const struct intel_device_info *devinfo = p->devinfo; 1841 1842 if (devinfo->ver >= 6 || p->single_program_flow) { 1843 push_loop_stack(p, &p->store[p->nr_insn]); 1844 return &p->store[p->nr_insn]; 1845 } else { 1846 brw_inst *insn = next_insn(p, BRW_OPCODE_DO); 1847 1848 push_loop_stack(p, insn); 1849 1850 /* Override the defaults for this instruction: 1851 */ 1852 brw_set_dest(p, insn, brw_null_reg()); 1853 brw_set_src0(p, insn, brw_null_reg()); 1854 brw_set_src1(p, insn, brw_null_reg()); 1855 1856 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1857 brw_inst_set_exec_size(devinfo, insn, execute_size); 1858 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 1859 1860 return insn; 1861 } 1862} 1863 1864/** 1865 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE 1866 * instruction here. 1867 * 1868 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop 1869 * nesting, since it can always just point to the end of the block/current loop. 1870 */ 1871static void 1872brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst) 1873{ 1874 const struct intel_device_info *devinfo = p->devinfo; 1875 brw_inst *do_inst = get_inner_do_insn(p); 1876 brw_inst *inst; 1877 unsigned br = brw_jump_scale(devinfo); 1878 1879 assert(devinfo->ver < 6); 1880 1881 for (inst = while_inst - 1; inst != do_inst; inst--) { 1882 /* If the jump count is != 0, that means that this instruction has already 1883 * been patched because it's part of a loop inside of the one we're 1884 * patching. 
1885 */ 1886 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK && 1887 brw_inst_gfx4_jump_count(devinfo, inst) == 0) { 1888 brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1)); 1889 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE && 1890 brw_inst_gfx4_jump_count(devinfo, inst) == 0) { 1891 brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst)); 1892 } 1893 } 1894} 1895 1896brw_inst * 1897brw_WHILE(struct brw_codegen *p) 1898{ 1899 const struct intel_device_info *devinfo = p->devinfo; 1900 brw_inst *insn, *do_insn; 1901 unsigned br = brw_jump_scale(devinfo); 1902 1903 if (devinfo->ver >= 6) { 1904 insn = next_insn(p, BRW_OPCODE_WHILE); 1905 do_insn = get_inner_do_insn(p); 1906 1907 if (devinfo->ver >= 8) { 1908 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1909 if (devinfo->ver < 12) 1910 brw_set_src0(p, insn, brw_imm_d(0)); 1911 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); 1912 } else if (devinfo->ver == 7) { 1913 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1914 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1915 brw_set_src1(p, insn, brw_imm_w(0)); 1916 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn)); 1917 } else { 1918 brw_set_dest(p, insn, brw_imm_w(0)); 1919 brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn)); 1920 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1921 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1922 } 1923 1924 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p)); 1925 1926 } else { 1927 if (p->single_program_flow) { 1928 insn = next_insn(p, BRW_OPCODE_ADD); 1929 do_insn = get_inner_do_insn(p); 1930 1931 brw_set_dest(p, insn, brw_ip_reg()); 1932 brw_set_src0(p, insn, brw_ip_reg()); 1933 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16)); 1934 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); 1935 } else { 1936 insn = 
next_insn(p, BRW_OPCODE_WHILE); 1937 do_insn = get_inner_do_insn(p); 1938 1939 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO); 1940 1941 brw_set_dest(p, insn, brw_ip_reg()); 1942 brw_set_src0(p, insn, brw_ip_reg()); 1943 brw_set_src1(p, insn, brw_imm_d(0)); 1944 1945 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn)); 1946 brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1)); 1947 brw_inst_set_gfx4_pop_count(devinfo, insn, 0); 1948 1949 brw_patch_break_cont(p, insn); 1950 } 1951 } 1952 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1953 1954 p->loop_stack_depth--; 1955 1956 return insn; 1957} 1958 1959/* FORWARD JUMPS: 1960 */ 1961void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx) 1962{ 1963 const struct intel_device_info *devinfo = p->devinfo; 1964 brw_inst *jmp_insn = &p->store[jmp_insn_idx]; 1965 unsigned jmpi = 1; 1966 1967 if (devinfo->ver >= 5) 1968 jmpi = 2; 1969 1970 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI); 1971 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE); 1972 1973 brw_inst_set_gfx4_jump_count(devinfo, jmp_insn, 1974 jmpi * (p->nr_insn - jmp_insn_idx - 1)); 1975} 1976 1977/* To integrate with the above, it makes sense that the comparison 1978 * instruction should populate the flag register. It might be simpler 1979 * just to use the flag reg for most WM tasks? 
1980 */ 1981void brw_CMP(struct brw_codegen *p, 1982 struct brw_reg dest, 1983 unsigned conditional, 1984 struct brw_reg src0, 1985 struct brw_reg src1) 1986{ 1987 const struct intel_device_info *devinfo = p->devinfo; 1988 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP); 1989 1990 brw_inst_set_cond_modifier(devinfo, insn, conditional); 1991 brw_set_dest(p, insn, dest); 1992 brw_set_src0(p, insn, src0); 1993 brw_set_src1(p, insn, src1); 1994 1995 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds 1996 * page says: 1997 * "Any CMP instruction with a null destination must use a {switch}." 1998 * 1999 * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't 2000 * mentioned on their work-arounds pages. 2001 */ 2002 if (devinfo->ver == 7) { 2003 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && 2004 dest.nr == BRW_ARF_NULL) { 2005 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); 2006 } 2007 } 2008} 2009 2010void brw_CMPN(struct brw_codegen *p, 2011 struct brw_reg dest, 2012 unsigned conditional, 2013 struct brw_reg src0, 2014 struct brw_reg src1) 2015{ 2016 const struct intel_device_info *devinfo = p->devinfo; 2017 brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN); 2018 2019 brw_inst_set_cond_modifier(devinfo, insn, conditional); 2020 brw_set_dest(p, insn, dest); 2021 brw_set_src0(p, insn, src0); 2022 brw_set_src1(p, insn, src1); 2023 2024 /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA) 2025 * says: 2026 * 2027 * If the destination is the null register, the {Switch} instruction 2028 * option must be used. 2029 * 2030 * Page 77 of the Haswell PRM Volume 2b contains the same text. 
2031 */ 2032 if (devinfo->ver == 7) { 2033 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && 2034 dest.nr == BRW_ARF_NULL) { 2035 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH); 2036 } 2037 } 2038} 2039 2040/*********************************************************************** 2041 * Helpers for the various SEND message types: 2042 */ 2043 2044/** Extended math function, float[8]. 2045 */ 2046void gfx4_math(struct brw_codegen *p, 2047 struct brw_reg dest, 2048 unsigned function, 2049 unsigned msg_reg_nr, 2050 struct brw_reg src, 2051 unsigned precision ) 2052{ 2053 const struct intel_device_info *devinfo = p->devinfo; 2054 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 2055 unsigned data_type; 2056 if (has_scalar_region(src)) { 2057 data_type = BRW_MATH_DATA_SCALAR; 2058 } else { 2059 data_type = BRW_MATH_DATA_VECTOR; 2060 } 2061 2062 assert(devinfo->ver < 6); 2063 2064 /* Example code doesn't set predicate_control for send 2065 * instructions. 2066 */ 2067 brw_inst_set_pred_control(devinfo, insn, 0); 2068 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); 2069 2070 brw_set_dest(p, insn, dest); 2071 brw_set_src0(p, insn, src); 2072 brw_set_math_message(p, 2073 insn, 2074 function, 2075 src.type == BRW_REGISTER_TYPE_D, 2076 precision, 2077 data_type); 2078} 2079 2080void gfx6_math(struct brw_codegen *p, 2081 struct brw_reg dest, 2082 unsigned function, 2083 struct brw_reg src0, 2084 struct brw_reg src1) 2085{ 2086 const struct intel_device_info *devinfo = p->devinfo; 2087 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH); 2088 2089 assert(devinfo->ver >= 6); 2090 2091 assert(dest.file == BRW_GENERAL_REGISTER_FILE || 2092 (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE)); 2093 2094 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1); 2095 if (devinfo->ver == 6) { 2096 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1); 2097 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1); 2098 } 2099 2100 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT || 2101 
function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER || 2102 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) { 2103 assert(src0.type != BRW_REGISTER_TYPE_F); 2104 assert(src1.type != BRW_REGISTER_TYPE_F); 2105 assert(src1.file == BRW_GENERAL_REGISTER_FILE || 2106 (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE)); 2107 /* From BSpec 6647/47428 "[Instruction] Extended Math Function": 2108 * INT DIV function does not support source modifiers. 2109 */ 2110 assert(!src0.negate); 2111 assert(!src0.abs); 2112 assert(!src1.negate); 2113 assert(!src1.abs); 2114 } else { 2115 assert(src0.type == BRW_REGISTER_TYPE_F || 2116 (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); 2117 assert(src1.type == BRW_REGISTER_TYPE_F || 2118 (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9)); 2119 } 2120 2121 /* Source modifiers are ignored for extended math instructions on Gfx6. */ 2122 if (devinfo->ver == 6) { 2123 assert(!src0.negate); 2124 assert(!src0.abs); 2125 assert(!src1.negate); 2126 assert(!src1.abs); 2127 } 2128 2129 brw_inst_set_math_function(devinfo, insn, function); 2130 2131 brw_set_dest(p, insn, dest); 2132 brw_set_src0(p, insn, src0); 2133 brw_set_src1(p, insn, src1); 2134} 2135 2136/** 2137 * Return the right surface index to access the thread scratch space using 2138 * stateless dataport messages. 2139 */ 2140unsigned 2141brw_scratch_surface_idx(const struct brw_codegen *p) 2142{ 2143 /* The scratch space is thread-local so IA coherency is unnecessary. */ 2144 if (p->devinfo->ver >= 8) 2145 return GFX8_BTI_STATELESS_NON_COHERENT; 2146 else 2147 return BRW_BTI_STATELESS; 2148} 2149 2150/** 2151 * Write a block of OWORDs (half a GRF each) from the scratch buffer, 2152 * using a constant offset per channel. 2153 * 2154 * The offset must be aligned to oword size (16 bytes). Used for 2155 * register spilling. 
2156 */ 2157void brw_oword_block_write_scratch(struct brw_codegen *p, 2158 struct brw_reg mrf, 2159 int num_regs, 2160 unsigned offset) 2161{ 2162 const struct intel_device_info *devinfo = p->devinfo; 2163 const unsigned target_cache = 2164 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : 2165 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE : 2166 BRW_SFID_DATAPORT_WRITE); 2167 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2168 uint32_t msg_type; 2169 2170 if (devinfo->ver >= 6) 2171 offset /= 16; 2172 2173 mrf = retype(mrf, BRW_REGISTER_TYPE_UD); 2174 2175 const unsigned mlen = 1 + num_regs; 2176 2177 /* Set up the message header. This is g0, with g0.2 filled with 2178 * the offset. We don't want to leave our offset around in g0 or 2179 * it'll screw up texture samples, so set it up inside the message 2180 * reg. 2181 */ 2182 { 2183 brw_push_insn_state(p); 2184 brw_set_default_exec_size(p, BRW_EXECUTE_8); 2185 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2186 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 2187 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2188 2189 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2190 2191 /* set message header global offset field (reg 0, element 2) */ 2192 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2193 brw_set_default_swsb(p, tgl_swsb_null()); 2194 brw_MOV(p, 2195 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 2196 mrf.nr, 2197 2), BRW_REGISTER_TYPE_UD), 2198 brw_imm_ud(offset)); 2199 2200 brw_pop_insn_state(p); 2201 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2202 } 2203 2204 { 2205 struct brw_reg dest; 2206 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 2207 int send_commit_msg; 2208 struct brw_reg src_header = retype(brw_vec8_grf(0, 0), 2209 BRW_REGISTER_TYPE_UW); 2210 2211 brw_inst_set_sfid(devinfo, insn, target_cache); 2212 brw_inst_set_compression(devinfo, insn, false); 2213 2214 if (brw_inst_exec_size(devinfo, insn) >= 16) 2215 src_header = 
vec16(src_header); 2216 2217 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); 2218 if (devinfo->ver < 6) 2219 brw_inst_set_base_mrf(devinfo, insn, mrf.nr); 2220 2221 /* Until gfx6, writes followed by reads from the same location 2222 * are not guaranteed to be ordered unless write_commit is set. 2223 * If set, then a no-op write is issued to the destination 2224 * register to set a dependency, and a read from the destination 2225 * can be used to ensure the ordering. 2226 * 2227 * For gfx6, only writes between different threads need ordering 2228 * protection. Our use of DP writes is all about register 2229 * spilling within a thread. 2230 */ 2231 if (devinfo->ver >= 6) { 2232 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); 2233 send_commit_msg = 0; 2234 } else { 2235 dest = src_header; 2236 send_commit_msg = 1; 2237 } 2238 2239 brw_set_dest(p, insn, dest); 2240 if (devinfo->ver >= 6) { 2241 brw_set_src0(p, insn, mrf); 2242 } else { 2243 brw_set_src0(p, insn, brw_null_reg()); 2244 } 2245 2246 if (devinfo->ver >= 6) 2247 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; 2248 else 2249 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE; 2250 2251 brw_set_desc(p, insn, 2252 brw_message_desc(devinfo, mlen, send_commit_msg, true) | 2253 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p), 2254 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), 2255 msg_type, send_commit_msg)); 2256 } 2257} 2258 2259 2260/** 2261 * Read a block of owords (half a GRF each) from the scratch buffer 2262 * using a constant index per channel. 2263 * 2264 * Offset must be aligned to oword size (16 bytes). Used for register 2265 * spilling. 
2266 */ 2267void 2268brw_oword_block_read_scratch(struct brw_codegen *p, 2269 struct brw_reg dest, 2270 struct brw_reg mrf, 2271 int num_regs, 2272 unsigned offset) 2273{ 2274 const struct intel_device_info *devinfo = p->devinfo; 2275 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2276 2277 if (devinfo->ver >= 6) 2278 offset /= 16; 2279 2280 if (p->devinfo->ver >= 7) { 2281 /* On gen 7 and above, we no longer have message registers and we can 2282 * send from any register we want. By using the destination register 2283 * for the message, we guarantee that the implied message write won't 2284 * accidentally overwrite anything. This has been a problem because 2285 * the MRF registers and source for the final FB write are both fixed 2286 * and may overlap. 2287 */ 2288 mrf = retype(dest, BRW_REGISTER_TYPE_UD); 2289 } else { 2290 mrf = retype(mrf, BRW_REGISTER_TYPE_UD); 2291 } 2292 dest = retype(dest, BRW_REGISTER_TYPE_UW); 2293 2294 const unsigned rlen = num_regs; 2295 const unsigned target_cache = 2296 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : 2297 devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE : 2298 BRW_SFID_DATAPORT_READ); 2299 2300 { 2301 brw_push_insn_state(p); 2302 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2303 brw_set_default_exec_size(p, BRW_EXECUTE_8); 2304 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 2305 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2306 2307 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2308 2309 /* set message header global offset field (reg 0, element 2) */ 2310 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2311 brw_set_default_swsb(p, tgl_swsb_null()); 2312 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset)); 2313 2314 brw_pop_insn_state(p); 2315 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2316 } 2317 2318 { 2319 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 2320 2321 brw_inst_set_sfid(devinfo, insn, target_cache); 2322 assert(brw_inst_pred_control(devinfo, insn) == 0); 2323 brw_inst_set_compression(devinfo, insn, false); 2324 2325 brw_set_dest(p, insn, dest); /* UW? */ 2326 if (devinfo->ver >= 6) { 2327 brw_set_src0(p, insn, mrf); 2328 } else { 2329 brw_set_src0(p, insn, brw_null_reg()); 2330 brw_inst_set_base_mrf(devinfo, insn, mrf.nr); 2331 } 2332 2333 brw_set_desc(p, insn, 2334 brw_message_desc(devinfo, 1, rlen, true) | 2335 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p), 2336 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8), 2337 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, 2338 BRW_DATAPORT_READ_TARGET_RENDER_CACHE)); 2339 } 2340} 2341 2342void 2343gfx7_block_read_scratch(struct brw_codegen *p, 2344 struct brw_reg dest, 2345 int num_regs, 2346 unsigned offset) 2347{ 2348 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 2349 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE); 2350 2351 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW)); 2352 2353 /* The HW requires that the header is present; this is to get the g0.5 2354 * scratch offset. 
2355 */ 2356 brw_set_src0(p, insn, brw_vec8_grf(0, 0)); 2357 2358 /* According to the docs, offset is "A 12-bit HWord offset into the memory 2359 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD 2360 * is 32 bytes, which happens to be the size of a register. 2361 */ 2362 offset /= REG_SIZE; 2363 assert(offset < (1 << 12)); 2364 2365 gfx7_set_dp_scratch_message(p, insn, 2366 false, /* scratch read */ 2367 false, /* OWords */ 2368 false, /* invalidate after read */ 2369 num_regs, 2370 offset, 2371 1, /* mlen: just g0 */ 2372 num_regs, /* rlen */ 2373 true); /* header present */ 2374} 2375 2376/** 2377 * Read float[4] vectors from the data port constant cache. 2378 * Location (in buffer) should be a multiple of 16. 2379 * Used for fetching shader constants. 2380 */ 2381void brw_oword_block_read(struct brw_codegen *p, 2382 struct brw_reg dest, 2383 struct brw_reg mrf, 2384 uint32_t offset, 2385 uint32_t bind_table_index) 2386{ 2387 const struct intel_device_info *devinfo = p->devinfo; 2388 const unsigned target_cache = 2389 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE : 2390 BRW_SFID_DATAPORT_READ); 2391 const unsigned exec_size = 1 << brw_get_default_exec_size(p); 2392 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2393 2394 /* On newer hardware, offset is in units of owords. 
*/ 2395 if (devinfo->ver >= 6) 2396 offset /= 16; 2397 2398 mrf = retype(mrf, BRW_REGISTER_TYPE_UD); 2399 2400 brw_push_insn_state(p); 2401 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 2402 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 2403 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2404 2405 brw_push_insn_state(p); 2406 brw_set_default_exec_size(p, BRW_EXECUTE_8); 2407 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2408 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2409 2410 /* set message header global offset field (reg 0, element 2) */ 2411 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2412 brw_set_default_swsb(p, tgl_swsb_null()); 2413 brw_MOV(p, 2414 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 2415 mrf.nr, 2416 2), BRW_REGISTER_TYPE_UD), 2417 brw_imm_ud(offset)); 2418 brw_pop_insn_state(p); 2419 2420 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2421 2422 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 2423 2424 brw_inst_set_sfid(devinfo, insn, target_cache); 2425 2426 /* cast dest to a uword[8] vector */ 2427 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW); 2428 2429 brw_set_dest(p, insn, dest); 2430 if (devinfo->ver >= 6) { 2431 brw_set_src0(p, insn, mrf); 2432 } else { 2433 brw_set_src0(p, insn, brw_null_reg()); 2434 brw_inst_set_base_mrf(devinfo, insn, mrf.nr); 2435 } 2436 2437 brw_set_desc(p, insn, 2438 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) | 2439 brw_dp_read_desc(devinfo, bind_table_index, 2440 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size), 2441 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, 2442 BRW_DATAPORT_READ_TARGET_DATA_CACHE)); 2443 2444 brw_pop_insn_state(p); 2445} 2446 2447brw_inst * 2448brw_fb_WRITE(struct brw_codegen *p, 2449 struct brw_reg payload, 2450 struct brw_reg implied_header, 2451 unsigned msg_control, 2452 unsigned binding_table_index, 2453 unsigned msg_length, 2454 unsigned response_length, 2455 bool eot, 2456 bool last_render_target, 2457 bool 
header_present) 2458{ 2459 const struct intel_device_info *devinfo = p->devinfo; 2460 const unsigned target_cache = 2461 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE : 2462 BRW_SFID_DATAPORT_WRITE); 2463 brw_inst *insn; 2464 struct brw_reg dest, src0; 2465 2466 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16) 2467 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); 2468 else 2469 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW); 2470 2471 if (devinfo->ver >= 6) { 2472 insn = next_insn(p, BRW_OPCODE_SENDC); 2473 } else { 2474 insn = next_insn(p, BRW_OPCODE_SEND); 2475 } 2476 brw_inst_set_sfid(devinfo, insn, target_cache); 2477 brw_inst_set_compression(devinfo, insn, false); 2478 2479 if (devinfo->ver >= 6) { 2480 /* headerless version, just submit color payload */ 2481 src0 = payload; 2482 } else { 2483 assert(payload.file == BRW_MESSAGE_REGISTER_FILE); 2484 brw_inst_set_base_mrf(devinfo, insn, payload.nr); 2485 src0 = implied_header; 2486 } 2487 2488 brw_set_dest(p, insn, dest); 2489 brw_set_src0(p, insn, src0); 2490 brw_set_desc(p, insn, 2491 brw_message_desc(devinfo, msg_length, response_length, 2492 header_present) | 2493 brw_fb_write_desc(devinfo, binding_table_index, msg_control, 2494 last_render_target, 2495 false /* coarse_write */)); 2496 brw_inst_set_eot(devinfo, insn, eot); 2497 2498 return insn; 2499} 2500 2501brw_inst * 2502gfx9_fb_READ(struct brw_codegen *p, 2503 struct brw_reg dst, 2504 struct brw_reg payload, 2505 unsigned binding_table_index, 2506 unsigned msg_length, 2507 unsigned response_length, 2508 bool per_sample) 2509{ 2510 const struct intel_device_info *devinfo = p->devinfo; 2511 assert(devinfo->ver >= 9); 2512 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC); 2513 2514 brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE); 2515 brw_set_dest(p, insn, dst); 2516 brw_set_src0(p, insn, payload); 2517 brw_set_desc( 2518 p, insn, 2519 brw_message_desc(devinfo, msg_length, response_length, true) | 2520 
brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */, 2521 1 << brw_get_default_exec_size(p), per_sample)); 2522 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16); 2523 2524 return insn; 2525} 2526 2527/** 2528 * Texture sample instruction. 2529 * Note: the msg_type plus msg_length values determine exactly what kind 2530 * of sampling operation is performed. See volume 4, page 161 of docs. 2531 */ 2532void brw_SAMPLE(struct brw_codegen *p, 2533 struct brw_reg dest, 2534 unsigned msg_reg_nr, 2535 struct brw_reg src0, 2536 unsigned binding_table_index, 2537 unsigned sampler, 2538 unsigned msg_type, 2539 unsigned response_length, 2540 unsigned msg_length, 2541 unsigned header_present, 2542 unsigned simd_mode, 2543 unsigned return_format) 2544{ 2545 const struct intel_device_info *devinfo = p->devinfo; 2546 brw_inst *insn; 2547 2548 if (msg_reg_nr != -1) 2549 gfx6_resolve_implied_move(p, &src0, msg_reg_nr); 2550 2551 insn = next_insn(p, BRW_OPCODE_SEND); 2552 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER); 2553 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */ 2554 2555 /* From the 965 PRM (volume 4, part 1, section 14.2.41): 2556 * 2557 * "Instruction compression is not allowed for this instruction (that 2558 * is, send). The hardware behavior is undefined if this instruction is 2559 * set as compressed. However, compress control can be set to "SecHalf" 2560 * to affect the EMask generation." 2561 * 2562 * No similar wording is found in later PRMs, but there are examples 2563 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages 2564 * are allowed in SIMD16 mode and they could not work without SecHalf. For 2565 * these reasons, we allow BRW_COMPRESSION_2NDHALF here. 
2566 */ 2567 brw_inst_set_compression(devinfo, insn, false); 2568 2569 if (devinfo->ver < 6) 2570 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); 2571 2572 brw_set_dest(p, insn, dest); 2573 brw_set_src0(p, insn, src0); 2574 brw_set_desc(p, insn, 2575 brw_message_desc(devinfo, msg_length, response_length, 2576 header_present) | 2577 brw_sampler_desc(devinfo, binding_table_index, sampler, 2578 msg_type, simd_mode, return_format)); 2579} 2580 2581/* Adjust the message header's sampler state pointer to 2582 * select the correct group of 16 samplers. 2583 */ 2584void brw_adjust_sampler_state_pointer(struct brw_codegen *p, 2585 struct brw_reg header, 2586 struct brw_reg sampler_index) 2587{ 2588 /* The "Sampler Index" field can only store values between 0 and 15. 2589 * However, we can add an offset to the "Sampler State Pointer" 2590 * field, effectively selecting a different set of 16 samplers. 2591 * 2592 * The "Sampler State Pointer" needs to be aligned to a 32-byte 2593 * offset, and each sampler state is only 16-bytes, so we can't 2594 * exclusively use the offset - we have to use both. 
2595 */ 2596 2597 const struct intel_device_info *devinfo = p->devinfo; 2598 2599 if (sampler_index.file == BRW_IMMEDIATE_VALUE) { 2600 const int sampler_state_size = 16; /* 16 bytes */ 2601 uint32_t sampler = sampler_index.ud; 2602 2603 if (sampler >= 16) { 2604 assert(devinfo->verx10 >= 75); 2605 brw_ADD(p, 2606 get_element_ud(header, 3), 2607 get_element_ud(brw_vec8_grf(0, 0), 3), 2608 brw_imm_ud(16 * (sampler / 16) * sampler_state_size)); 2609 } 2610 } else { 2611 /* Non-const sampler array indexing case */ 2612 if (devinfo->verx10 <= 70) { 2613 return; 2614 } 2615 2616 struct brw_reg temp = get_element_ud(header, 3); 2617 2618 brw_push_insn_state(p); 2619 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0)); 2620 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 2621 brw_SHL(p, temp, temp, brw_imm_ud(4)); 2622 brw_ADD(p, 2623 get_element_ud(header, 3), 2624 get_element_ud(brw_vec8_grf(0, 0), 3), 2625 temp); 2626 brw_pop_insn_state(p); 2627 } 2628} 2629 2630/* All these variables are pretty confusing - we might be better off 2631 * using bitmasks and macros for this, in the old style. Or perhaps 2632 * just having the caller instantiate the fields in dword3 itself. 
2633 */ 2634void brw_urb_WRITE(struct brw_codegen *p, 2635 struct brw_reg dest, 2636 unsigned msg_reg_nr, 2637 struct brw_reg src0, 2638 enum brw_urb_write_flags flags, 2639 unsigned msg_length, 2640 unsigned response_length, 2641 unsigned offset, 2642 unsigned swizzle) 2643{ 2644 const struct intel_device_info *devinfo = p->devinfo; 2645 brw_inst *insn; 2646 2647 gfx6_resolve_implied_move(p, &src0, msg_reg_nr); 2648 2649 if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { 2650 /* Enable Channel Masks in the URB_WRITE_HWORD message header */ 2651 brw_push_insn_state(p); 2652 brw_set_default_access_mode(p, BRW_ALIGN_1); 2653 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2654 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2655 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5), 2656 BRW_REGISTER_TYPE_UD), 2657 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 2658 brw_imm_ud(0xff00)); 2659 brw_pop_insn_state(p); 2660 } 2661 2662 insn = next_insn(p, BRW_OPCODE_SEND); 2663 2664 assert(msg_length < BRW_MAX_MRF(devinfo->ver)); 2665 2666 brw_set_dest(p, insn, dest); 2667 brw_set_src0(p, insn, src0); 2668 brw_set_src1(p, insn, brw_imm_d(0)); 2669 2670 if (devinfo->ver < 6) 2671 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); 2672 2673 brw_set_urb_message(p, 2674 insn, 2675 flags, 2676 msg_length, 2677 response_length, 2678 offset, 2679 swizzle); 2680} 2681 2682void 2683brw_send_indirect_message(struct brw_codegen *p, 2684 unsigned sfid, 2685 struct brw_reg dst, 2686 struct brw_reg payload, 2687 struct brw_reg desc, 2688 unsigned desc_imm, 2689 bool eot) 2690{ 2691 const struct intel_device_info *devinfo = p->devinfo; 2692 struct brw_inst *send; 2693 2694 dst = retype(dst, BRW_REGISTER_TYPE_UW); 2695 2696 assert(desc.type == BRW_REGISTER_TYPE_UD); 2697 2698 if (desc.file == BRW_IMMEDIATE_VALUE) { 2699 send = next_insn(p, BRW_OPCODE_SEND); 2700 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 2701 brw_set_desc(p, 
send, desc.ud | desc_imm); 2702 } else { 2703 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2704 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); 2705 2706 brw_push_insn_state(p); 2707 brw_set_default_access_mode(p, BRW_ALIGN_1); 2708 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2709 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2710 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 2711 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2712 2713 /* Load the indirect descriptor to an address register using OR so the 2714 * caller can specify additional descriptor bits with the desc_imm 2715 * immediate. 2716 */ 2717 brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); 2718 2719 brw_pop_insn_state(p); 2720 2721 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2722 send = next_insn(p, BRW_OPCODE_SEND); 2723 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 2724 2725 if (devinfo->ver >= 12) 2726 brw_inst_set_send_sel_reg32_desc(devinfo, send, true); 2727 else 2728 brw_set_src1(p, send, addr); 2729 } 2730 2731 brw_set_dest(p, send, dst); 2732 brw_inst_set_sfid(devinfo, send, sfid); 2733 brw_inst_set_eot(devinfo, send, eot); 2734} 2735 2736void 2737brw_send_indirect_split_message(struct brw_codegen *p, 2738 unsigned sfid, 2739 struct brw_reg dst, 2740 struct brw_reg payload0, 2741 struct brw_reg payload1, 2742 struct brw_reg desc, 2743 unsigned desc_imm, 2744 struct brw_reg ex_desc, 2745 unsigned ex_desc_imm, 2746 bool eot) 2747{ 2748 const struct intel_device_info *devinfo = p->devinfo; 2749 struct brw_inst *send; 2750 2751 dst = retype(dst, BRW_REGISTER_TYPE_UW); 2752 2753 assert(desc.type == BRW_REGISTER_TYPE_UD); 2754 2755 if (desc.file == BRW_IMMEDIATE_VALUE) { 2756 desc.ud |= desc_imm; 2757 } else { 2758 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2759 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); 2760 2761 brw_push_insn_state(p); 2762 brw_set_default_access_mode(p, BRW_ALIGN_1); 
2763 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2764 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2765 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 2766 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2767 2768 /* Load the indirect descriptor to an address register using OR so the 2769 * caller can specify additional descriptor bits with the desc_imm 2770 * immediate. 2771 */ 2772 brw_OR(p, addr, desc, brw_imm_ud(desc_imm)); 2773 2774 brw_pop_insn_state(p); 2775 desc = addr; 2776 2777 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2778 } 2779 2780 if (ex_desc.file == BRW_IMMEDIATE_VALUE && 2781 (devinfo->ver >= 12 || 2782 ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) { 2783 ex_desc.ud |= ex_desc_imm; 2784 } else { 2785 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2786 struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD); 2787 2788 brw_push_insn_state(p); 2789 brw_set_default_access_mode(p, BRW_ALIGN_1); 2790 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2791 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2792 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 2793 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2794 2795 /* Load the indirect extended descriptor to an address register using OR 2796 * so the caller can specify additional descriptor bits with the 2797 * desc_imm immediate. 2798 * 2799 * Even though the instruction dispatcher always pulls the SFID and EOT 2800 * fields from the instruction itself, actual external unit which 2801 * processes the message gets the SFID and EOT from the extended 2802 * descriptor which comes from the address register. If we don't OR 2803 * those two bits in, the external unit may get confused and hang. 
2804 */ 2805 unsigned imm_part = ex_desc_imm | sfid | eot << 5; 2806 2807 if (ex_desc.file == BRW_IMMEDIATE_VALUE) { 2808 /* ex_desc bits 15:12 don't exist in the instruction encoding prior 2809 * to Gfx12, so we may have fallen back to an indirect extended 2810 * descriptor. 2811 */ 2812 brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part)); 2813 } else { 2814 brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part)); 2815 } 2816 2817 brw_pop_insn_state(p); 2818 ex_desc = addr; 2819 2820 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2821 } 2822 2823 send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS); 2824 brw_set_dest(p, send, dst); 2825 brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD)); 2826 brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD)); 2827 2828 if (desc.file == BRW_IMMEDIATE_VALUE) { 2829 brw_inst_set_send_sel_reg32_desc(devinfo, send, 0); 2830 brw_inst_set_send_desc(devinfo, send, desc.ud); 2831 } else { 2832 assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE); 2833 assert(desc.nr == BRW_ARF_ADDRESS); 2834 assert(desc.subnr == 0); 2835 brw_inst_set_send_sel_reg32_desc(devinfo, send, 1); 2836 } 2837 2838 if (ex_desc.file == BRW_IMMEDIATE_VALUE) { 2839 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0); 2840 brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud); 2841 } else { 2842 assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE); 2843 assert(ex_desc.nr == BRW_ARF_ADDRESS); 2844 assert((ex_desc.subnr & 0x3) == 0); 2845 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1); 2846 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2); 2847 } 2848 2849 brw_inst_set_sfid(devinfo, send, sfid); 2850 brw_inst_set_eot(devinfo, send, eot); 2851} 2852 2853static void 2854brw_send_indirect_surface_message(struct brw_codegen *p, 2855 unsigned sfid, 2856 struct brw_reg dst, 2857 struct brw_reg payload, 2858 struct brw_reg surface, 2859 unsigned desc_imm) 2860{ 2861 if (surface.file != 
BRW_IMMEDIATE_VALUE) { 2862 const struct tgl_swsb swsb = brw_get_default_swsb(p); 2863 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); 2864 2865 brw_push_insn_state(p); 2866 brw_set_default_access_mode(p, BRW_ALIGN_1); 2867 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2868 brw_set_default_exec_size(p, BRW_EXECUTE_1); 2869 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 2870 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb)); 2871 2872 /* Mask out invalid bits from the surface index to avoid hangs e.g. when 2873 * some surface array is accessed out of bounds. 2874 */ 2875 brw_AND(p, addr, 2876 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)), 2877 BRW_GET_SWZ(surface.swizzle, 0)), 2878 brw_imm_ud(0xff)); 2879 2880 brw_pop_insn_state(p); 2881 2882 surface = addr; 2883 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1)); 2884 } 2885 2886 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false); 2887} 2888 2889static bool 2890while_jumps_before_offset(const struct intel_device_info *devinfo, 2891 brw_inst *insn, int while_offset, int start_offset) 2892{ 2893 int scale = 16 / brw_jump_scale(devinfo); 2894 int jip = devinfo->ver == 6 ? 
brw_inst_gfx6_jump_count(devinfo, insn) 2895 : brw_inst_jip(devinfo, insn); 2896 assert(jip < 0); 2897 return while_offset + jip * scale <= start_offset; 2898} 2899 2900 2901static int 2902brw_find_next_block_end(struct brw_codegen *p, int start_offset) 2903{ 2904 int offset; 2905 void *store = p->store; 2906 const struct intel_device_info *devinfo = p->devinfo; 2907 2908 int depth = 0; 2909 2910 for (offset = next_offset(devinfo, store, start_offset); 2911 offset < p->next_insn_offset; 2912 offset = next_offset(devinfo, store, offset)) { 2913 brw_inst *insn = store + offset; 2914 2915 switch (brw_inst_opcode(devinfo, insn)) { 2916 case BRW_OPCODE_IF: 2917 depth++; 2918 break; 2919 case BRW_OPCODE_ENDIF: 2920 if (depth == 0) 2921 return offset; 2922 depth--; 2923 break; 2924 case BRW_OPCODE_WHILE: 2925 /* If the while doesn't jump before our instruction, it's the end 2926 * of a sibling do...while loop. Ignore it. 2927 */ 2928 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset)) 2929 continue; 2930 FALLTHROUGH; 2931 case BRW_OPCODE_ELSE: 2932 case BRW_OPCODE_HALT: 2933 if (depth == 0) 2934 return offset; 2935 break; 2936 default: 2937 break; 2938 } 2939 } 2940 2941 return 0; 2942} 2943 2944/* There is no DO instruction on gfx6, so to find the end of the loop 2945 * we have to see if the loop is jumping back before our start 2946 * instruction. 2947 */ 2948static int 2949brw_find_loop_end(struct brw_codegen *p, int start_offset) 2950{ 2951 const struct intel_device_info *devinfo = p->devinfo; 2952 int offset; 2953 void *store = p->store; 2954 2955 assert(devinfo->ver >= 6); 2956 2957 /* Always start after the instruction (such as a WHILE) we're trying to fix 2958 * up. 
2959 */ 2960 for (offset = next_offset(devinfo, store, start_offset); 2961 offset < p->next_insn_offset; 2962 offset = next_offset(devinfo, store, offset)) { 2963 brw_inst *insn = store + offset; 2964 2965 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) { 2966 if (while_jumps_before_offset(devinfo, insn, offset, start_offset)) 2967 return offset; 2968 } 2969 } 2970 assert(!"not reached"); 2971 return start_offset; 2972} 2973 2974/* After program generation, go back and update the UIP and JIP of 2975 * BREAK, CONT, and HALT instructions to their correct locations. 2976 */ 2977void 2978brw_set_uip_jip(struct brw_codegen *p, int start_offset) 2979{ 2980 const struct intel_device_info *devinfo = p->devinfo; 2981 int offset; 2982 int br = brw_jump_scale(devinfo); 2983 int scale = 16 / br; 2984 void *store = p->store; 2985 2986 if (devinfo->ver < 6) 2987 return; 2988 2989 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) { 2990 brw_inst *insn = store + offset; 2991 assert(brw_inst_cmpt_control(devinfo, insn) == 0); 2992 2993 int block_end_offset = brw_find_next_block_end(p, offset); 2994 switch (brw_inst_opcode(devinfo, insn)) { 2995 case BRW_OPCODE_BREAK: 2996 assert(block_end_offset != 0); 2997 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); 2998 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */ 2999 brw_inst_set_uip(devinfo, insn, 3000 (brw_find_loop_end(p, offset) - offset + 3001 (devinfo->ver == 6 ? 16 : 0)) / scale); 3002 break; 3003 case BRW_OPCODE_CONTINUE: 3004 assert(block_end_offset != 0); 3005 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); 3006 brw_inst_set_uip(devinfo, insn, 3007 (brw_find_loop_end(p, offset) - offset) / scale); 3008 3009 assert(brw_inst_uip(devinfo, insn) != 0); 3010 assert(brw_inst_jip(devinfo, insn) != 0); 3011 break; 3012 3013 case BRW_OPCODE_ENDIF: { 3014 int32_t jump = (block_end_offset == 0) ? 
3015 1 * br : (block_end_offset - offset) / scale; 3016 if (devinfo->ver >= 7) 3017 brw_inst_set_jip(devinfo, insn, jump); 3018 else 3019 brw_inst_set_gfx6_jump_count(devinfo, insn, jump); 3020 break; 3021 } 3022 3023 case BRW_OPCODE_HALT: 3024 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): 3025 * 3026 * "In case of the halt instruction not inside any conditional 3027 * code block, the value of <JIP> and <UIP> should be the 3028 * same. In case of the halt instruction inside conditional code 3029 * block, the <UIP> should be the end of the program, and the 3030 * <JIP> should be end of the most inner conditional code block." 3031 * 3032 * The uip will have already been set by whoever set up the 3033 * instruction. 3034 */ 3035 if (block_end_offset == 0) { 3036 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn)); 3037 } else { 3038 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale); 3039 } 3040 assert(brw_inst_uip(devinfo, insn) != 0); 3041 assert(brw_inst_jip(devinfo, insn) != 0); 3042 break; 3043 3044 default: 3045 break; 3046 } 3047 } 3048} 3049 3050void brw_ff_sync(struct brw_codegen *p, 3051 struct brw_reg dest, 3052 unsigned msg_reg_nr, 3053 struct brw_reg src0, 3054 bool allocate, 3055 unsigned response_length, 3056 bool eot) 3057{ 3058 const struct intel_device_info *devinfo = p->devinfo; 3059 brw_inst *insn; 3060 3061 gfx6_resolve_implied_move(p, &src0, msg_reg_nr); 3062 3063 insn = next_insn(p, BRW_OPCODE_SEND); 3064 brw_set_dest(p, insn, dest); 3065 brw_set_src0(p, insn, src0); 3066 brw_set_src1(p, insn, brw_imm_d(0)); 3067 3068 if (devinfo->ver < 6) 3069 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr); 3070 3071 brw_set_ff_sync_message(p, 3072 insn, 3073 allocate, 3074 response_length, 3075 eot); 3076} 3077 3078/** 3079 * Emit the SEND instruction necessary to generate stream output data on Gfx6 3080 * (for transform feedback). 
3081 * 3082 * If send_commit_msg is true, this is the last piece of stream output data 3083 * from this thread, so send the data as a committed write. According to the 3084 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1): 3085 * 3086 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all 3087 * writes are complete by sending the final write as a committed write." 3088 */ 3089void 3090brw_svb_write(struct brw_codegen *p, 3091 struct brw_reg dest, 3092 unsigned msg_reg_nr, 3093 struct brw_reg src0, 3094 unsigned binding_table_index, 3095 bool send_commit_msg) 3096{ 3097 const struct intel_device_info *devinfo = p->devinfo; 3098 assert(devinfo->ver == 6); 3099 const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE; 3100 brw_inst *insn; 3101 3102 gfx6_resolve_implied_move(p, &src0, msg_reg_nr); 3103 3104 insn = next_insn(p, BRW_OPCODE_SEND); 3105 brw_inst_set_sfid(devinfo, insn, target_cache); 3106 brw_set_dest(p, insn, dest); 3107 brw_set_src0(p, insn, src0); 3108 brw_set_desc(p, insn, 3109 brw_message_desc(devinfo, 1, send_commit_msg, true) | 3110 brw_dp_write_desc(devinfo, binding_table_index, 3111 0, /* msg_control: ignored */ 3112 GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE, 3113 send_commit_msg)); /* send_commit_msg */ 3114} 3115 3116static unsigned 3117brw_surface_payload_size(unsigned num_channels, 3118 unsigned exec_size /**< 0 for SIMD4x2 */) 3119{ 3120 if (exec_size == 0) 3121 return 1; /* SIMD4x2 */ 3122 else if (exec_size <= 8) 3123 return num_channels; 3124 else 3125 return 2 * num_channels; 3126} 3127 3128void 3129brw_untyped_atomic(struct brw_codegen *p, 3130 struct brw_reg dst, 3131 struct brw_reg payload, 3132 struct brw_reg surface, 3133 unsigned atomic_op, 3134 unsigned msg_length, 3135 bool response_expected, 3136 bool header_present) 3137{ 3138 const struct intel_device_info *devinfo = p->devinfo; 3139 const unsigned sfid = (devinfo->verx10 >= 75 ? 
3140 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3141 GFX7_SFID_DATAPORT_DATA_CACHE); 3142 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; 3143 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */ 3144 const bool has_simd4x2 = devinfo->verx10 >= 75; 3145 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 3146 has_simd4x2 ? 0 : 8; 3147 const unsigned response_length = 3148 brw_surface_payload_size(response_expected, exec_size); 3149 const unsigned desc = 3150 brw_message_desc(devinfo, msg_length, response_length, header_present) | 3151 brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op, 3152 response_expected); 3153 /* Mask out unused components -- This is especially important in Align16 3154 * mode on generations that don't have native support for SIMD4x2 atomics, 3155 * because unused but enabled components will cause the dataport to perform 3156 * additional atomic operations on the addresses that happen to be in the 3157 * uninitialized Y, Z and W coordinates of the payload. 3158 */ 3159 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X; 3160 3161 brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask), 3162 payload, surface, desc); 3163} 3164 3165void 3166brw_untyped_surface_read(struct brw_codegen *p, 3167 struct brw_reg dst, 3168 struct brw_reg payload, 3169 struct brw_reg surface, 3170 unsigned msg_length, 3171 unsigned num_channels) 3172{ 3173 const struct intel_device_info *devinfo = p->devinfo; 3174 const unsigned sfid = (devinfo->verx10 >= 75 ? 3175 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3176 GFX7_SFID_DATAPORT_DATA_CACHE); 3177 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; 3178 const unsigned exec_size = align1 ? 
1 << brw_get_default_exec_size(p) : 0; 3179 const unsigned response_length = 3180 brw_surface_payload_size(num_channels, exec_size); 3181 const unsigned desc = 3182 brw_message_desc(devinfo, msg_length, response_length, false) | 3183 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false); 3184 3185 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc); 3186} 3187 3188void 3189brw_untyped_surface_write(struct brw_codegen *p, 3190 struct brw_reg payload, 3191 struct brw_reg surface, 3192 unsigned msg_length, 3193 unsigned num_channels, 3194 bool header_present) 3195{ 3196 const struct intel_device_info *devinfo = p->devinfo; 3197 const unsigned sfid = (devinfo->verx10 >= 75 ? 3198 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3199 GFX7_SFID_DATAPORT_DATA_CACHE); 3200 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; 3201 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */ 3202 const bool has_simd4x2 = devinfo->verx10 >= 75; 3203 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 3204 has_simd4x2 ? 0 : 8; 3205 const unsigned desc = 3206 brw_message_desc(devinfo, msg_length, 0, header_present) | 3207 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true); 3208 /* Mask out unused components -- See comment in brw_untyped_atomic(). */ 3209 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW; 3210 3211 brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask), 3212 payload, surface, desc); 3213} 3214 3215static void 3216brw_set_memory_fence_message(struct brw_codegen *p, 3217 struct brw_inst *insn, 3218 enum brw_message_target sfid, 3219 bool commit_enable, 3220 unsigned bti) 3221{ 3222 const struct intel_device_info *devinfo = p->devinfo; 3223 3224 brw_set_desc(p, insn, brw_message_desc( 3225 devinfo, 1, (commit_enable ? 
1 : 0), true)); 3226 3227 brw_inst_set_sfid(devinfo, insn, sfid); 3228 3229 switch (sfid) { 3230 case GFX6_SFID_DATAPORT_RENDER_CACHE: 3231 brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE); 3232 break; 3233 case GFX7_SFID_DATAPORT_DATA_CACHE: 3234 brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE); 3235 break; 3236 default: 3237 unreachable("Not reached"); 3238 } 3239 3240 if (commit_enable) 3241 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); 3242 3243 assert(devinfo->ver >= 11 || bti == 0); 3244 brw_inst_set_binding_table_index(devinfo, insn, bti); 3245} 3246 3247static void 3248gfx12_set_memory_fence_message(struct brw_codegen *p, 3249 struct brw_inst *insn, 3250 enum brw_message_target sfid) 3251{ 3252 const unsigned mlen = 1; /* g0 header */ 3253 /* Completion signaled by write to register. No data returned. */ 3254 const unsigned rlen = 1; 3255 3256 brw_inst_set_sfid(p->devinfo, insn, sfid); 3257 3258 if (sfid == BRW_SFID_URB) { 3259 brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) | 3260 brw_message_desc(p->devinfo, mlen, rlen, false)); 3261 } else { 3262 enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP; 3263 enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE; 3264 3265 if (sfid == GFX12_SFID_TGM) { 3266 scope = LSC_FENCE_TILE; 3267 flush_type = LSC_FLUSH_TYPE_EVICT; 3268 } 3269 3270 brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope, 3271 flush_type, false) | 3272 brw_message_desc(p->devinfo, mlen, rlen, false)); 3273 } 3274} 3275 3276void 3277brw_memory_fence(struct brw_codegen *p, 3278 struct brw_reg dst, 3279 struct brw_reg src, 3280 enum opcode send_op, 3281 enum brw_message_target sfid, 3282 bool commit_enable, 3283 unsigned bti) 3284{ 3285 const struct intel_device_info *devinfo = p->devinfo; 3286 3287 dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW); 3288 src = retype(vec1(src), BRW_REGISTER_TYPE_UD); 3289 3290 /* Set dst as destination for dependency tracking, the MEMORY_FENCE 3291 * message 
doesn't write anything back. 3292 */ 3293 struct brw_inst *insn = next_insn(p, send_op); 3294 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 3295 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); 3296 brw_set_dest(p, insn, dst); 3297 brw_set_src0(p, insn, src); 3298 3299 /* All DG2 hardware requires LSC for fence messages, even A-step */ 3300 if (devinfo->has_lsc) 3301 gfx12_set_memory_fence_message(p, insn, sfid); 3302 else 3303 brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti); 3304} 3305 3306void 3307brw_pixel_interpolator_query(struct brw_codegen *p, 3308 struct brw_reg dest, 3309 struct brw_reg mrf, 3310 bool noperspective, 3311 bool coarse_pixel_rate, 3312 unsigned mode, 3313 struct brw_reg data, 3314 unsigned msg_length, 3315 unsigned response_length) 3316{ 3317 const struct intel_device_info *devinfo = p->devinfo; 3318 const uint16_t exec_size = brw_get_default_exec_size(p); 3319 const unsigned slot_group = brw_get_default_group(p) / 16; 3320 const unsigned simd_mode = (exec_size == BRW_EXECUTE_16); 3321 const unsigned desc = 3322 brw_message_desc(devinfo, msg_length, response_length, false) | 3323 brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate, 3324 simd_mode, slot_group); 3325 3326 /* brw_send_indirect_message will automatically use a direct send message 3327 * if data is actually immediate. 
3328 */ 3329 brw_send_indirect_message(p, 3330 GFX7_SFID_PIXEL_INTERPOLATOR, 3331 dest, 3332 mrf, 3333 vec1(data), 3334 desc, 3335 false); 3336} 3337 3338void 3339brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, 3340 struct brw_reg mask) 3341{ 3342 const struct intel_device_info *devinfo = p->devinfo; 3343 const unsigned exec_size = 1 << brw_get_default_exec_size(p); 3344 const unsigned qtr_control = brw_get_default_group(p) / 8; 3345 brw_inst *inst; 3346 3347 assert(devinfo->ver >= 7); 3348 assert(mask.type == BRW_REGISTER_TYPE_UD); 3349 3350 brw_push_insn_state(p); 3351 3352 /* The flag register is only used on Gfx7 in align1 mode, so avoid setting 3353 * unnecessary bits in the instruction words, get the information we need 3354 * and reset the default flag register. This allows more instructions to be 3355 * compacted. 3356 */ 3357 const unsigned flag_subreg = p->current->flag_subreg; 3358 brw_set_default_flag_reg(p, 0, 0); 3359 3360 if (brw_get_default_access_mode(p) == BRW_ALIGN_1) { 3361 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3362 3363 if (devinfo->ver >= 8) { 3364 /* Getting the first active channel index is easy on Gfx8: Just find 3365 * the first bit set in the execution mask. The register exists on 3366 * HSW already but it reads back as all ones when the current 3367 * instruction has execution masking disabled, so it's kind of 3368 * useless. 3369 */ 3370 struct brw_reg exec_mask = 3371 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD); 3372 3373 brw_set_default_exec_size(p, BRW_EXECUTE_1); 3374 if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) { 3375 /* Unfortunately, ce0 does not take into account the thread 3376 * dispatch mask, which may be a problem in cases where it's not 3377 * tightly packed (i.e. it doesn't have the form '2^n - 1' for 3378 * some n). Combine ce0 with the given dispatch (or vector) mask 3379 * to mask off those channels which were never dispatched by the 3380 * hardware. 
3381 */ 3382 brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8)); 3383 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 3384 brw_AND(p, vec1(dst), exec_mask, vec1(dst)); 3385 exec_mask = vec1(dst); 3386 } 3387 3388 /* Quarter control has the effect of magically shifting the value of 3389 * ce0 so you'll get the first active channel relative to the 3390 * specified quarter control as result. 3391 */ 3392 inst = brw_FBL(p, vec1(dst), exec_mask); 3393 } else { 3394 const struct brw_reg flag = brw_flag_subreg(flag_subreg); 3395 3396 brw_set_default_exec_size(p, BRW_EXECUTE_1); 3397 brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); 3398 3399 /* Run enough instructions returning zero with execution masking and 3400 * a conditional modifier enabled in order to get the full execution 3401 * mask in f1.0. We could use a single 32-wide move here if it 3402 * weren't because of the hardware bug that causes channel enables to 3403 * be applied incorrectly to the second half of 32-wide instructions 3404 * on Gfx7. 3405 */ 3406 const unsigned lower_size = MIN2(16, exec_size); 3407 for (unsigned i = 0; i < exec_size / lower_size; i++) { 3408 inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), 3409 brw_imm_uw(0)); 3410 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); 3411 brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control); 3412 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z); 3413 brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1); 3414 brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2); 3415 brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2); 3416 } 3417 3418 /* Find the first bit set in the exec_size-wide portion of the flag 3419 * register that was updated by the last sequence of MOV 3420 * instructions. 
3421 */ 3422 const enum brw_reg_type type = brw_int_type(exec_size / 8, false); 3423 brw_set_default_exec_size(p, BRW_EXECUTE_1); 3424 brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control)); 3425 } 3426 } else { 3427 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3428 3429 if (devinfo->ver >= 8 && 3430 mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) { 3431 /* In SIMD4x2 mode the first active channel index is just the 3432 * negation of the first bit of the mask register. Note that ce0 3433 * doesn't take into account the dispatch mask, so the Gfx7 path 3434 * should be used instead unless you have the guarantee that the 3435 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1' 3436 * for some n). 3437 */ 3438 inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X), 3439 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)), 3440 brw_imm_ud(1)); 3441 3442 } else { 3443 /* Overwrite the destination without and with execution masking to 3444 * find out which of the channels is active. 3445 */ 3446 brw_push_insn_state(p); 3447 brw_set_default_exec_size(p, BRW_EXECUTE_4); 3448 brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), 3449 brw_imm_ud(1)); 3450 3451 inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), 3452 brw_imm_ud(0)); 3453 brw_pop_insn_state(p); 3454 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); 3455 } 3456 } 3457 3458 brw_pop_insn_state(p); 3459} 3460 3461void 3462brw_broadcast(struct brw_codegen *p, 3463 struct brw_reg dst, 3464 struct brw_reg src, 3465 struct brw_reg idx) 3466{ 3467 const struct intel_device_info *devinfo = p->devinfo; 3468 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1; 3469 brw_inst *inst; 3470 3471 brw_push_insn_state(p); 3472 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3473 brw_set_default_exec_size(p, align1 ? 
BRW_EXECUTE_1 : BRW_EXECUTE_4); 3474 3475 assert(src.file == BRW_GENERAL_REGISTER_FILE && 3476 src.address_mode == BRW_ADDRESS_DIRECT); 3477 assert(!src.abs && !src.negate); 3478 assert(src.type == dst.type); 3479 3480 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) || 3481 idx.file == BRW_IMMEDIATE_VALUE) { 3482 /* Trivial, the source is already uniform or the index is a constant. 3483 * We will typically not get here if the optimizer is doing its job, but 3484 * asserting would be mean. 3485 */ 3486 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; 3487 src = align1 ? stride(suboffset(src, i), 0, 1, 0) : 3488 stride(suboffset(src, 4 * i), 0, 4, 1); 3489 3490 if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) { 3491 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 3492 subscript(src, BRW_REGISTER_TYPE_D, 0)); 3493 brw_set_default_swsb(p, tgl_swsb_null()); 3494 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 3495 subscript(src, BRW_REGISTER_TYPE_D, 1)); 3496 } else { 3497 brw_MOV(p, dst, src); 3498 } 3499 } else { 3500 /* From the Haswell PRM section "Register Region Restrictions": 3501 * 3502 * "The lower bits of the AddressImmediate must not overflow to 3503 * change the register address. The lower 5 bits of Address 3504 * Immediate when added to lower 5 bits of address register gives 3505 * the sub-register offset. The upper bits of Address Immediate 3506 * when added to upper bits of address register gives the register 3507 * address. Any overflow from sub-register offset is dropped." 3508 * 3509 * Fortunately, for broadcast, we never have a sub-register offset so 3510 * this isn't an issue. 3511 */ 3512 assert(src.subnr == 0); 3513 3514 if (align1) { 3515 const struct brw_reg addr = 3516 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); 3517 unsigned offset = src.nr * REG_SIZE + src.subnr; 3518 /* Limit in bytes of the signed indirect addressing immediate. 
*/ 3519 const unsigned limit = 512; 3520 3521 brw_push_insn_state(p); 3522 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3523 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 3524 3525 /* Take into account the component size and horizontal stride. */ 3526 assert(src.vstride == src.hstride + src.width); 3527 brw_SHL(p, addr, vec1(idx), 3528 brw_imm_ud(util_logbase2(type_sz(src.type)) + 3529 src.hstride - 1)); 3530 3531 /* We can only address up to limit bytes using the indirect 3532 * addressing immediate, account for the difference if the source 3533 * register is above this limit. 3534 */ 3535 if (offset >= limit) { 3536 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 3537 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); 3538 offset = offset % limit; 3539 } 3540 3541 brw_pop_insn_state(p); 3542 3543 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 3544 3545 /* Use indirect addressing to fetch the specified component. */ 3546 if (type_sz(src.type) > 4 && 3547 (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) || 3548 !devinfo->has_64bit_float)) { 3549 /* From the Cherryview PRM Vol 7. "Register Region Restrictions": 3550 * 3551 * "When source or destination datatype is 64b or operation is 3552 * integer DWord multiply, indirect addressing must not be 3553 * used." 3554 * 3555 * To work around both of this issue, we do two integer MOVs 3556 * insead of one 64-bit MOV. Because no double value should ever 3557 * cross a register boundary, it's safe to use the immediate 3558 * offset in the indirect here to handle adding 4 bytes to the 3559 * offset and avoid the extra ADD to the register file. 
3560 */ 3561 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), 3562 retype(brw_vec1_indirect(addr.subnr, offset), 3563 BRW_REGISTER_TYPE_D)); 3564 brw_set_default_swsb(p, tgl_swsb_null()); 3565 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), 3566 retype(brw_vec1_indirect(addr.subnr, offset + 4), 3567 BRW_REGISTER_TYPE_D)); 3568 } else { 3569 brw_MOV(p, dst, 3570 retype(brw_vec1_indirect(addr.subnr, offset), src.type)); 3571 } 3572 } else { 3573 /* In SIMD4x2 mode the index can be either zero or one, replicate it 3574 * to all bits of a flag register, 3575 */ 3576 inst = brw_MOV(p, 3577 brw_null_reg(), 3578 stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1)); 3579 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); 3580 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); 3581 brw_inst_set_flag_reg_nr(devinfo, inst, 1); 3582 3583 /* and use predicated SEL to pick the right channel. */ 3584 inst = brw_SEL(p, dst, 3585 stride(suboffset(src, 4), 4, 4, 1), 3586 stride(src, 4, 4, 1)); 3587 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL); 3588 brw_inst_set_flag_reg_nr(devinfo, inst, 1); 3589 } 3590 } 3591 3592 brw_pop_insn_state(p); 3593} 3594 3595/** 3596 * This instruction is generated as a single-channel align1 instruction by 3597 * both the VS and FS stages when using INTEL_DEBUG=shader_time. 3598 * 3599 * We can't use the typed atomic op in the FS because that has the execution 3600 * mask ANDed with the pixel mask, but we just want to write the one dword for 3601 * all the pixels. 3602 * 3603 * We don't use the SIMD4x2 atomic ops in the VS because want to just write 3604 * one u32. So we use the same untyped atomic write message as the pixel 3605 * shader. 3606 * 3607 * The untyped atomic operation requires a BUFFER surface type with RAW 3608 * format, and is only accessible through the legacy DATA_CACHE dataport 3609 * messages. 
3610 */ 3611void brw_shader_time_add(struct brw_codegen *p, 3612 struct brw_reg payload, 3613 uint32_t surf_index) 3614{ 3615 const struct intel_device_info *devinfo = p->devinfo; 3616 const unsigned sfid = (devinfo->verx10 >= 75 ? 3617 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3618 GFX7_SFID_DATAPORT_DATA_CACHE); 3619 assert(devinfo->ver >= 7); 3620 3621 brw_push_insn_state(p); 3622 brw_set_default_access_mode(p, BRW_ALIGN_1); 3623 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3624 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 3625 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 3626 3627 /* We use brw_vec1_reg and unmasked because we want to increment the given 3628 * offset only once. 3629 */ 3630 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, 3631 BRW_ARF_NULL, 0)); 3632 brw_set_src0(p, send, brw_vec1_reg(payload.file, 3633 payload.nr, 0)); 3634 brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) | 3635 brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD, 3636 false))); 3637 3638 brw_inst_set_sfid(devinfo, send, sfid); 3639 brw_inst_set_binding_table_index(devinfo, send, surf_index); 3640 3641 brw_pop_insn_state(p); 3642} 3643 3644 3645/** 3646 * Emit the SEND message for a barrier 3647 */ 3648void 3649brw_barrier(struct brw_codegen *p, struct brw_reg src) 3650{ 3651 const struct intel_device_info *devinfo = p->devinfo; 3652 struct brw_inst *inst; 3653 3654 assert(devinfo->ver >= 7); 3655 3656 brw_push_insn_state(p); 3657 brw_set_default_access_mode(p, BRW_ALIGN_1); 3658 inst = next_insn(p, BRW_OPCODE_SEND); 3659 brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 3660 brw_set_src0(p, inst, src); 3661 brw_set_src1(p, inst, brw_null_reg()); 3662 brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false)); 3663 3664 brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY); 3665 brw_inst_set_gateway_subfuncid(devinfo, inst, 3666 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); 3667 3668 
brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); 3669 brw_pop_insn_state(p); 3670} 3671 3672 3673/** 3674 * Emit the wait instruction for a barrier 3675 */ 3676void 3677brw_WAIT(struct brw_codegen *p) 3678{ 3679 const struct intel_device_info *devinfo = p->devinfo; 3680 struct brw_inst *insn; 3681 3682 struct brw_reg src = brw_notification_reg(); 3683 3684 insn = next_insn(p, BRW_OPCODE_WAIT); 3685 brw_set_dest(p, insn, src); 3686 brw_set_src0(p, insn, src); 3687 brw_set_src1(p, insn, brw_null_reg()); 3688 3689 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); 3690 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 3691} 3692 3693void 3694brw_float_controls_mode(struct brw_codegen *p, 3695 unsigned mode, unsigned mask) 3696{ 3697 /* From the Skylake PRM, Volume 7, page 760: 3698 * "Implementation Restriction on Register Access: When the control 3699 * register is used as an explicit source and/or destination, hardware 3700 * does not ensure execution pipeline coherency. Software must set the 3701 * thread control field to ‘switch’ for an instruction that uses 3702 * control register as an explicit operand." 3703 * 3704 * On Gfx12+ this is implemented in terms of SWSB annotations instead. 
3705 */ 3706 brw_set_default_swsb(p, tgl_swsb_regdist(1)); 3707 3708 brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0), 3709 brw_imm_ud(~mask)); 3710 brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1); 3711 if (p->devinfo->ver < 12) 3712 brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH); 3713 3714 if (mode) { 3715 brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0), 3716 brw_imm_ud(mode)); 3717 brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1); 3718 if (p->devinfo->ver < 12) 3719 brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH); 3720 } 3721 3722 if (p->devinfo->ver >= 12) 3723 brw_SYNC(p, TGL_SYNC_NOP); 3724} 3725 3726void 3727brw_update_reloc_imm(const struct intel_device_info *devinfo, 3728 brw_inst *inst, 3729 uint32_t value) 3730{ 3731 /* Sanity check that the instruction is a MOV of an immediate */ 3732 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV); 3733 assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE); 3734 3735 /* If it was compacted, we can't safely rewrite */ 3736 assert(brw_inst_cmpt_control(devinfo, inst) == 0); 3737 3738 brw_inst_set_imm_ud(devinfo, inst, value); 3739} 3740 3741/* A default value for constants that will be patched at run-time. 3742 * We pick an arbitrary value that prevents instruction compaction. 3743 */ 3744#define DEFAULT_PATCH_IMM 0x4a7cc037 3745 3746void 3747brw_MOV_reloc_imm(struct brw_codegen *p, 3748 struct brw_reg dst, 3749 enum brw_reg_type src_type, 3750 uint32_t id) 3751{ 3752 assert(type_sz(src_type) == 4); 3753 assert(type_sz(dst.type) == 4); 3754 3755 brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM, 3756 p->next_insn_offset, 0); 3757 3758 brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type)); 3759} 3760