1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_fs.h" 25#include "brw_cfg.h" 26#include "brw_fs_builder.h" 27 28using namespace brw; 29 30namespace { 31 /* From the SKL PRM Vol 2a, "Move": 32 * 33 * "A mov with the same source and destination type, no source modifier, 34 * and no saturation is a raw move. A packed byte destination region (B 35 * or UB type with HorzStride == 1 and ExecSize > 1) can only be written 36 * using raw move." 37 */ 38 bool 39 is_byte_raw_mov(const fs_inst *inst) 40 { 41 return type_sz(inst->dst.type) == 1 && 42 inst->opcode == BRW_OPCODE_MOV && 43 inst->src[0].type == inst->dst.type && 44 !inst->saturate && 45 !inst->src[0].negate && 46 !inst->src[0].abs; 47 } 48 49 /* 50 * Return an acceptable byte stride for the destination of an instruction 51 * that requires it to have some particular alignment. 52 */ 53 unsigned 54 required_dst_byte_stride(const fs_inst *inst) 55 { 56 if (inst->dst.is_accumulator()) { 57 /* If the destination is an accumulator, insist that we leave the 58 * stride alone. We cannot "fix" accumulator destinations by writing 59 * to a temporary and emitting a MOV into the original destination. 60 * For multiply instructions (our one use of the accumulator), the 61 * MUL writes the full 66 bits of the accumulator whereas the MOV we 62 * would emit only writes 33 bits and leaves the top 33 bits 63 * undefined. 64 * 65 * It's safe to just require the original stride here because the 66 * lowering pass will detect the mismatch in has_invalid_src_region 67 * and fix the sources of the multiply instead of the destination. 68 */ 69 return inst->dst.stride * type_sz(inst->dst.type); 70 } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) && 71 !is_byte_raw_mov(inst)) { 72 return get_exec_type_size(inst); 73 } else { 74 /* Calculate the maximum byte stride and the minimum/maximum type 75 * size across all source and destination operands we are required to 76 * lower. 77 */ 78 unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type); 79 unsigned min_size = type_sz(inst->dst.type); 80 unsigned max_size = type_sz(inst->dst.type); 81 82 for (unsigned i = 0; i < inst->sources; i++) { 83 if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) { 84 const unsigned size = type_sz(inst->src[i].type); 85 max_stride = MAX2(max_stride, inst->src[i].stride * size); 86 min_size = MIN2(min_size, size); 87 max_size = MAX2(max_size, size); 88 } 89 } 90 91 /* All operands involved in lowering need to fit in the calculated 92 * stride. 93 */ 94 assert(max_size <= 4 * min_size); 95 96 /* Attempt to use the largest byte stride among all present operands, 97 * but never exceed a stride of 4 since that would lead to illegal 98 * destination regions during lowering. 99 */ 100 return MIN2(max_stride, 4 * min_size); 101 } 102 } 103 104 /* 105 * Return an acceptable byte sub-register offset for the destination of an 106 * instruction that requires it to be aligned to the sub-register offset of 107 * the sources. 108 */ 109 unsigned 110 required_dst_byte_offset(const fs_inst *inst) 111 { 112 for (unsigned i = 0; i < inst->sources; i++) { 113 if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) 114 if (reg_offset(inst->src[i]) % REG_SIZE != 115 reg_offset(inst->dst) % REG_SIZE) 116 return 0; 117 } 118 119 return reg_offset(inst->dst) % REG_SIZE; 120 } 121 122 /* 123 * Return whether the instruction has an unsupported channel bit layout 124 * specified for the i-th source region. 125 */ 126 bool 127 has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst, 128 unsigned i) 129 { 130 if (is_unordered(inst) || inst->is_control_source(i)) 131 return false; 132 133 /* Empirical testing shows that Broadwell has a bug affecting half-float 134 * MAD instructions when any of its sources has a non-zero offset, such 135 * as: 136 * 137 * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q }; 138 * 139 * We used to generate code like this for SIMD8 executions where we 140 * used to pack components Y and W of a vector at offset 16B of a SIMD 141 * register. The problem doesn't occur if the stride of the source is 0. 142 */ 143 if (devinfo->ver == 8 && 144 inst->opcode == BRW_OPCODE_MAD && 145 inst->src[i].type == BRW_REGISTER_TYPE_HF && 146 reg_offset(inst->src[i]) % REG_SIZE > 0 && 147 inst->src[i].stride != 0) { 148 return true; 149 } 150 151 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); 152 const unsigned src_byte_stride = inst->src[i].stride * 153 type_sz(inst->src[i].type); 154 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; 155 const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE; 156 157 return has_dst_aligned_region_restriction(devinfo, inst) && 158 !is_uniform(inst->src[i]) && 159 (src_byte_stride != dst_byte_stride || 160 src_byte_offset != dst_byte_offset); 161 } 162 163 /* 164 * Return whether the instruction has an unsupported channel bit layout 165 * specified for the destination region. 166 */ 167 bool 168 has_invalid_dst_region(const intel_device_info *devinfo, 169 const fs_inst *inst) 170 { 171 if (is_unordered(inst)) { 172 return false; 173 } else { 174 const brw_reg_type exec_type = get_exec_type(inst); 175 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; 176 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); 177 const bool is_narrowing_conversion = !is_byte_raw_mov(inst) && 178 type_sz(inst->dst.type) < type_sz(exec_type); 179 180 return (has_dst_aligned_region_restriction(devinfo, inst) && 181 (required_dst_byte_stride(inst) != dst_byte_stride || 182 required_dst_byte_offset(inst) != dst_byte_offset)) || 183 (is_narrowing_conversion && 184 required_dst_byte_stride(inst) != dst_byte_stride); 185 } 186 } 187 188 /** 189 * Return a non-zero value if the execution type of the instruction is 190 * unsupported. The destination and sources matching the returned mask 191 * will be bit-cast to an integer type of appropriate size, lowering any 192 * source or destination modifiers into separate MOV instructions. 193 */ 194 unsigned 195 has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst) 196 { 197 switch (inst->opcode) { 198 case SHADER_OPCODE_SHUFFLE: 199 case SHADER_OPCODE_QUAD_SWIZZLE: 200 return has_dst_aligned_region_restriction(devinfo, inst) ? 201 0x1 : 0; 202 203 case SHADER_OPCODE_BROADCAST: 204 case SHADER_OPCODE_MOV_INDIRECT: 205 return (((devinfo->verx10 == 70) || 206 devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) || 207 devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) || 208 (devinfo->verx10 >= 125 && 209 brw_reg_type_is_floating_point(inst->src[0].type)) ? 210 0x1 : 0; 211 212 default: 213 return 0; 214 } 215 } 216 217 /* 218 * Return whether the instruction has unsupported source modifiers 219 * specified for the i-th source region. 220 */ 221 bool 222 has_invalid_src_modifiers(const intel_device_info *devinfo, 223 const fs_inst *inst, unsigned i) 224 { 225 return (!inst->can_do_source_mods(devinfo) && 226 (inst->src[i].negate || inst->src[i].abs)) || 227 ((has_invalid_exec_type(devinfo, inst) & (1u << i)) && 228 (inst->src[i].negate || inst->src[i].abs || 229 inst->src[i].type != get_exec_type(inst))); 230 } 231 232 /* 233 * Return whether the instruction has an unsupported type conversion 234 * specified for the destination. 235 */ 236 bool 237 has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst) 238 { 239 switch (inst->opcode) { 240 case BRW_OPCODE_MOV: 241 return false; 242 case BRW_OPCODE_SEL: 243 return inst->dst.type != get_exec_type(inst); 244 default: 245 /* FIXME: We assume the opcodes not explicitly mentioned before just 246 * work fine with arbitrary conversions, unless they need to be 247 * bit-cast. 248 */ 249 return has_invalid_exec_type(devinfo, inst) && 250 inst->dst.type != get_exec_type(inst); 251 } 252 } 253 254 /** 255 * Return whether the instruction has unsupported destination modifiers. 256 */ 257 bool 258 has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst) 259 { 260 return (has_invalid_exec_type(devinfo, inst) && 261 (inst->saturate || inst->conditional_mod)) || 262 has_invalid_conversion(devinfo, inst); 263 } 264 265 /** 266 * Return whether the instruction has non-standard semantics for the 267 * conditional mod which don't cause the flag register to be updated with 268 * the comparison result. 269 */ 270 bool 271 has_inconsistent_cmod(const fs_inst *inst) 272 { 273 return inst->opcode == BRW_OPCODE_SEL || 274 inst->opcode == BRW_OPCODE_CSEL || 275 inst->opcode == BRW_OPCODE_IF || 276 inst->opcode == BRW_OPCODE_WHILE; 277 } 278 279 bool 280 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); 281} 282 283namespace brw { 284 /** 285 * Remove any modifiers from the \p i-th source region of the instruction, 286 * including negate, abs and any implicit type conversion to the execution 287 * type. Instead any source modifiers will be implemented as a separate 288 * MOV instruction prior to the original instruction. 289 */ 290 bool 291 lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) 292 { 293 assert(inst->components_read(i) == 1); 294 assert(v->devinfo->has_integer_dword_mul || 295 inst->opcode != BRW_OPCODE_MUL || 296 brw_reg_type_is_floating_point(get_exec_type(inst)) || 297 MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 || 298 type_sz(inst->src[i].type) == get_exec_type_size(inst)); 299 300 const fs_builder ibld(v, block, inst); 301 const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); 302 303 lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); 304 inst->src[i] = tmp; 305 306 return true; 307 } 308} 309 310namespace { 311 /** 312 * Remove any modifiers from the destination region of the instruction, 313 * including saturate, conditional mod and any implicit type conversion 314 * from the execution type. Instead any destination modifiers will be 315 * implemented as a separate MOV instruction after the original 316 * instruction. 317 */ 318 bool 319 lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst) 320 { 321 const fs_builder ibld(v, block, inst); 322 const brw_reg_type type = get_exec_type(inst); 323 /* Not strictly necessary, but if possible use a temporary with the same 324 * channel alignment as the current destination in order to avoid 325 * violating the restrictions enforced later on by lower_src_region() 326 * and lower_dst_region(), which would introduce additional copy 327 * instructions into the program unnecessarily. 328 */ 329 const unsigned stride = 330 type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 : 331 type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); 332 fs_reg tmp = ibld.vgrf(type, stride); 333 ibld.UNDEF(tmp); 334 tmp = horiz_stride(tmp, stride); 335 336 /* Emit a MOV taking care of all the destination modifiers. */ 337 fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); 338 mov->saturate = inst->saturate; 339 if (!has_inconsistent_cmod(inst)) 340 mov->conditional_mod = inst->conditional_mod; 341 if (inst->opcode != BRW_OPCODE_SEL) { 342 mov->predicate = inst->predicate; 343 mov->predicate_inverse = inst->predicate_inverse; 344 } 345 mov->flag_subreg = inst->flag_subreg; 346 lower_instruction(v, block, mov); 347 348 /* Point the original instruction at the temporary, and clean up any 349 * destination modifiers. 350 */ 351 assert(inst->size_written == inst->dst.component_size(inst->exec_size)); 352 inst->dst = tmp; 353 inst->size_written = inst->dst.component_size(inst->exec_size); 354 inst->saturate = false; 355 if (!has_inconsistent_cmod(inst)) 356 inst->conditional_mod = BRW_CONDITIONAL_NONE; 357 358 assert(!inst->flags_written(v->devinfo) || !mov->predicate); 359 return true; 360 } 361 362 /** 363 * Remove any non-trivial shuffling of data from the \p i-th source region 364 * of the instruction. Instead implement the region as a series of integer 365 * copies into a temporary with the same channel layout as the destination. 366 */ 367 bool 368 lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) 369 { 370 assert(inst->components_read(i) == 1); 371 const fs_builder ibld(v, block, inst); 372 const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride / 373 type_sz(inst->src[i].type); 374 assert(stride > 0); 375 fs_reg tmp = ibld.vgrf(inst->src[i].type, stride); 376 ibld.UNDEF(tmp); 377 tmp = horiz_stride(tmp, stride); 378 379 /* Emit a series of 32-bit integer copies with any source modifiers 380 * cleaned up (because their semantics are dependent on the type). 381 */ 382 const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), 383 false); 384 const unsigned n = type_sz(tmp.type) / type_sz(raw_type); 385 fs_reg raw_src = inst->src[i]; 386 raw_src.negate = false; 387 raw_src.abs = false; 388 389 for (unsigned j = 0; j < n; j++) 390 ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j)); 391 392 /* Point the original instruction at the temporary, making sure to keep 393 * any source modifiers in the instruction. 394 */ 395 fs_reg lower_src = tmp; 396 lower_src.negate = inst->src[i].negate; 397 lower_src.abs = inst->src[i].abs; 398 inst->src[i] = lower_src; 399 400 return true; 401 } 402 403 /** 404 * Remove any non-trivial shuffling of data from the destination region of 405 * the instruction. Instead implement the region as a series of integer 406 * copies from a temporary with a channel layout compatible with the 407 * sources. 408 */ 409 bool 410 lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst) 411 { 412 /* We cannot replace the result of an integer multiply which writes the 413 * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit 414 * value whereas the MOV will act on only 32 or 33 bits of the 415 * accumulator. 416 */ 417 assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() || 418 brw_reg_type_is_floating_point(inst->dst.type)); 419 420 const fs_builder ibld(v, block, inst); 421 const unsigned stride = required_dst_byte_stride(inst) / 422 type_sz(inst->dst.type); 423 assert(stride > 0); 424 fs_reg tmp = ibld.vgrf(inst->dst.type, stride); 425 ibld.UNDEF(tmp); 426 tmp = horiz_stride(tmp, stride); 427 428 /* Emit a series of 32-bit integer copies from the temporary into the 429 * original destination. 430 */ 431 const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), 432 false); 433 const unsigned n = type_sz(tmp.type) / type_sz(raw_type); 434 435 if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) { 436 /* Note that in general we cannot simply predicate the copies on the 437 * same flag register as the original instruction, since it may have 438 * been overwritten by the instruction itself. Instead initialize 439 * the temporary with the previous contents of the destination 440 * register. 441 */ 442 for (unsigned j = 0; j < n; j++) 443 ibld.MOV(subscript(tmp, raw_type, j), 444 subscript(inst->dst, raw_type, j)); 445 } 446 447 for (unsigned j = 0; j < n; j++) 448 ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j), 449 subscript(tmp, raw_type, j)); 450 451 /* Point the original instruction at the temporary, making sure to keep 452 * any destination modifiers in the instruction. 453 */ 454 assert(inst->size_written == inst->dst.component_size(inst->exec_size)); 455 inst->dst = tmp; 456 inst->size_written = inst->dst.component_size(inst->exec_size); 457 458 return true; 459 } 460 461 /** 462 * Bit-cast sources and destination of the instruction to an appropriate 463 * integer type, to be used in cases where the instruction doesn't support 464 * some other execution type. 465 */ 466 bool 467 lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst) 468 { 469 assert(inst->dst.type == get_exec_type(inst)); 470 const unsigned mask = has_invalid_exec_type(v->devinfo, inst); 471 const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false); 472 473 for (unsigned i = 0; i < inst->sources; i++) { 474 if (mask & (1u << i)) { 475 assert(inst->src[i].type == inst->dst.type); 476 inst->src[i].type = raw_type; 477 } 478 } 479 480 inst->dst.type = raw_type; 481 482 return true; 483 } 484 485 /** 486 * Legalize the source and destination regioning controls of the specified 487 * instruction. 488 */ 489 bool 490 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst) 491 { 492 const intel_device_info *devinfo = v->devinfo; 493 bool progress = false; 494 495 if (has_invalid_dst_modifiers(devinfo, inst)) 496 progress |= lower_dst_modifiers(v, block, inst); 497 498 if (has_invalid_dst_region(devinfo, inst)) 499 progress |= lower_dst_region(v, block, inst); 500 501 for (unsigned i = 0; i < inst->sources; i++) { 502 if (has_invalid_src_modifiers(devinfo, inst, i)) 503 progress |= lower_src_modifiers(v, block, inst, i); 504 505 if (has_invalid_src_region(devinfo, inst, i)) 506 progress |= lower_src_region(v, block, inst, i); 507 } 508 509 if (has_invalid_exec_type(devinfo, inst)) 510 progress |= lower_exec_type(v, block, inst); 511 512 return progress; 513 } 514} 515 516bool 517fs_visitor::lower_regioning() 518{ 519 bool progress = false; 520 521 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) 522 progress |= lower_instruction(this, block, inst); 523 524 if (progress) 525 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); 526 527 return progress; 528} 529