1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_fs.h" 25#include "brw_cfg.h" 26#include "brw_fs_builder.h" 27 28using namespace brw; 29 30namespace { 31 /* From the SKL PRM Vol 2a, "Move": 32 * 33 * "A mov with the same source and destination type, no source modifier, 34 * and no saturation is a raw move. A packed byte destination region (B 35 * or UB type with HorzStride == 1 and ExecSize > 1) can only be written 36 * using raw move." 37 */ 38 bool 39 is_byte_raw_mov(const fs_inst *inst) 40 { 41 return type_sz(inst->dst.type) == 1 && 42 inst->opcode == BRW_OPCODE_MOV && 43 inst->src[0].type == inst->dst.type && 44 !inst->saturate && 45 !inst->src[0].negate && 46 !inst->src[0].abs; 47 } 48 49 /* 50 * Return an acceptable byte stride for the destination of an instruction 51 * that requires it to have some particular alignment. 52 */ 53 unsigned 54 required_dst_byte_stride(const fs_inst *inst) 55 { 56 if (inst->dst.is_accumulator()) { 57 /* If the destination is an accumulator, insist that we leave the 58 * stride alone. We cannot "fix" accumulator destinations by writing 59 * to a temporary and emitting a MOV into the original destination. 60 * For multiply instructions (our one use of the accumulator), the 61 * MUL writes the full 66 bits of the accumulator whereas the MOV we 62 * would emit only writes 33 bits and leaves the top 33 bits 63 * undefined. 64 * 65 * It's safe to just require the original stride here because the 66 * lowering pass will detect the mismatch in has_invalid_src_region 67 * and fix the sources of the multiply instead of the destination. 68 */ 69 return inst->dst.stride * type_sz(inst->dst.type); 70 } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) && 71 !is_byte_raw_mov(inst)) { 72 return get_exec_type_size(inst); 73 } else { 74 /* Calculate the maximum byte stride and the minimum/maximum type 75 * size across all source and destination operands we are required to 76 * lower. 77 */ 78 unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type); 79 unsigned min_size = type_sz(inst->dst.type); 80 unsigned max_size = type_sz(inst->dst.type); 81 82 for (unsigned i = 0; i < inst->sources; i++) { 83 if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) { 84 const unsigned size = type_sz(inst->src[i].type); 85 max_stride = MAX2(max_stride, inst->src[i].stride * size); 86 min_size = MIN2(min_size, size); 87 max_size = MAX2(max_size, size); 88 } 89 } 90 91 /* All operands involved in lowering need to fit in the calculated 92 * stride. 93 */ 94 assert(max_size <= 4 * min_size); 95 96 /* Attempt to use the largest byte stride among all present operands, 97 * but never exceed a stride of 4 since that would lead to illegal 98 * destination regions during lowering. 99 */ 100 return MIN2(max_stride, 4 * min_size); 101 } 102 } 103 104 /* 105 * Return an acceptable byte sub-register offset for the destination of an 106 * instruction that requires it to be aligned to the sub-register offset of 107 * the sources. 108 */ 109 unsigned 110 required_dst_byte_offset(const fs_inst *inst) 111 { 112 for (unsigned i = 0; i < inst->sources; i++) { 113 if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) 114 if (reg_offset(inst->src[i]) % REG_SIZE != 115 reg_offset(inst->dst) % REG_SIZE) 116 return 0; 117 } 118 119 return reg_offset(inst->dst) % REG_SIZE; 120 } 121 122 /* 123 * Return whether the instruction has an unsupported channel bit layout 124 * specified for the i-th source region. 125 */ 126 bool 127 has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst, 128 unsigned i) 129 { 130 if (is_unordered(inst) || inst->is_control_source(i)) 131 return false; 132 133 /* Empirical testing shows that Broadwell has a bug affecting half-float 134 * MAD instructions when any of its sources has a non-zero offset, such 135 * as: 136 * 137 * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q }; 138 * 139 * We used to generate code like this for SIMD8 executions where we 140 * used to pack components Y and W of a vector at offset 16B of a SIMD 141 * register. The problem doesn't occur if the stride of the source is 0. 142 */ 143 if (devinfo->gen == 8 && 144 inst->opcode == BRW_OPCODE_MAD && 145 inst->src[i].type == BRW_REGISTER_TYPE_HF && 146 reg_offset(inst->src[i]) % REG_SIZE > 0 && 147 inst->src[i].stride != 0) { 148 return true; 149 } 150 151 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); 152 const unsigned src_byte_stride = inst->src[i].stride * 153 type_sz(inst->src[i].type); 154 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; 155 const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE; 156 157 return has_dst_aligned_region_restriction(devinfo, inst) && 158 !is_uniform(inst->src[i]) && 159 (src_byte_stride != dst_byte_stride || 160 src_byte_offset != dst_byte_offset); 161 } 162 163 /* 164 * Return whether the instruction has an unsupported channel bit layout 165 * specified for the destination region. 166 */ 167 bool 168 has_invalid_dst_region(const gen_device_info *devinfo, 169 const fs_inst *inst) 170 { 171 if (is_unordered(inst)) { 172 return false; 173 } else { 174 const brw_reg_type exec_type = get_exec_type(inst); 175 const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; 176 const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); 177 const bool is_narrowing_conversion = !is_byte_raw_mov(inst) && 178 type_sz(inst->dst.type) < type_sz(exec_type); 179 180 return (has_dst_aligned_region_restriction(devinfo, inst) && 181 (required_dst_byte_stride(inst) != dst_byte_stride || 182 required_dst_byte_offset(inst) != dst_byte_offset)) || 183 (is_narrowing_conversion && 184 required_dst_byte_stride(inst) != dst_byte_stride); 185 } 186 } 187 188 /* 189 * Return whether the instruction has unsupported source modifiers 190 * specified for the i-th source region. 191 */ 192 bool 193 has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst, 194 unsigned i) 195 { 196 return !inst->can_do_source_mods(devinfo) && 197 (inst->src[i].negate || inst->src[i].abs); 198 } 199 200 /* 201 * Return whether the instruction has an unsupported type conversion 202 * specified for the destination. 203 */ 204 bool 205 has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst) 206 { 207 switch (inst->opcode) { 208 case BRW_OPCODE_MOV: 209 return false; 210 case BRW_OPCODE_SEL: 211 return inst->dst.type != get_exec_type(inst); 212 case SHADER_OPCODE_BROADCAST: 213 case SHADER_OPCODE_MOV_INDIRECT: 214 /* The source and destination types of these may be hard-coded to 215 * integer at codegen time due to hardware limitations of 64-bit 216 * types. 217 */ 218 return ((devinfo->gen == 7 && !devinfo->is_haswell) || 219 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) && 220 type_sz(inst->src[0].type) > 4 && 221 inst->dst.type != inst->src[0].type; 222 default: 223 /* FIXME: We assume the opcodes don't explicitly mentioned before 224 * just work fine with arbitrary conversions. 225 */ 226 return false; 227 } 228 } 229 230 /** 231 * Return whether the instruction has non-standard semantics for the 232 * conditional mod which don't cause the flag register to be updated with 233 * the comparison result. 234 */ 235 bool 236 has_inconsistent_cmod(const fs_inst *inst) 237 { 238 return inst->opcode == BRW_OPCODE_SEL || 239 inst->opcode == BRW_OPCODE_CSEL || 240 inst->opcode == BRW_OPCODE_IF || 241 inst->opcode == BRW_OPCODE_WHILE; 242 } 243 244 bool 245 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); 246} 247 248namespace brw { 249 /** 250 * Remove any modifiers from the \p i-th source region of the instruction, 251 * including negate, abs and any implicit type conversion to the execution 252 * type. Instead any source modifiers will be implemented as a separate 253 * MOV instruction prior to the original instruction. 254 */ 255 bool 256 lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) 257 { 258 assert(inst->components_read(i) == 1); 259 const fs_builder ibld(v, block, inst); 260 const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); 261 262 lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); 263 inst->src[i] = tmp; 264 265 return true; 266 } 267} 268 269namespace { 270 /** 271 * Remove any modifiers from the destination region of the instruction, 272 * including saturate, conditional mod and any implicit type conversion 273 * from the execution type. Instead any destination modifiers will be 274 * implemented as a separate MOV instruction after the original 275 * instruction. 276 */ 277 bool 278 lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst) 279 { 280 const fs_builder ibld(v, block, inst); 281 const brw_reg_type type = get_exec_type(inst); 282 /* Not strictly necessary, but if possible use a temporary with the same 283 * channel alignment as the current destination in order to avoid 284 * violating the restrictions enforced later on by lower_src_region() 285 * and lower_dst_region(), which would introduce additional copy 286 * instructions into the program unnecessarily. 287 */ 288 const unsigned stride = 289 type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 : 290 type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); 291 const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride); 292 293 /* Emit a MOV taking care of all the destination modifiers. */ 294 fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); 295 mov->saturate = inst->saturate; 296 if (!has_inconsistent_cmod(inst)) 297 mov->conditional_mod = inst->conditional_mod; 298 if (inst->opcode != BRW_OPCODE_SEL) { 299 mov->predicate = inst->predicate; 300 mov->predicate_inverse = inst->predicate_inverse; 301 } 302 mov->flag_subreg = inst->flag_subreg; 303 lower_instruction(v, block, mov); 304 305 /* Point the original instruction at the temporary, and clean up any 306 * destination modifiers. 307 */ 308 assert(inst->size_written == inst->dst.component_size(inst->exec_size)); 309 inst->dst = tmp; 310 inst->size_written = inst->dst.component_size(inst->exec_size); 311 inst->saturate = false; 312 if (!has_inconsistent_cmod(inst)) 313 inst->conditional_mod = BRW_CONDITIONAL_NONE; 314 315 assert(!inst->flags_written() || !mov->predicate); 316 return true; 317 } 318 319 /** 320 * Remove any non-trivial shuffling of data from the \p i-th source region 321 * of the instruction. Instead implement the region as a series of integer 322 * copies into a temporary with the same channel layout as the destination. 323 */ 324 bool 325 lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) 326 { 327 assert(inst->components_read(i) == 1); 328 const fs_builder ibld(v, block, inst); 329 const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride / 330 type_sz(inst->src[i].type); 331 assert(stride > 0); 332 const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride), 333 stride); 334 335 /* Emit a series of 32-bit integer copies with any source modifiers 336 * cleaned up (because their semantics are dependent on the type). 337 */ 338 const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), 339 false); 340 const unsigned n = type_sz(tmp.type) / type_sz(raw_type); 341 fs_reg raw_src = inst->src[i]; 342 raw_src.negate = false; 343 raw_src.abs = false; 344 345 for (unsigned j = 0; j < n; j++) 346 ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j)); 347 348 /* Point the original instruction at the temporary, making sure to keep 349 * any source modifiers in the instruction. 350 */ 351 fs_reg lower_src = tmp; 352 lower_src.negate = inst->src[i].negate; 353 lower_src.abs = inst->src[i].abs; 354 inst->src[i] = lower_src; 355 356 return true; 357 } 358 359 /** 360 * Remove any non-trivial shuffling of data from the destination region of 361 * the instruction. Instead implement the region as a series of integer 362 * copies from a temporary with a channel layout compatible with the 363 * sources. 364 */ 365 bool 366 lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst) 367 { 368 /* We cannot replace the result of an integer multiply which writes the 369 * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit 370 * value whereas the MOV will act on only 32 or 33 bits of the 371 * accumulator. 372 */ 373 assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() || 374 brw_reg_type_is_floating_point(inst->dst.type)); 375 376 const fs_builder ibld(v, block, inst); 377 const unsigned stride = required_dst_byte_stride(inst) / 378 type_sz(inst->dst.type); 379 assert(stride > 0); 380 const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride), 381 stride); 382 383 /* Emit a series of 32-bit integer copies from the temporary into the 384 * original destination. 385 */ 386 const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), 387 false); 388 const unsigned n = type_sz(tmp.type) / type_sz(raw_type); 389 390 if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) { 391 /* Note that in general we cannot simply predicate the copies on the 392 * same flag register as the original instruction, since it may have 393 * been overwritten by the instruction itself. Instead initialize 394 * the temporary with the previous contents of the destination 395 * register. 396 */ 397 for (unsigned j = 0; j < n; j++) 398 ibld.MOV(subscript(tmp, raw_type, j), 399 subscript(inst->dst, raw_type, j)); 400 } 401 402 for (unsigned j = 0; j < n; j++) 403 ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j), 404 subscript(tmp, raw_type, j)); 405 406 /* Point the original instruction at the temporary, making sure to keep 407 * any destination modifiers in the instruction. 408 */ 409 assert(inst->size_written == inst->dst.component_size(inst->exec_size)); 410 inst->dst = tmp; 411 inst->size_written = inst->dst.component_size(inst->exec_size); 412 413 return true; 414 } 415 416 /** 417 * Legalize the source and destination regioning controls of the specified 418 * instruction. 419 */ 420 bool 421 lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst) 422 { 423 const gen_device_info *devinfo = v->devinfo; 424 bool progress = false; 425 426 if (has_invalid_conversion(devinfo, inst)) 427 progress |= lower_dst_modifiers(v, block, inst); 428 429 if (has_invalid_dst_region(devinfo, inst)) 430 progress |= lower_dst_region(v, block, inst); 431 432 for (unsigned i = 0; i < inst->sources; i++) { 433 if (has_invalid_src_modifiers(devinfo, inst, i)) 434 progress |= lower_src_modifiers(v, block, inst, i); 435 436 if (has_invalid_src_region(devinfo, inst, i)) 437 progress |= lower_src_region(v, block, inst, i); 438 } 439 440 return progress; 441 } 442} 443 444bool 445fs_visitor::lower_regioning() 446{ 447 bool progress = false; 448 449 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) 450 progress |= lower_instruction(this, block, inst); 451 452 if (progress) 453 invalidate_live_intervals(); 454 455 return progress; 456} 457