1/* 2 * Copyright © 2010 Intel Corporation 3 * Copyright © 2018 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 * DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "nir.h" 26#include "nir_builder.h" 27 28/** nir_lower_alu.c 29 * 30 * NIR's home for miscellaneous ALU operation lowering implementations. 31 * 32 * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally 33 * easy to write them there. However, if terms appear multiple times in the 34 * lowered code, it can get very verbose and cause a lot of work for CSE, so 35 * it may end up being easier to write out in C code. 36 * 37 * The shader must be in SSA for this pass. 38 */ 39 40#define LOWER_MUL_HIGH (1 << 0) 41 42static bool 43lower_alu_instr(nir_alu_instr *instr, nir_builder *b) 44{ 45 nir_ssa_def *lowered = NULL; 46 47 assert(instr->dest.dest.is_ssa); 48 49 b->cursor = nir_before_instr(&instr->instr); 50 b->exact = instr->exact; 51 52 switch (instr->op) { 53 case nir_op_bitfield_reverse: 54 if (b->shader->options->lower_bitfield_reverse) { 55 /* For more details, see: 56 * 57 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 58 */ 59 nir_ssa_def *c1 = nir_imm_int(b, 1); 60 nir_ssa_def *c2 = nir_imm_int(b, 2); 61 nir_ssa_def *c4 = nir_imm_int(b, 4); 62 nir_ssa_def *c8 = nir_imm_int(b, 8); 63 nir_ssa_def *c16 = nir_imm_int(b, 16); 64 nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 65 nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 66 nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 67 nir_ssa_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff); 68 69 lowered = nir_ssa_for_alu_src(b, instr, 0); 70 71 /* Swap odd and even bits. */ 72 lowered = nir_ior(b, 73 nir_iand(b, nir_ushr(b, lowered, c1), c55555555), 74 nir_ishl(b, nir_iand(b, lowered, c55555555), c1)); 75 76 /* Swap consecutive pairs. */ 77 lowered = nir_ior(b, 78 nir_iand(b, nir_ushr(b, lowered, c2), c33333333), 79 nir_ishl(b, nir_iand(b, lowered, c33333333), c2)); 80 81 /* Swap nibbles. */ 82 lowered = nir_ior(b, 83 nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f), 84 nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4)); 85 86 /* Swap bytes. */ 87 lowered = nir_ior(b, 88 nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff), 89 nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8)); 90 91 lowered = nir_ior(b, 92 nir_ushr(b, lowered, c16), 93 nir_ishl(b, lowered, c16)); 94 } 95 break; 96 97 case nir_op_bit_count: 98 if (b->shader->options->lower_bit_count) { 99 /* For more details, see: 100 * 101 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 102 */ 103 nir_ssa_def *c1 = nir_imm_int(b, 1); 104 nir_ssa_def *c2 = nir_imm_int(b, 2); 105 nir_ssa_def *c4 = nir_imm_int(b, 4); 106 nir_ssa_def *c24 = nir_imm_int(b, 24); 107 nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 108 nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 109 nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 110 nir_ssa_def *c01010101 = nir_imm_int(b, 0x01010101); 111 112 lowered = nir_ssa_for_alu_src(b, instr, 0); 113 114 lowered = nir_isub(b, lowered, 115 nir_iand(b, nir_ushr(b, lowered, c1), c55555555)); 116 117 lowered = nir_iadd(b, 118 nir_iand(b, lowered, c33333333), 119 nir_iand(b, nir_ushr(b, lowered, c2), c33333333)); 120 121 lowered = nir_ushr(b, 122 nir_imul(b, 123 nir_iand(b, 124 nir_iadd(b, 125 lowered, 126 nir_ushr(b, lowered, c4)), 127 c0f0f0f0f), 128 c01010101), 129 c24); 130 } 131 break; 132 133 case nir_op_imul_high: 134 case nir_op_umul_high: 135 if (b->shader->options->lower_mul_high) { 136 nir_ssa_def *c1 = nir_imm_int(b, 1); 137 nir_ssa_def *c16 = nir_imm_int(b, 16); 138 139 nir_ssa_def *src0 = nir_ssa_for_alu_src(b, instr, 0); 140 nir_ssa_def *src1 = nir_ssa_for_alu_src(b, instr, 1); 141 nir_ssa_def *different_signs = NULL; 142 if (instr->op == nir_op_imul_high) { 143 nir_ssa_def *c0 = nir_imm_int(b, 0); 144 different_signs = nir_ixor(b, 145 nir_ilt(b, src0, c0), 146 nir_ilt(b, src1, c0)); 147 src0 = nir_iabs(b, src0); 148 src1 = nir_iabs(b, src1); 149 } 150 151 /* ABCD 152 * * EFGH 153 * ====== 154 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 155 * 156 * Start by splitting into the 4 multiplies. 157 */ 158 nir_ssa_def *src0l = nir_iand(b, src0, nir_imm_int(b, 0xffff)); 159 nir_ssa_def *src1l = nir_iand(b, src1, nir_imm_int(b, 0xffff)); 160 nir_ssa_def *src0h = nir_ushr(b, src0, c16); 161 nir_ssa_def *src1h = nir_ushr(b, src1, c16); 162 163 nir_ssa_def *lo = nir_imul(b, src0l, src1l); 164 nir_ssa_def *m1 = nir_imul(b, src0l, src1h); 165 nir_ssa_def *m2 = nir_imul(b, src0h, src1l); 166 nir_ssa_def *hi = nir_imul(b, src0h, src1h); 167 168 nir_ssa_def *tmp; 169 170 tmp = nir_ishl(b, m1, c16); 171 hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 172 lo = nir_iadd(b, lo, tmp); 173 hi = nir_iadd(b, hi, nir_ushr(b, m1, c16)); 174 175 tmp = nir_ishl(b, m2, c16); 176 hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 177 lo = nir_iadd(b, lo, tmp); 178 hi = nir_iadd(b, hi, nir_ushr(b, m2, c16)); 179 180 if (instr->op == nir_op_imul_high) { 181 /* For channels where different_signs is set we have to perform a 182 * 64-bit negation. This is *not* the same as just negating the 183 * high 32-bits. Consider -3 * 2. The high 32-bits is 0, but the 184 * desired result is -1, not -0! Recall -x == ~x + 1. 185 */ 186 hi = nir_bcsel(b, different_signs, 187 nir_iadd(b, 188 nir_inot(b, hi), 189 nir_iand(b, 190 nir_uadd_carry(b, 191 nir_inot(b, lo), 192 c1), 193 nir_imm_int(b, 1))), 194 hi); 195 } 196 197 lowered = hi; 198 } 199 break; 200 201 default: 202 break; 203 } 204 205 if (lowered) { 206 nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(lowered)); 207 nir_instr_remove(&instr->instr); 208 return true; 209 } else { 210 return false; 211 } 212} 213 214bool 215nir_lower_alu(nir_shader *shader) 216{ 217 bool progress = false; 218 219 if (!shader->options->lower_bitfield_reverse && 220 !shader->options->lower_mul_high) 221 return false; 222 223 nir_foreach_function(function, shader) { 224 if (function->impl) { 225 nir_builder builder; 226 nir_builder_init(&builder, function->impl); 227 228 nir_foreach_block(block, function->impl) { 229 nir_foreach_instr_safe(instr, block) { 230 if (instr->type == nir_instr_type_alu) { 231 progress = lower_alu_instr(nir_instr_as_alu(instr), 232 &builder) || progress; 233 } 234 } 235 } 236 237 if (progress) { 238 nir_metadata_preserve(function->impl, 239 nir_metadata_block_index | 240 nir_metadata_dominance); 241 } 242 } 243 } 244 245 return progress; 246} 247