101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2010 Intel Corporation 301e04c3fSmrg * Copyright © 2018 Broadcom 401e04c3fSmrg * 501e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 601e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 701e04c3fSmrg * to deal in the Software without restriction, including without limitation 801e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 901e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 1001e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1101e04c3fSmrg * 1201e04c3fSmrg * The above copyright notice and this permission notice (including the next 1301e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1401e04c3fSmrg * Software. 1501e04c3fSmrg * 1601e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1701e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1801e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1901e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2001e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2101e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 2201e04c3fSmrg * DEALINGS IN THE SOFTWARE. 2301e04c3fSmrg */ 2401e04c3fSmrg 2501e04c3fSmrg#include "nir.h" 2601e04c3fSmrg#include "nir_builder.h" 2701e04c3fSmrg 2801e04c3fSmrg/** nir_lower_alu.c 2901e04c3fSmrg * 3001e04c3fSmrg * NIR's home for miscellaneous ALU operation lowering implementations. 3101e04c3fSmrg * 3201e04c3fSmrg * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally 3301e04c3fSmrg * easy to write them there. However, if terms appear multiple times in the 3401e04c3fSmrg * lowered code, it can get very verbose and cause a lot of work for CSE, so 3501e04c3fSmrg * it may end up being easier to write out in C code. 3601e04c3fSmrg * 3701e04c3fSmrg * The shader must be in SSA for this pass. 3801e04c3fSmrg */ 3901e04c3fSmrg 4001e04c3fSmrg#define LOWER_MUL_HIGH (1 << 0) 4101e04c3fSmrg 4201e04c3fSmrgstatic bool 4301e04c3fSmrglower_alu_instr(nir_alu_instr *instr, nir_builder *b) 4401e04c3fSmrg{ 4501e04c3fSmrg nir_ssa_def *lowered = NULL; 4601e04c3fSmrg 4701e04c3fSmrg assert(instr->dest.dest.is_ssa); 4801e04c3fSmrg 4901e04c3fSmrg b->cursor = nir_before_instr(&instr->instr); 5001e04c3fSmrg b->exact = instr->exact; 5101e04c3fSmrg 5201e04c3fSmrg switch (instr->op) { 5301e04c3fSmrg case nir_op_bitfield_reverse: 5401e04c3fSmrg if (b->shader->options->lower_bitfield_reverse) { 5501e04c3fSmrg /* For more details, see: 5601e04c3fSmrg * 5701e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 5801e04c3fSmrg */ 5901e04c3fSmrg nir_ssa_def *c1 = nir_imm_int(b, 1); 6001e04c3fSmrg nir_ssa_def *c2 = nir_imm_int(b, 2); 6101e04c3fSmrg nir_ssa_def *c4 = nir_imm_int(b, 4); 6201e04c3fSmrg nir_ssa_def *c8 = nir_imm_int(b, 8); 6301e04c3fSmrg nir_ssa_def *c16 = nir_imm_int(b, 16); 6401e04c3fSmrg nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 6501e04c3fSmrg nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 6601e04c3fSmrg nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 6701e04c3fSmrg nir_ssa_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff); 6801e04c3fSmrg 6901e04c3fSmrg lowered = nir_ssa_for_alu_src(b, instr, 0); 7001e04c3fSmrg 7101e04c3fSmrg /* Swap odd and even bits. */ 7201e04c3fSmrg lowered = nir_ior(b, 7301e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c1), c55555555), 7401e04c3fSmrg nir_ishl(b, nir_iand(b, lowered, c55555555), c1)); 7501e04c3fSmrg 7601e04c3fSmrg /* Swap consecutive pairs. */ 7701e04c3fSmrg lowered = nir_ior(b, 7801e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c2), c33333333), 7901e04c3fSmrg nir_ishl(b, nir_iand(b, lowered, c33333333), c2)); 8001e04c3fSmrg 8101e04c3fSmrg /* Swap nibbles. */ 8201e04c3fSmrg lowered = nir_ior(b, 8301e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f), 8401e04c3fSmrg nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4)); 8501e04c3fSmrg 8601e04c3fSmrg /* Swap bytes. */ 8701e04c3fSmrg lowered = nir_ior(b, 8801e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff), 8901e04c3fSmrg nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8)); 9001e04c3fSmrg 9101e04c3fSmrg lowered = nir_ior(b, 9201e04c3fSmrg nir_ushr(b, lowered, c16), 9301e04c3fSmrg nir_ishl(b, lowered, c16)); 9401e04c3fSmrg } 9501e04c3fSmrg break; 9601e04c3fSmrg 9701e04c3fSmrg case nir_op_bit_count: 9801e04c3fSmrg if (b->shader->options->lower_bit_count) { 9901e04c3fSmrg /* For more details, see: 10001e04c3fSmrg * 10101e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 10201e04c3fSmrg */ 10301e04c3fSmrg nir_ssa_def *c1 = nir_imm_int(b, 1); 10401e04c3fSmrg nir_ssa_def *c2 = nir_imm_int(b, 2); 10501e04c3fSmrg nir_ssa_def *c4 = nir_imm_int(b, 4); 10601e04c3fSmrg nir_ssa_def *c24 = nir_imm_int(b, 24); 10701e04c3fSmrg nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333); 10801e04c3fSmrg nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555); 10901e04c3fSmrg nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f); 11001e04c3fSmrg nir_ssa_def *c01010101 = nir_imm_int(b, 0x01010101); 11101e04c3fSmrg 11201e04c3fSmrg lowered = nir_ssa_for_alu_src(b, instr, 0); 11301e04c3fSmrg 11401e04c3fSmrg lowered = nir_isub(b, lowered, 11501e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c1), c55555555)); 11601e04c3fSmrg 11701e04c3fSmrg lowered = nir_iadd(b, 11801e04c3fSmrg nir_iand(b, lowered, c33333333), 11901e04c3fSmrg nir_iand(b, nir_ushr(b, lowered, c2), c33333333)); 12001e04c3fSmrg 12101e04c3fSmrg lowered = nir_ushr(b, 12201e04c3fSmrg nir_imul(b, 12301e04c3fSmrg nir_iand(b, 12401e04c3fSmrg nir_iadd(b, 12501e04c3fSmrg lowered, 12601e04c3fSmrg nir_ushr(b, lowered, c4)), 12701e04c3fSmrg c0f0f0f0f), 12801e04c3fSmrg c01010101), 12901e04c3fSmrg c24); 13001e04c3fSmrg } 13101e04c3fSmrg break; 13201e04c3fSmrg 13301e04c3fSmrg case nir_op_imul_high: 13401e04c3fSmrg case nir_op_umul_high: 13501e04c3fSmrg if (b->shader->options->lower_mul_high) { 13601e04c3fSmrg nir_ssa_def *src0 = nir_ssa_for_alu_src(b, instr, 0); 13701e04c3fSmrg nir_ssa_def *src1 = nir_ssa_for_alu_src(b, instr, 1); 1387ec681f3Smrg if (src0->bit_size < 32) { 1397ec681f3Smrg /* Just do the math in 32-bit space and shift the result */ 1407ec681f3Smrg nir_alu_type base_type = nir_op_infos[instr->op].output_type; 1417ec681f3Smrg nir_op upcast_op = nir_type_conversion_op(base_type | src0->bit_size, base_type | 32, nir_rounding_mode_undef); 1427ec681f3Smrg nir_op downscast_op = nir_type_conversion_op(base_type | 32, base_type | src0->bit_size, nir_rounding_mode_undef); 1437ec681f3Smrg 1447ec681f3Smrg nir_ssa_def *src0_32 = nir_build_alu(b, upcast_op, src0, NULL, NULL, NULL); 1457ec681f3Smrg nir_ssa_def *src1_32 = nir_build_alu(b, upcast_op, src1, NULL, NULL, NULL); 1467ec681f3Smrg nir_ssa_def *dest_32 = nir_imul(b, src0_32, src1_32); 1477ec681f3Smrg nir_ssa_def *dest_shifted = nir_ishr(b, dest_32, nir_imm_int(b, src0->bit_size)); 1487ec681f3Smrg lowered = nir_build_alu(b, downscast_op, dest_shifted, NULL, NULL, NULL); 1497ec681f3Smrg } else { 1507ec681f3Smrg nir_ssa_def *c1 = nir_imm_intN_t(b, 1, src0->bit_size); 1517ec681f3Smrg nir_ssa_def *cshift = nir_imm_int(b, src0->bit_size / 2); 1527ec681f3Smrg nir_ssa_def *cmask = nir_imm_intN_t(b, (1ull << (src0->bit_size / 2)) - 1, src0->bit_size); 1537ec681f3Smrg nir_ssa_def *different_signs = NULL; 1547ec681f3Smrg if (instr->op == nir_op_imul_high) { 1557ec681f3Smrg nir_ssa_def *c0 = nir_imm_intN_t(b, 0, src0->bit_size); 1567ec681f3Smrg different_signs = nir_ixor(b, 1577ec681f3Smrg nir_ilt(b, src0, c0), 1587ec681f3Smrg nir_ilt(b, src1, c0)); 1597ec681f3Smrg src0 = nir_iabs(b, src0); 1607ec681f3Smrg src1 = nir_iabs(b, src1); 1617ec681f3Smrg } 16201e04c3fSmrg 1637ec681f3Smrg /* ABCD 1647ec681f3Smrg * * EFGH 1657ec681f3Smrg * ====== 1667ec681f3Smrg * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1677ec681f3Smrg * 1687ec681f3Smrg * Start by splitting into the 4 multiplies. 16901e04c3fSmrg */ 1707ec681f3Smrg nir_ssa_def *src0l = nir_iand(b, src0, cmask); 1717ec681f3Smrg nir_ssa_def *src1l = nir_iand(b, src1, cmask); 1727ec681f3Smrg nir_ssa_def *src0h = nir_ushr(b, src0, cshift); 1737ec681f3Smrg nir_ssa_def *src1h = nir_ushr(b, src1, cshift); 1747ec681f3Smrg 1757ec681f3Smrg nir_ssa_def *lo = nir_imul(b, src0l, src1l); 1767ec681f3Smrg nir_ssa_def *m1 = nir_imul(b, src0l, src1h); 1777ec681f3Smrg nir_ssa_def *m2 = nir_imul(b, src0h, src1l); 1787ec681f3Smrg nir_ssa_def *hi = nir_imul(b, src0h, src1h); 1797ec681f3Smrg 1807ec681f3Smrg nir_ssa_def *tmp; 1817ec681f3Smrg 1827ec681f3Smrg tmp = nir_ishl(b, m1, cshift); 1837ec681f3Smrg hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 1847ec681f3Smrg lo = nir_iadd(b, lo, tmp); 1857ec681f3Smrg hi = nir_iadd(b, hi, nir_ushr(b, m1, cshift)); 1867ec681f3Smrg 1877ec681f3Smrg tmp = nir_ishl(b, m2, cshift); 1887ec681f3Smrg hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1)); 1897ec681f3Smrg lo = nir_iadd(b, lo, tmp); 1907ec681f3Smrg hi = nir_iadd(b, hi, nir_ushr(b, m2, cshift)); 1917ec681f3Smrg 1927ec681f3Smrg if (instr->op == nir_op_imul_high) { 1937ec681f3Smrg /* For channels where different_signs is set we have to perform a 1947ec681f3Smrg * 64-bit negation. This is *not* the same as just negating the 1957ec681f3Smrg * high 32-bits. Consider -3 * 2. The high 32-bits is 0, but the 1967ec681f3Smrg * desired result is -1, not -0! Recall -x == ~x + 1. 1977ec681f3Smrg */ 1987ec681f3Smrg hi = nir_bcsel(b, different_signs, 1997ec681f3Smrg nir_iadd(b, 2007ec681f3Smrg nir_inot(b, hi), 2017ec681f3Smrg nir_iand(b, 2027ec681f3Smrg nir_uadd_carry(b, 2037ec681f3Smrg nir_inot(b, lo), 2047ec681f3Smrg c1), 2057ec681f3Smrg nir_imm_intN_t(b, 1, src0->bit_size))), 2067ec681f3Smrg hi); 2077ec681f3Smrg } 20801e04c3fSmrg 2097ec681f3Smrg lowered = hi; 2107ec681f3Smrg } 21101e04c3fSmrg } 21201e04c3fSmrg break; 21301e04c3fSmrg 21401e04c3fSmrg default: 21501e04c3fSmrg break; 21601e04c3fSmrg } 21701e04c3fSmrg 21801e04c3fSmrg if (lowered) { 2197ec681f3Smrg nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, lowered); 22001e04c3fSmrg nir_instr_remove(&instr->instr); 22101e04c3fSmrg return true; 22201e04c3fSmrg } else { 22301e04c3fSmrg return false; 22401e04c3fSmrg } 22501e04c3fSmrg} 22601e04c3fSmrg 22701e04c3fSmrgbool 22801e04c3fSmrgnir_lower_alu(nir_shader *shader) 22901e04c3fSmrg{ 23001e04c3fSmrg bool progress = false; 23101e04c3fSmrg 23201e04c3fSmrg if (!shader->options->lower_bitfield_reverse && 23301e04c3fSmrg !shader->options->lower_mul_high) 23401e04c3fSmrg return false; 23501e04c3fSmrg 23601e04c3fSmrg nir_foreach_function(function, shader) { 23701e04c3fSmrg if (function->impl) { 23801e04c3fSmrg nir_builder builder; 23901e04c3fSmrg nir_builder_init(&builder, function->impl); 24001e04c3fSmrg 24101e04c3fSmrg nir_foreach_block(block, function->impl) { 24201e04c3fSmrg nir_foreach_instr_safe(instr, block) { 24301e04c3fSmrg if (instr->type == nir_instr_type_alu) { 24401e04c3fSmrg progress = lower_alu_instr(nir_instr_as_alu(instr), 24501e04c3fSmrg &builder) || progress; 24601e04c3fSmrg } 24701e04c3fSmrg } 24801e04c3fSmrg } 24901e04c3fSmrg 25001e04c3fSmrg if (progress) { 25101e04c3fSmrg nir_metadata_preserve(function->impl, 25201e04c3fSmrg nir_metadata_block_index | 25301e04c3fSmrg nir_metadata_dominance); 25401e04c3fSmrg } 25501e04c3fSmrg } 25601e04c3fSmrg } 25701e04c3fSmrg 25801e04c3fSmrg return progress; 25901e04c3fSmrg} 260