1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2015 Red Hat 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg * 23b8e80941Smrg * Authors: 24b8e80941Smrg * Rob Clark <robclark@freedesktop.org> 25b8e80941Smrg */ 26b8e80941Smrg 27b8e80941Smrg#include "nir.h" 28b8e80941Smrg#include "nir_builder.h" 29b8e80941Smrg 30b8e80941Smrg/* Lowers idiv/udiv/umod 31b8e80941Smrg * Based on NV50LegalizeSSA::handleDIV() 32b8e80941Smrg * 33b8e80941Smrg * Note that this is probably not enough precision for compute shaders. 34b8e80941Smrg * Perhaps we want a second higher precision (looping) version of this? 35b8e80941Smrg * Or perhaps we assume if you can do compute shaders you can also 36b8e80941Smrg * branch out to a pre-optimized shader library routine.. 37b8e80941Smrg */ 38b8e80941Smrg 39b8e80941Smrgstatic bool 40b8e80941Smrgconvert_instr(nir_builder *bld, nir_alu_instr *alu) 41b8e80941Smrg{ 42b8e80941Smrg nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r; 43b8e80941Smrg nir_op op = alu->op; 44b8e80941Smrg bool is_signed; 45b8e80941Smrg 46b8e80941Smrg if ((op != nir_op_idiv) && 47b8e80941Smrg (op != nir_op_udiv) && 48b8e80941Smrg (op != nir_op_umod)) 49b8e80941Smrg return false; 50b8e80941Smrg 51b8e80941Smrg is_signed = (op == nir_op_idiv); 52b8e80941Smrg 53b8e80941Smrg bld->cursor = nir_before_instr(&alu->instr); 54b8e80941Smrg 55b8e80941Smrg numer = nir_ssa_for_alu_src(bld, alu, 0); 56b8e80941Smrg denom = nir_ssa_for_alu_src(bld, alu, 1); 57b8e80941Smrg 58b8e80941Smrg if (is_signed) { 59b8e80941Smrg af = nir_i2f32(bld, numer); 60b8e80941Smrg bf = nir_i2f32(bld, denom); 61b8e80941Smrg af = nir_fabs(bld, af); 62b8e80941Smrg bf = nir_fabs(bld, bf); 63b8e80941Smrg a = nir_iabs(bld, numer); 64b8e80941Smrg b = nir_iabs(bld, denom); 65b8e80941Smrg } else { 66b8e80941Smrg af = nir_u2f32(bld, numer); 67b8e80941Smrg bf = nir_u2f32(bld, denom); 68b8e80941Smrg a = numer; 69b8e80941Smrg b = denom; 70b8e80941Smrg } 71b8e80941Smrg 72b8e80941Smrg /* get first result: */ 73b8e80941Smrg bf = nir_frcp(bld, bf); 74b8e80941Smrg bf = nir_isub(bld, bf, nir_imm_int(bld, 2)); /* yes, really */ 75b8e80941Smrg q = nir_fmul(bld, af, bf); 76b8e80941Smrg 77b8e80941Smrg if (is_signed) { 78b8e80941Smrg q = nir_f2i32(bld, q); 79b8e80941Smrg } else { 80b8e80941Smrg q = nir_f2u32(bld, q); 81b8e80941Smrg } 82b8e80941Smrg 83b8e80941Smrg /* get error of first result: */ 84b8e80941Smrg r = nir_imul(bld, q, b); 85b8e80941Smrg r = nir_isub(bld, a, r); 86b8e80941Smrg r = nir_u2f32(bld, r); 87b8e80941Smrg r = nir_fmul(bld, r, bf); 88b8e80941Smrg r = nir_f2u32(bld, r); 89b8e80941Smrg 90b8e80941Smrg /* add quotients: */ 91b8e80941Smrg q = nir_iadd(bld, q, r); 92b8e80941Smrg 93b8e80941Smrg /* correction: if modulus >= divisor, add 1 */ 94b8e80941Smrg r = nir_imul(bld, q, b); 95b8e80941Smrg r = nir_isub(bld, a, r); 96b8e80941Smrg 97b8e80941Smrg r = nir_uge(bld, r, b); 98b8e80941Smrg r = nir_b2i32(bld, r); 99b8e80941Smrg 100b8e80941Smrg q = nir_iadd(bld, q, r); 101b8e80941Smrg if (is_signed) { 102b8e80941Smrg /* fix the sign: */ 103b8e80941Smrg r = nir_ixor(bld, numer, denom); 104b8e80941Smrg r = nir_ilt(bld, r, nir_imm_int(bld, 0)); 105b8e80941Smrg b = nir_ineg(bld, q); 106b8e80941Smrg q = nir_bcsel(bld, r, b, q); 107b8e80941Smrg } 108b8e80941Smrg 109b8e80941Smrg if (op == nir_op_umod) { 110b8e80941Smrg /* division result in q */ 111b8e80941Smrg r = nir_imul(bld, q, b); 112b8e80941Smrg q = nir_isub(bld, a, r); 113b8e80941Smrg } 114b8e80941Smrg 115b8e80941Smrg assert(alu->dest.dest.is_ssa); 116b8e80941Smrg nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(q)); 117b8e80941Smrg 118b8e80941Smrg return true; 119b8e80941Smrg} 120b8e80941Smrg 121b8e80941Smrgstatic bool 122b8e80941Smrgconvert_impl(nir_function_impl *impl) 123b8e80941Smrg{ 124b8e80941Smrg nir_builder b; 125b8e80941Smrg nir_builder_init(&b, impl); 126b8e80941Smrg bool progress = false; 127b8e80941Smrg 128b8e80941Smrg nir_foreach_block(block, impl) { 129b8e80941Smrg nir_foreach_instr_safe(instr, block) { 130b8e80941Smrg if (instr->type == nir_instr_type_alu) 131b8e80941Smrg progress |= convert_instr(&b, nir_instr_as_alu(instr)); 132b8e80941Smrg } 133b8e80941Smrg } 134b8e80941Smrg 135b8e80941Smrg nir_metadata_preserve(impl, nir_metadata_block_index | 136b8e80941Smrg nir_metadata_dominance); 137b8e80941Smrg 138b8e80941Smrg return progress; 139b8e80941Smrg} 140b8e80941Smrg 141b8e80941Smrgbool 142b8e80941Smrgnir_lower_idiv(nir_shader *shader) 143b8e80941Smrg{ 144b8e80941Smrg bool progress = false; 145b8e80941Smrg 146b8e80941Smrg nir_foreach_function(function, shader) { 147b8e80941Smrg if (function->impl) 148b8e80941Smrg progress |= convert_impl(function->impl); 149b8e80941Smrg } 150b8e80941Smrg 151b8e80941Smrg return progress; 152b8e80941Smrg} 153