1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2018 Advanced Micro Devices, Inc. 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#ifndef FAST_IDIV_BY_CONST_H 25b8e80941Smrg#define FAST_IDIV_BY_CONST_H 26b8e80941Smrg 27b8e80941Smrg/* Imported from: 28b8e80941Smrg * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c 29b8e80941Smrg */ 30b8e80941Smrg 31b8e80941Smrg#include <inttypes.h> 32b8e80941Smrg#include <limits.h> 33b8e80941Smrg#include <assert.h> 34b8e80941Smrg 35b8e80941Smrg#ifdef __cplusplus 36b8e80941Smrgextern "C" { 37b8e80941Smrg#endif 38b8e80941Smrg 39b8e80941Smrg/* Computes "magic info" for performing signed division by a fixed integer D. 40b8e80941Smrg * The type 'sint_t' is assumed to be defined as a signed integer type large 41b8e80941Smrg * enough to hold both the dividend and the divisor. 42b8e80941Smrg * Here >> is arithmetic (signed) shift, and >>> is logical shift. 43b8e80941Smrg * 44b8e80941Smrg * To emit code for n/d, rounding towards zero, use the following sequence: 45b8e80941Smrg * 46b8e80941Smrg * m = compute_signed_magic_info(D) 47b8e80941Smrg * emit("result = (m.multiplier * n) >> SINT_BITS"); 48b8e80941Smrg * if d > 0 and m.multiplier < 0: emit("result += n") 49b8e80941Smrg * if d < 0 and m.multiplier > 0: emit("result -= n") 50b8e80941Smrg * if m.post_shift > 0: emit("result >>= m.shift") 51b8e80941Smrg * emit("result += (result < 0)") 52b8e80941Smrg * 53b8e80941Smrg * The shifts by SINT_BITS may be "free" if the high half of the full multiply 54b8e80941Smrg * is put in a separate register. 55b8e80941Smrg * 56b8e80941Smrg * The final add can of course be implemented via the sign bit, e.g. 57b8e80941Smrg * result += (result >>> (SINT_BITS - 1)) 58b8e80941Smrg * or 59b8e80941Smrg * result -= (result >> (SINT_BITS - 1)) 60b8e80941Smrg * 61b8e80941Smrg * This code is heavily indebted to Hacker's Delight by Henry Warren. 62b8e80941Smrg * See http://www.hackersdelight.org/HDcode/magic.c.txt 63b8e80941Smrg * Used with permission from http://www.hackersdelight.org/permissions.htm 64b8e80941Smrg */ 65b8e80941Smrg 66b8e80941Smrgstruct util_fast_sdiv_info { 67b8e80941Smrg int64_t multiplier; /* the "magic number" multiplier */ 68b8e80941Smrg unsigned shift; /* shift for the dividend after multiplying */ 69b8e80941Smrg}; 70b8e80941Smrg 71b8e80941Smrgstruct util_fast_sdiv_info 72b8e80941Smrgutil_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS); 73b8e80941Smrg 74b8e80941Smrg/* Computes "magic info" for performing unsigned division by a fixed positive 75b8e80941Smrg * integer D. UINT_BITS is the bit size at which the final "magic" 76b8e80941Smrg * calculation will be performed; it is assumed to be large enough to hold 77b8e80941Smrg * both the dividand and the divisor. num_bits can be set appropriately if n 78b8e80941Smrg * is known to be smaller than calc_bits; if this is not known then UINT_BITS 79b8e80941Smrg * for num_bits. 80b8e80941Smrg * 81b8e80941Smrg * Assume we have a hardware register of width UINT_BITS, a known constant D 82b8e80941Smrg * which is not zero and not a power of 2, and a variable n of width num_bits 83b8e80941Smrg * (which may be up to UINT_BITS). To emit code for n/d, use one of the two 84b8e80941Smrg * following sequences (here >>> refers to a logical bitshift): 85b8e80941Smrg * 86b8e80941Smrg * m = compute_unsigned_magic_info(D, num_bits) 87b8e80941Smrg * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 88b8e80941Smrg * if m.increment: emit("n = saturated_increment(n)") 89b8e80941Smrg * emit("result = (m.multiplier * n) >>> UINT_BITS") 90b8e80941Smrg * if m.post_shift > 0: emit("result >>>= m.post_shift") 91b8e80941Smrg * 92b8e80941Smrg * or 93b8e80941Smrg * 94b8e80941Smrg * m = compute_unsigned_magic_info(D, num_bits) 95b8e80941Smrg * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 96b8e80941Smrg * emit("result = m.multiplier * n") 97b8e80941Smrg * if m.increment: emit("result = result + m.multiplier") 98b8e80941Smrg * emit("result >>>= UINT_BITS") 99b8e80941Smrg * if m.post_shift > 0: emit("result >>>= m.post_shift") 100b8e80941Smrg * 101b8e80941Smrg * This second version works even if D is 1. The shifts by UINT_BITS may be 102b8e80941Smrg * "free" if the high half of the full multiply is put in a separate register. 103b8e80941Smrg * 104b8e80941Smrg * saturated_increment(n) means "increment n unless it would wrap to 0," i.e. 105b8e80941Smrg * if n == (1 << UINT_BITS)-1: result = n 106b8e80941Smrg * else: result = n+1 107b8e80941Smrg * A common way to implement this is with the carry bit. For example, on x86: 108b8e80941Smrg * add 1 109b8e80941Smrg * sbb 0 110b8e80941Smrg * 111b8e80941Smrg * Some invariants: 112b8e80941Smrg * 1: At least one of pre_shift and increment is zero 113b8e80941Smrg * 2: multiplier is never zero 114b8e80941Smrg * 115b8e80941Smrg * This code incorporates the "round down" optimization per ridiculous_fish. 116b8e80941Smrg */ 117b8e80941Smrg 118b8e80941Smrgstruct util_fast_udiv_info { 119b8e80941Smrg uint64_t multiplier; /* the "magic number" multiplier */ 120b8e80941Smrg unsigned pre_shift; /* shift for the dividend before multiplying */ 121b8e80941Smrg unsigned post_shift; /* shift for the dividend after multiplying */ 122b8e80941Smrg int increment; /* 0 or 1; if set then increment the numerator, using one of 123b8e80941Smrg the two strategies */ 124b8e80941Smrg}; 125b8e80941Smrg 126b8e80941Smrgstruct util_fast_udiv_info 127b8e80941Smrgutil_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS); 128b8e80941Smrg 129b8e80941Smrg/* Below are possible options for dividing by a uniform in a shader where 130b8e80941Smrg * the divisor is constant but not known at compile time. 131b8e80941Smrg */ 132b8e80941Smrg 133b8e80941Smrg/* Full version. */ 134b8e80941Smrgstatic inline uint32_t 135b8e80941Smrgutil_fast_udiv32(uint32_t n, struct util_fast_udiv_info info) 136b8e80941Smrg{ 137b8e80941Smrg n = n >> info.pre_shift; 138b8e80941Smrg /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps 139b8e80941Smrg * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD. 140b8e80941Smrg * 141b8e80941Smrg * If you have unsigned 64-bit MAD with 32-bit inputs, you can do: 142b8e80941Smrg * increment = increment ? multiplier : 0; // on the CPU 143b8e80941Smrg * (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD 144b8e80941Smrg */ 145b8e80941Smrg n = (((uint64_t)n + info.increment) * info.multiplier) >> 32; 146b8e80941Smrg n = n >> info.post_shift; 147b8e80941Smrg return n; 148b8e80941Smrg} 149b8e80941Smrg 150b8e80941Smrg/* A little more efficient version if n != UINT_MAX, i.e. no unsigned 151b8e80941Smrg * wraparound in the computation. 152b8e80941Smrg */ 153b8e80941Smrgstatic inline uint32_t 154b8e80941Smrgutil_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info) 155b8e80941Smrg{ 156b8e80941Smrg assert(n != UINT32_MAX); 157b8e80941Smrg n = n >> info.pre_shift; 158b8e80941Smrg n = n + info.increment; 159b8e80941Smrg n = ((uint64_t)n * info.multiplier) >> 32; 160b8e80941Smrg n = n >> info.post_shift; 161b8e80941Smrg return n; 162b8e80941Smrg} 163b8e80941Smrg 164b8e80941Smrg/* Even faster version but both operands must be 31-bit unsigned integers 165b8e80941Smrg * and the divisor must be greater than 1. 166b8e80941Smrg * 167b8e80941Smrg * info must be computed with num_bits == 31. 168b8e80941Smrg */ 169b8e80941Smrgstatic inline uint32_t 170b8e80941Smrgutil_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info) 171b8e80941Smrg{ 172b8e80941Smrg assert(info.pre_shift == 0); 173b8e80941Smrg assert(info.increment == 0); 174b8e80941Smrg n = ((uint64_t)n * info.multiplier) >> 32; 175b8e80941Smrg n = n >> info.post_shift; 176b8e80941Smrg return n; 177b8e80941Smrg} 178b8e80941Smrg 179b8e80941Smrg#ifdef __cplusplus 180b8e80941Smrg} /* extern C */ 181b8e80941Smrg#endif 182b8e80941Smrg 183b8e80941Smrg#endif /* FAST_IDIV_BY_CONST_H */ 184