101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2018 Advanced Micro Devices, Inc. 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg#ifndef FAST_IDIV_BY_CONST_H 2501e04c3fSmrg#define FAST_IDIV_BY_CONST_H 2601e04c3fSmrg 2701e04c3fSmrg/* Imported from: 2801e04c3fSmrg * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c 2901e04c3fSmrg */ 3001e04c3fSmrg 3101e04c3fSmrg#include <inttypes.h> 3201e04c3fSmrg#include <limits.h> 3301e04c3fSmrg#include <assert.h> 3401e04c3fSmrg 3501e04c3fSmrg#ifdef __cplusplus 3601e04c3fSmrgextern "C" { 3701e04c3fSmrg#endif 3801e04c3fSmrg 3901e04c3fSmrg/* Computes "magic info" for performing signed division by a fixed integer D. 4001e04c3fSmrg * The type 'sint_t' is assumed to be defined as a signed integer type large 4101e04c3fSmrg * enough to hold both the dividend and the divisor. 4201e04c3fSmrg * Here >> is arithmetic (signed) shift, and >>> is logical shift. 4301e04c3fSmrg * 4401e04c3fSmrg * To emit code for n/d, rounding towards zero, use the following sequence: 4501e04c3fSmrg * 4601e04c3fSmrg * m = compute_signed_magic_info(D) 4701e04c3fSmrg * emit("result = (m.multiplier * n) >> SINT_BITS"); 4801e04c3fSmrg * if d > 0 and m.multiplier < 0: emit("result += n") 4901e04c3fSmrg * if d < 0 and m.multiplier > 0: emit("result -= n") 5001e04c3fSmrg * if m.post_shift > 0: emit("result >>= m.shift") 5101e04c3fSmrg * emit("result += (result < 0)") 5201e04c3fSmrg * 5301e04c3fSmrg * The shifts by SINT_BITS may be "free" if the high half of the full multiply 5401e04c3fSmrg * is put in a separate register. 5501e04c3fSmrg * 5601e04c3fSmrg * The final add can of course be implemented via the sign bit, e.g. 5701e04c3fSmrg * result += (result >>> (SINT_BITS - 1)) 5801e04c3fSmrg * or 5901e04c3fSmrg * result -= (result >> (SINT_BITS - 1)) 6001e04c3fSmrg * 6101e04c3fSmrg * This code is heavily indebted to Hacker's Delight by Henry Warren. 6201e04c3fSmrg * See http://www.hackersdelight.org/HDcode/magic.c.txt 6301e04c3fSmrg * Used with permission from http://www.hackersdelight.org/permissions.htm 6401e04c3fSmrg */ 6501e04c3fSmrg 6601e04c3fSmrgstruct util_fast_sdiv_info { 6701e04c3fSmrg int64_t multiplier; /* the "magic number" multiplier */ 6801e04c3fSmrg unsigned shift; /* shift for the dividend after multiplying */ 6901e04c3fSmrg}; 7001e04c3fSmrg 7101e04c3fSmrgstruct util_fast_sdiv_info 7201e04c3fSmrgutil_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS); 7301e04c3fSmrg 7401e04c3fSmrg/* Computes "magic info" for performing unsigned division by a fixed positive 7501e04c3fSmrg * integer D. UINT_BITS is the bit size at which the final "magic" 7601e04c3fSmrg * calculation will be performed; it is assumed to be large enough to hold 7701e04c3fSmrg * both the dividand and the divisor. num_bits can be set appropriately if n 7801e04c3fSmrg * is known to be smaller than calc_bits; if this is not known then UINT_BITS 7901e04c3fSmrg * for num_bits. 8001e04c3fSmrg * 8101e04c3fSmrg * Assume we have a hardware register of width UINT_BITS, a known constant D 8201e04c3fSmrg * which is not zero and not a power of 2, and a variable n of width num_bits 8301e04c3fSmrg * (which may be up to UINT_BITS). To emit code for n/d, use one of the two 8401e04c3fSmrg * following sequences (here >>> refers to a logical bitshift): 8501e04c3fSmrg * 8601e04c3fSmrg * m = compute_unsigned_magic_info(D, num_bits) 8701e04c3fSmrg * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 8801e04c3fSmrg * if m.increment: emit("n = saturated_increment(n)") 8901e04c3fSmrg * emit("result = (m.multiplier * n) >>> UINT_BITS") 9001e04c3fSmrg * if m.post_shift > 0: emit("result >>>= m.post_shift") 9101e04c3fSmrg * 9201e04c3fSmrg * or 9301e04c3fSmrg * 9401e04c3fSmrg * m = compute_unsigned_magic_info(D, num_bits) 9501e04c3fSmrg * if m.pre_shift > 0: emit("n >>>= m.pre_shift") 9601e04c3fSmrg * emit("result = m.multiplier * n") 9701e04c3fSmrg * if m.increment: emit("result = result + m.multiplier") 9801e04c3fSmrg * emit("result >>>= UINT_BITS") 9901e04c3fSmrg * if m.post_shift > 0: emit("result >>>= m.post_shift") 10001e04c3fSmrg * 10101e04c3fSmrg * This second version works even if D is 1. The shifts by UINT_BITS may be 10201e04c3fSmrg * "free" if the high half of the full multiply is put in a separate register. 10301e04c3fSmrg * 10401e04c3fSmrg * saturated_increment(n) means "increment n unless it would wrap to 0," i.e. 10501e04c3fSmrg * if n == (1 << UINT_BITS)-1: result = n 10601e04c3fSmrg * else: result = n+1 10701e04c3fSmrg * A common way to implement this is with the carry bit. For example, on x86: 10801e04c3fSmrg * add 1 10901e04c3fSmrg * sbb 0 11001e04c3fSmrg * 11101e04c3fSmrg * Some invariants: 11201e04c3fSmrg * 1: At least one of pre_shift and increment is zero 11301e04c3fSmrg * 2: multiplier is never zero 11401e04c3fSmrg * 11501e04c3fSmrg * This code incorporates the "round down" optimization per ridiculous_fish. 11601e04c3fSmrg */ 11701e04c3fSmrg 11801e04c3fSmrgstruct util_fast_udiv_info { 11901e04c3fSmrg uint64_t multiplier; /* the "magic number" multiplier */ 12001e04c3fSmrg unsigned pre_shift; /* shift for the dividend before multiplying */ 12101e04c3fSmrg unsigned post_shift; /* shift for the dividend after multiplying */ 12201e04c3fSmrg int increment; /* 0 or 1; if set then increment the numerator, using one of 12301e04c3fSmrg the two strategies */ 12401e04c3fSmrg}; 12501e04c3fSmrg 12601e04c3fSmrgstruct util_fast_udiv_info 12701e04c3fSmrgutil_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS); 12801e04c3fSmrg 12901e04c3fSmrg/* Below are possible options for dividing by a uniform in a shader where 13001e04c3fSmrg * the divisor is constant but not known at compile time. 13101e04c3fSmrg */ 13201e04c3fSmrg 13301e04c3fSmrg/* Full version. */ 13401e04c3fSmrgstatic inline uint32_t 13501e04c3fSmrgutil_fast_udiv32(uint32_t n, struct util_fast_udiv_info info) 13601e04c3fSmrg{ 13701e04c3fSmrg n = n >> info.pre_shift; 13801e04c3fSmrg /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps 13901e04c3fSmrg * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD. 14001e04c3fSmrg * 14101e04c3fSmrg * If you have unsigned 64-bit MAD with 32-bit inputs, you can do: 14201e04c3fSmrg * increment = increment ? multiplier : 0; // on the CPU 14301e04c3fSmrg * (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD 14401e04c3fSmrg */ 14501e04c3fSmrg n = (((uint64_t)n + info.increment) * info.multiplier) >> 32; 14601e04c3fSmrg n = n >> info.post_shift; 14701e04c3fSmrg return n; 14801e04c3fSmrg} 14901e04c3fSmrg 15001e04c3fSmrg/* A little more efficient version if n != UINT_MAX, i.e. no unsigned 15101e04c3fSmrg * wraparound in the computation. 15201e04c3fSmrg */ 15301e04c3fSmrgstatic inline uint32_t 15401e04c3fSmrgutil_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info) 15501e04c3fSmrg{ 15601e04c3fSmrg assert(n != UINT32_MAX); 15701e04c3fSmrg n = n >> info.pre_shift; 15801e04c3fSmrg n = n + info.increment; 15901e04c3fSmrg n = ((uint64_t)n * info.multiplier) >> 32; 16001e04c3fSmrg n = n >> info.post_shift; 16101e04c3fSmrg return n; 16201e04c3fSmrg} 16301e04c3fSmrg 16401e04c3fSmrg/* Even faster version but both operands must be 31-bit unsigned integers 16501e04c3fSmrg * and the divisor must be greater than 1. 16601e04c3fSmrg * 16701e04c3fSmrg * info must be computed with num_bits == 31. 16801e04c3fSmrg */ 16901e04c3fSmrgstatic inline uint32_t 17001e04c3fSmrgutil_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info) 17101e04c3fSmrg{ 17201e04c3fSmrg assert(info.pre_shift == 0); 17301e04c3fSmrg assert(info.increment == 0); 17401e04c3fSmrg n = ((uint64_t)n * info.multiplier) >> 32; 17501e04c3fSmrg n = n >> info.post_shift; 17601e04c3fSmrg return n; 17701e04c3fSmrg} 17801e04c3fSmrg 17901e04c3fSmrg#ifdef __cplusplus 18001e04c3fSmrg} /* extern C */ 18101e04c3fSmrg#endif 18201e04c3fSmrg 18301e04c3fSmrg#endif /* FAST_IDIV_BY_CONST_H */ 184