101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2018 Advanced Micro Devices, Inc.
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2101e04c3fSmrg * IN THE SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#ifndef FAST_IDIV_BY_CONST_H
2501e04c3fSmrg#define FAST_IDIV_BY_CONST_H
2601e04c3fSmrg
2701e04c3fSmrg/* Imported from:
2801e04c3fSmrg *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
2901e04c3fSmrg */
3001e04c3fSmrg
3101e04c3fSmrg#include <inttypes.h>
3201e04c3fSmrg#include <limits.h>
3301e04c3fSmrg#include <assert.h>
3401e04c3fSmrg
3501e04c3fSmrg#ifdef __cplusplus
3601e04c3fSmrgextern "C" {
3701e04c3fSmrg#endif
3801e04c3fSmrg
3901e04c3fSmrg/* Computes "magic info" for performing signed division by a fixed integer D.
4001e04c3fSmrg * The type 'sint_t' is assumed to be defined as a signed integer type large
4101e04c3fSmrg * enough to hold both the dividend and the divisor.
4201e04c3fSmrg * Here >> is arithmetic (signed) shift, and >>> is logical shift.
4301e04c3fSmrg *
4401e04c3fSmrg * To emit code for n/d, rounding towards zero, use the following sequence:
4501e04c3fSmrg *
4601e04c3fSmrg *   m = compute_signed_magic_info(D)
4701e04c3fSmrg *   emit("result = (m.multiplier * n) >> SINT_BITS");
4801e04c3fSmrg *   if d > 0 and m.multiplier < 0: emit("result += n")
4901e04c3fSmrg *   if d < 0 and m.multiplier > 0: emit("result -= n")
5001e04c3fSmrg *   if m.post_shift > 0: emit("result >>= m.shift")
5101e04c3fSmrg *   emit("result += (result < 0)")
5201e04c3fSmrg *
5301e04c3fSmrg * The shifts by SINT_BITS may be "free" if the high half of the full multiply
5401e04c3fSmrg * is put in a separate register.
5501e04c3fSmrg *
5601e04c3fSmrg * The final add can of course be implemented via the sign bit, e.g.
5701e04c3fSmrg *    result += (result >>> (SINT_BITS - 1))
5801e04c3fSmrg * or
5901e04c3fSmrg *    result -= (result >> (SINT_BITS - 1))
6001e04c3fSmrg *
6101e04c3fSmrg * This code is heavily indebted to Hacker's Delight by Henry Warren.
6201e04c3fSmrg * See http://www.hackersdelight.org/HDcode/magic.c.txt
6301e04c3fSmrg * Used with permission from http://www.hackersdelight.org/permissions.htm
6401e04c3fSmrg */
6501e04c3fSmrg
6601e04c3fSmrgstruct util_fast_sdiv_info {
6701e04c3fSmrg   int64_t multiplier; /* the "magic number" multiplier */
6801e04c3fSmrg   unsigned shift; /* shift for the dividend after multiplying */
6901e04c3fSmrg};
7001e04c3fSmrg
7101e04c3fSmrgstruct util_fast_sdiv_info
7201e04c3fSmrgutil_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);
7301e04c3fSmrg
7401e04c3fSmrg/* Computes "magic info" for performing unsigned division by a fixed positive
7501e04c3fSmrg * integer D.  UINT_BITS is the bit size at which the final "magic"
7601e04c3fSmrg * calculation will be performed; it is assumed to be large enough to hold
7701e04c3fSmrg * both the dividand and the divisor.  num_bits can be set appropriately if n
7801e04c3fSmrg * is known to be smaller than calc_bits; if this is not known then UINT_BITS
7901e04c3fSmrg * for num_bits.
8001e04c3fSmrg *
8101e04c3fSmrg * Assume we have a hardware register of width UINT_BITS, a known constant D
8201e04c3fSmrg * which is not zero and not a power of 2, and a variable n of width num_bits
8301e04c3fSmrg * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
8401e04c3fSmrg * following sequences (here >>> refers to a logical bitshift):
8501e04c3fSmrg *
8601e04c3fSmrg *   m = compute_unsigned_magic_info(D, num_bits)
8701e04c3fSmrg *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
8801e04c3fSmrg *   if m.increment: emit("n = saturated_increment(n)")
8901e04c3fSmrg *   emit("result = (m.multiplier * n) >>> UINT_BITS")
9001e04c3fSmrg *   if m.post_shift > 0: emit("result >>>= m.post_shift")
9101e04c3fSmrg *
9201e04c3fSmrg * or
9301e04c3fSmrg *
9401e04c3fSmrg *   m = compute_unsigned_magic_info(D, num_bits)
9501e04c3fSmrg *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
9601e04c3fSmrg *   emit("result = m.multiplier * n")
9701e04c3fSmrg *   if m.increment: emit("result = result + m.multiplier")
9801e04c3fSmrg *   emit("result >>>= UINT_BITS")
9901e04c3fSmrg *   if m.post_shift > 0: emit("result >>>= m.post_shift")
10001e04c3fSmrg *
10101e04c3fSmrg * This second version works even if D is 1.  The shifts by UINT_BITS may be
10201e04c3fSmrg * "free" if the high half of the full multiply is put in a separate register.
10301e04c3fSmrg *
10401e04c3fSmrg * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
10501e04c3fSmrg *   if n == (1 << UINT_BITS)-1: result = n
10601e04c3fSmrg *   else: result = n+1
10701e04c3fSmrg * A common way to implement this is with the carry bit. For example, on x86:
10801e04c3fSmrg *   add 1
10901e04c3fSmrg *   sbb 0
11001e04c3fSmrg *
11101e04c3fSmrg * Some invariants:
11201e04c3fSmrg *   1: At least one of pre_shift and increment is zero
11301e04c3fSmrg *   2: multiplier is never zero
11401e04c3fSmrg *
11501e04c3fSmrg * This code incorporates the "round down" optimization per ridiculous_fish.
11601e04c3fSmrg */
11701e04c3fSmrg
11801e04c3fSmrgstruct util_fast_udiv_info {
11901e04c3fSmrg   uint64_t multiplier; /* the "magic number" multiplier */
12001e04c3fSmrg   unsigned pre_shift; /* shift for the dividend before multiplying */
12101e04c3fSmrg   unsigned post_shift; /* shift for the dividend after multiplying */
12201e04c3fSmrg   int increment; /* 0 or 1; if set then increment the numerator, using one of
12301e04c3fSmrg                     the two strategies */
12401e04c3fSmrg};
12501e04c3fSmrg
12601e04c3fSmrgstruct util_fast_udiv_info
12701e04c3fSmrgutil_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);
12801e04c3fSmrg
12901e04c3fSmrg/* Below are possible options for dividing by a uniform in a shader where
13001e04c3fSmrg * the divisor is constant but not known at compile time.
13101e04c3fSmrg */
13201e04c3fSmrg
13301e04c3fSmrg/* Full version. */
13401e04c3fSmrgstatic inline uint32_t
13501e04c3fSmrgutil_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
13601e04c3fSmrg{
13701e04c3fSmrg   n = n >> info.pre_shift;
13801e04c3fSmrg   /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
13901e04c3fSmrg    * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
14001e04c3fSmrg    *
14101e04c3fSmrg    * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
14201e04c3fSmrg    *    increment = increment ? multiplier : 0; // on the CPU
14301e04c3fSmrg    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
14401e04c3fSmrg    */
14501e04c3fSmrg   n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
14601e04c3fSmrg   n = n >> info.post_shift;
14701e04c3fSmrg   return n;
14801e04c3fSmrg}
14901e04c3fSmrg
15001e04c3fSmrg/* A little more efficient version if n != UINT_MAX, i.e. no unsigned
15101e04c3fSmrg * wraparound in the computation.
15201e04c3fSmrg */
15301e04c3fSmrgstatic inline uint32_t
15401e04c3fSmrgutil_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
15501e04c3fSmrg{
15601e04c3fSmrg   assert(n != UINT32_MAX);
15701e04c3fSmrg   n = n >> info.pre_shift;
15801e04c3fSmrg   n = n + info.increment;
15901e04c3fSmrg   n = ((uint64_t)n * info.multiplier) >> 32;
16001e04c3fSmrg   n = n >> info.post_shift;
16101e04c3fSmrg   return n;
16201e04c3fSmrg}
16301e04c3fSmrg
16401e04c3fSmrg/* Even faster version but both operands must be 31-bit unsigned integers
16501e04c3fSmrg * and the divisor must be greater than 1.
16601e04c3fSmrg *
16701e04c3fSmrg * info must be computed with num_bits == 31.
16801e04c3fSmrg */
16901e04c3fSmrgstatic inline uint32_t
17001e04c3fSmrgutil_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
17101e04c3fSmrg{
17201e04c3fSmrg   assert(info.pre_shift == 0);
17301e04c3fSmrg   assert(info.increment == 0);
17401e04c3fSmrg   n = ((uint64_t)n * info.multiplier) >> 32;
17501e04c3fSmrg   n = n >> info.post_shift;
17601e04c3fSmrg   return n;
17701e04c3fSmrg}
17801e04c3fSmrg
17901e04c3fSmrg#ifdef __cplusplus
18001e04c3fSmrg} /* extern C */
18101e04c3fSmrg#endif
18201e04c3fSmrg
18301e04c3fSmrg#endif /* FAST_IDIV_BY_CONST_H */
184