1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2018 Advanced Micro Devices, Inc.
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg#ifndef FAST_IDIV_BY_CONST_H
25b8e80941Smrg#define FAST_IDIV_BY_CONST_H
26b8e80941Smrg
27b8e80941Smrg/* Imported from:
28b8e80941Smrg *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
29b8e80941Smrg */
30b8e80941Smrg
31b8e80941Smrg#include <inttypes.h>
32b8e80941Smrg#include <limits.h>
33b8e80941Smrg#include <assert.h>
34b8e80941Smrg
35b8e80941Smrg#ifdef __cplusplus
36b8e80941Smrgextern "C" {
37b8e80941Smrg#endif
38b8e80941Smrg
39b8e80941Smrg/* Computes "magic info" for performing signed division by a fixed integer D.
40b8e80941Smrg * The type 'sint_t' is assumed to be defined as a signed integer type large
41b8e80941Smrg * enough to hold both the dividend and the divisor.
42b8e80941Smrg * Here >> is arithmetic (signed) shift, and >>> is logical shift.
43b8e80941Smrg *
44b8e80941Smrg * To emit code for n/d, rounding towards zero, use the following sequence:
45b8e80941Smrg *
46b8e80941Smrg *   m = compute_signed_magic_info(D)
47b8e80941Smrg *   emit("result = (m.multiplier * n) >> SINT_BITS");
48b8e80941Smrg *   if d > 0 and m.multiplier < 0: emit("result += n")
49b8e80941Smrg *   if d < 0 and m.multiplier > 0: emit("result -= n")
50b8e80941Smrg *   if m.post_shift > 0: emit("result >>= m.shift")
51b8e80941Smrg *   emit("result += (result < 0)")
52b8e80941Smrg *
53b8e80941Smrg * The shifts by SINT_BITS may be "free" if the high half of the full multiply
54b8e80941Smrg * is put in a separate register.
55b8e80941Smrg *
56b8e80941Smrg * The final add can of course be implemented via the sign bit, e.g.
57b8e80941Smrg *    result += (result >>> (SINT_BITS - 1))
58b8e80941Smrg * or
59b8e80941Smrg *    result -= (result >> (SINT_BITS - 1))
60b8e80941Smrg *
61b8e80941Smrg * This code is heavily indebted to Hacker's Delight by Henry Warren.
62b8e80941Smrg * See http://www.hackersdelight.org/HDcode/magic.c.txt
63b8e80941Smrg * Used with permission from http://www.hackersdelight.org/permissions.htm
64b8e80941Smrg */
65b8e80941Smrg
66b8e80941Smrgstruct util_fast_sdiv_info {
67b8e80941Smrg   int64_t multiplier; /* the "magic number" multiplier */
68b8e80941Smrg   unsigned shift; /* shift for the dividend after multiplying */
69b8e80941Smrg};
70b8e80941Smrg
71b8e80941Smrgstruct util_fast_sdiv_info
72b8e80941Smrgutil_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);
73b8e80941Smrg
74b8e80941Smrg/* Computes "magic info" for performing unsigned division by a fixed positive
75b8e80941Smrg * integer D.  UINT_BITS is the bit size at which the final "magic"
76b8e80941Smrg * calculation will be performed; it is assumed to be large enough to hold
77b8e80941Smrg * both the dividand and the divisor.  num_bits can be set appropriately if n
78b8e80941Smrg * is known to be smaller than calc_bits; if this is not known then UINT_BITS
79b8e80941Smrg * for num_bits.
80b8e80941Smrg *
81b8e80941Smrg * Assume we have a hardware register of width UINT_BITS, a known constant D
82b8e80941Smrg * which is not zero and not a power of 2, and a variable n of width num_bits
83b8e80941Smrg * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
84b8e80941Smrg * following sequences (here >>> refers to a logical bitshift):
85b8e80941Smrg *
86b8e80941Smrg *   m = compute_unsigned_magic_info(D, num_bits)
87b8e80941Smrg *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
88b8e80941Smrg *   if m.increment: emit("n = saturated_increment(n)")
89b8e80941Smrg *   emit("result = (m.multiplier * n) >>> UINT_BITS")
90b8e80941Smrg *   if m.post_shift > 0: emit("result >>>= m.post_shift")
91b8e80941Smrg *
92b8e80941Smrg * or
93b8e80941Smrg *
94b8e80941Smrg *   m = compute_unsigned_magic_info(D, num_bits)
95b8e80941Smrg *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
96b8e80941Smrg *   emit("result = m.multiplier * n")
97b8e80941Smrg *   if m.increment: emit("result = result + m.multiplier")
98b8e80941Smrg *   emit("result >>>= UINT_BITS")
99b8e80941Smrg *   if m.post_shift > 0: emit("result >>>= m.post_shift")
100b8e80941Smrg *
101b8e80941Smrg * This second version works even if D is 1.  The shifts by UINT_BITS may be
102b8e80941Smrg * "free" if the high half of the full multiply is put in a separate register.
103b8e80941Smrg *
104b8e80941Smrg * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
105b8e80941Smrg *   if n == (1 << UINT_BITS)-1: result = n
106b8e80941Smrg *   else: result = n+1
107b8e80941Smrg * A common way to implement this is with the carry bit. For example, on x86:
108b8e80941Smrg *   add 1
109b8e80941Smrg *   sbb 0
110b8e80941Smrg *
111b8e80941Smrg * Some invariants:
112b8e80941Smrg *   1: At least one of pre_shift and increment is zero
113b8e80941Smrg *   2: multiplier is never zero
114b8e80941Smrg *
115b8e80941Smrg * This code incorporates the "round down" optimization per ridiculous_fish.
116b8e80941Smrg */
117b8e80941Smrg
118b8e80941Smrgstruct util_fast_udiv_info {
119b8e80941Smrg   uint64_t multiplier; /* the "magic number" multiplier */
120b8e80941Smrg   unsigned pre_shift; /* shift for the dividend before multiplying */
121b8e80941Smrg   unsigned post_shift; /* shift for the dividend after multiplying */
122b8e80941Smrg   int increment; /* 0 or 1; if set then increment the numerator, using one of
123b8e80941Smrg                     the two strategies */
124b8e80941Smrg};
125b8e80941Smrg
126b8e80941Smrgstruct util_fast_udiv_info
127b8e80941Smrgutil_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);
128b8e80941Smrg
129b8e80941Smrg/* Below are possible options for dividing by a uniform in a shader where
130b8e80941Smrg * the divisor is constant but not known at compile time.
131b8e80941Smrg */
132b8e80941Smrg
133b8e80941Smrg/* Full version. */
134b8e80941Smrgstatic inline uint32_t
135b8e80941Smrgutil_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
136b8e80941Smrg{
137b8e80941Smrg   n = n >> info.pre_shift;
138b8e80941Smrg   /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
139b8e80941Smrg    * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
140b8e80941Smrg    *
141b8e80941Smrg    * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
142b8e80941Smrg    *    increment = increment ? multiplier : 0; // on the CPU
143b8e80941Smrg    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
144b8e80941Smrg    */
145b8e80941Smrg   n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
146b8e80941Smrg   n = n >> info.post_shift;
147b8e80941Smrg   return n;
148b8e80941Smrg}
149b8e80941Smrg
150b8e80941Smrg/* A little more efficient version if n != UINT_MAX, i.e. no unsigned
151b8e80941Smrg * wraparound in the computation.
152b8e80941Smrg */
153b8e80941Smrgstatic inline uint32_t
154b8e80941Smrgutil_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
155b8e80941Smrg{
156b8e80941Smrg   assert(n != UINT32_MAX);
157b8e80941Smrg   n = n >> info.pre_shift;
158b8e80941Smrg   n = n + info.increment;
159b8e80941Smrg   n = ((uint64_t)n * info.multiplier) >> 32;
160b8e80941Smrg   n = n >> info.post_shift;
161b8e80941Smrg   return n;
162b8e80941Smrg}
163b8e80941Smrg
164b8e80941Smrg/* Even faster version but both operands must be 31-bit unsigned integers
165b8e80941Smrg * and the divisor must be greater than 1.
166b8e80941Smrg *
167b8e80941Smrg * info must be computed with num_bits == 31.
168b8e80941Smrg */
169b8e80941Smrgstatic inline uint32_t
170b8e80941Smrgutil_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
171b8e80941Smrg{
172b8e80941Smrg   assert(info.pre_shift == 0);
173b8e80941Smrg   assert(info.increment == 0);
174b8e80941Smrg   n = ((uint64_t)n * info.multiplier) >> 32;
175b8e80941Smrg   n = n >> info.post_shift;
176b8e80941Smrg   return n;
177b8e80941Smrg}
178b8e80941Smrg
179b8e80941Smrg#ifdef __cplusplus
180b8e80941Smrg} /* extern C */
181b8e80941Smrg#endif
182b8e80941Smrg
183b8e80941Smrg#endif /* FAST_IDIV_BY_CONST_H */
184