/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

27b8e80941Smrgstatic nir_ssa_def *
28b8e80941Smrglower_b2i64(nir_builder *b, nir_ssa_def *x)
29b8e80941Smrg{
30b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
31b8e80941Smrg}
32b8e80941Smrg
33b8e80941Smrgstatic nir_ssa_def *
34b8e80941Smrglower_i2b(nir_builder *b, nir_ssa_def *x)
35b8e80941Smrg{
36b8e80941Smrg   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
37b8e80941Smrg                                nir_unpack_64_2x32_split_y(b, x)),
38b8e80941Smrg                     nir_imm_int(b, 0));
39b8e80941Smrg}
40b8e80941Smrg
41b8e80941Smrgstatic nir_ssa_def *
42b8e80941Smrglower_i2i8(nir_builder *b, nir_ssa_def *x)
43b8e80941Smrg{
44b8e80941Smrg   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
45b8e80941Smrg}
46b8e80941Smrg
47b8e80941Smrgstatic nir_ssa_def *
48b8e80941Smrglower_i2i16(nir_builder *b, nir_ssa_def *x)
49b8e80941Smrg{
50b8e80941Smrg   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
51b8e80941Smrg}
52b8e80941Smrg
53b8e80941Smrg
54b8e80941Smrgstatic nir_ssa_def *
55b8e80941Smrglower_i2i32(nir_builder *b, nir_ssa_def *x)
56b8e80941Smrg{
57b8e80941Smrg   return nir_unpack_64_2x32_split_x(b, x);
58b8e80941Smrg}
59b8e80941Smrg
60b8e80941Smrgstatic nir_ssa_def *
61b8e80941Smrglower_i2i64(nir_builder *b, nir_ssa_def *x)
62b8e80941Smrg{
63b8e80941Smrg   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
64b8e80941Smrg   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
65b8e80941Smrg}
66b8e80941Smrg
67b8e80941Smrgstatic nir_ssa_def *
68b8e80941Smrglower_u2u8(nir_builder *b, nir_ssa_def *x)
69b8e80941Smrg{
70b8e80941Smrg   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
71b8e80941Smrg}
72b8e80941Smrg
73b8e80941Smrgstatic nir_ssa_def *
74b8e80941Smrglower_u2u16(nir_builder *b, nir_ssa_def *x)
75b8e80941Smrg{
76b8e80941Smrg   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
77b8e80941Smrg}
78b8e80941Smrg
79b8e80941Smrgstatic nir_ssa_def *
80b8e80941Smrglower_u2u32(nir_builder *b, nir_ssa_def *x)
81b8e80941Smrg{
82b8e80941Smrg   return nir_unpack_64_2x32_split_x(b, x);
83b8e80941Smrg}
84b8e80941Smrg
85b8e80941Smrgstatic nir_ssa_def *
86b8e80941Smrglower_u2u64(nir_builder *b, nir_ssa_def *x)
87b8e80941Smrg{
88b8e80941Smrg   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
89b8e80941Smrg   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
90b8e80941Smrg}
91b8e80941Smrg
92b8e80941Smrgstatic nir_ssa_def *
93b8e80941Smrglower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
94b8e80941Smrg{
95b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
96b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
97b8e80941Smrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
98b8e80941Smrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
99b8e80941Smrg
100b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
101b8e80941Smrg                                    nir_bcsel(b, cond, x_hi, y_hi));
102b8e80941Smrg}
103b8e80941Smrg
104b8e80941Smrgstatic nir_ssa_def *
105b8e80941Smrglower_inot64(nir_builder *b, nir_ssa_def *x)
106b8e80941Smrg{
107b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
108b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
109b8e80941Smrg
110b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
111b8e80941Smrg}
112b8e80941Smrg
113b8e80941Smrgstatic nir_ssa_def *
114b8e80941Smrglower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
115b8e80941Smrg{
116b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
117b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
118b8e80941Smrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
119b8e80941Smrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
120b8e80941Smrg
121b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
122b8e80941Smrg                                    nir_iand(b, x_hi, y_hi));
123b8e80941Smrg}
124b8e80941Smrg
125b8e80941Smrgstatic nir_ssa_def *
126b8e80941Smrglower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
127b8e80941Smrg{
128b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
129b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
130b8e80941Smrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
131b8e80941Smrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
132b8e80941Smrg
133b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
134b8e80941Smrg                                    nir_ior(b, x_hi, y_hi));
135b8e80941Smrg}
136b8e80941Smrg
137b8e80941Smrgstatic nir_ssa_def *
138b8e80941Smrglower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
139b8e80941Smrg{
140b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
141b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
142b8e80941Smrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
143b8e80941Smrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
144b8e80941Smrg
145b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
146b8e80941Smrg                                    nir_ixor(b, x_hi, y_hi));
147b8e80941Smrg}
148b8e80941Smrg
static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   /* |32 - c|: the count used to move bits across the dword boundary.  The
    * iabs makes the one expression valid for both the c < 32 and c >= 32
    * branches selected below (see the pseudocode above).
    */
   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   /* Low bits that cross into the high dword when c < 32. */
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                                nir_ior(b, hi_shifted, lo_shifted_hi));
   /* For c >= 32 the low dword is all zeros and the old low dword lands in
    * the high dword, shifted left by c - 32 (== reverse_count here).
    */
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                                nir_ishl(b, x_lo, reverse_count));

   /* c == 0 is special-cased up front, matching the pseudocode, because the
    * branch computations above do not cover it.
    */
   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}
191b8e80941Smrg
static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   /* |32 - c|: the count used to move bits across the dword boundary; the
    * iabs makes the one expression valid for both branches below.
    */
   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   /* Low dword shifts logically; only the high dword carries the sign. */
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   /* High bits that cross into the low dword when c < 32. */
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   /* For c >= 32 the low dword receives hi >> (c - 32) and the high dword
    * is filled with copies of the sign bit.
    */
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   /* c == 0 is special-cased up front, matching the pseudocode. */
   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}
236b8e80941Smrg
static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */

   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   /* |32 - c|: the count used to move bits across the dword boundary; the
    * iabs makes the one expression valid for both branches below.
    */
   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   /* High bits that cross into the low dword when c < 32. */
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   /* For c >= 32 the low dword receives hi >> (c - 32) and the high dword
    * is zero-filled.
    */
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                                nir_imm_int(b, 0));

   /* c == 0 is special-cased up front, matching the pseudocode. */
   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}
280b8e80941Smrg
static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* 64-bit add built from 32-bit adds: add the low dwords, then add the
    * high dwords plus the carry out of the low add.
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   /* Carry detection: an unsigned add wrapped iff the result is smaller
    * than either operand, so res_lo < x_lo yields the carry bit.
    */
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
295b8e80941Smrg
static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* 64-bit subtract built from 32-bit subtracts: subtract the low dwords,
    * then subtract the high dwords along with the borrow out of the low
    * subtract.
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   /* Borrow detection: the low subtract wrapped iff x_lo < y_lo
    * (unsigned); encode the borrow as 0 or -1 so it can simply be added
    * into the high result.
    */
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
310b8e80941Smrg
311b8e80941Smrgstatic nir_ssa_def *
312b8e80941Smrglower_ineg64(nir_builder *b, nir_ssa_def *x)
313b8e80941Smrg{
314b8e80941Smrg   /* Since isub is the same number of instructions (with better dependencies)
315b8e80941Smrg    * as iadd, subtraction is actually more efficient for ineg than the usual
316b8e80941Smrg    * 2's complement "flip the bits and add one".
317b8e80941Smrg    */
318b8e80941Smrg   return lower_isub64(b, nir_imm_int64(b, 0), x);
319b8e80941Smrg}
320b8e80941Smrg
static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   /* abs(x) = x < 0 ? -x : x.  The sign bit lives in the high dword, so
    * only that half needs testing.  The nir_ineg emitted here is a 64-bit
    * op — NOTE(review): presumably lowered in turn by a later pass over
    * the newly emitted instructions; confirm against the pass driver.
    */
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}
328b8e80941Smrg
329b8e80941Smrgstatic nir_ssa_def *
330b8e80941Smrglower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
331b8e80941Smrg{
332b8e80941Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
333b8e80941Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
334b8e80941Smrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
335b8e80941Smrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
336b8e80941Smrg
337b8e80941Smrg   switch (op) {
338b8e80941Smrg   case nir_op_ieq:
339b8e80941Smrg      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
340b8e80941Smrg   case nir_op_ine:
341b8e80941Smrg      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
342b8e80941Smrg   case nir_op_ult:
343b8e80941Smrg      return nir_ior(b, nir_ult(b, x_hi, y_hi),
344b8e80941Smrg                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
345b8e80941Smrg                                    nir_ult(b, x_lo, y_lo)));
346b8e80941Smrg   case nir_op_ilt:
347b8e80941Smrg      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
348b8e80941Smrg                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
349b8e80941Smrg                                    nir_ult(b, x_lo, y_lo)));
350b8e80941Smrg      break;
351b8e80941Smrg   case nir_op_uge:
352b8e80941Smrg      /* Lower as !(x < y) in the hopes of better CSE */
353b8e80941Smrg      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
354b8e80941Smrg   case nir_op_ige:
355b8e80941Smrg      /* Lower as !(x < y) in the hopes of better CSE */
356b8e80941Smrg      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
357b8e80941Smrg   default:
358b8e80941Smrg      unreachable("Invalid comparison");
359b8e80941Smrg   }
360b8e80941Smrg}
361b8e80941Smrg
362b8e80941Smrgstatic nir_ssa_def *
363b8e80941Smrglower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
364b8e80941Smrg{
365b8e80941Smrg   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
366b8e80941Smrg}
367b8e80941Smrg
368b8e80941Smrgstatic nir_ssa_def *
369b8e80941Smrglower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
370b8e80941Smrg{
371b8e80941Smrg   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
372b8e80941Smrg}
373b8e80941Smrg
374b8e80941Smrgstatic nir_ssa_def *
375b8e80941Smrglower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
376b8e80941Smrg{
377b8e80941Smrg   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
378b8e80941Smrg}
379b8e80941Smrg
380b8e80941Smrgstatic nir_ssa_def *
381b8e80941Smrglower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
382b8e80941Smrg{
383b8e80941Smrg   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
384b8e80941Smrg}
385b8e80941Smrg
386b8e80941Smrgstatic nir_ssa_def *
387b8e80941Smrglower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
388b8e80941Smrg                  bool sign_extend)
389b8e80941Smrg{
390b8e80941Smrg   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
391b8e80941Smrg                                     : nir_umul_high(b, x, y);
392b8e80941Smrg
393b8e80941Smrg   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
394b8e80941Smrg}
395b8e80941Smrg
static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* 64x64 multiply (low 64 bits) from 32-bit pieces:
    *
    *    x * y mod 2^64 = (x_lo * y_lo) + ((x_lo * y_hi + x_hi * y_lo) << 32)
    *
    * The x_hi * y_hi term shifts entirely out of the low 64 bits, so it is
    * never computed.
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   /* Full 64-bit product of the low dwords; its high half feeds res_hi. */
   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                         nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                     nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                 res_hi);
}
412b8e80941Smrg
/* Compute the high 64 bits of the full 64x64 product of x and y via
 * schoolbook multiplication on 32-bit digits.  Each operand is expanded to
 * four digits (x32[0..3], y32[0..3]); for signed multiplies the upper two
 * digits replicate the sign bit, for unsigned they are zero.  res[2] and
 * res[3] end up holding bits [64,128) of the product.
 */
static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   /* 32-bit digits of the running product, least significant first. */
   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[i]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         /* High half of tmp carries into the next digit position. */
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}
469b8e80941Smrg
static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   /* isign(x): -1 if x < 0, 0 if x == 0, 1 if x > 0.
    *
    * res_hi replicates the sign bit of the high dword (0 or -1).  ORing a
    * 1 into the low dword when x is non-zero yields the three results:
    * negative -> (-1, -1|1) == -1, zero -> (0, 0), positive -> (0, 1).
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
482b8e80941Smrg
/* Lower a 64-bit unsigned division to 32-bit operations using binary long
 * division.  On return *q holds n / d and *r holds n % d.  The first
 * (conditional) loop computes the high dword of the quotient; the second
 * loop computes the low dword and leaves the remainder behind in n.
 */
static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   /* Quotient accumulators, one dword each, built up bit by bit. */
   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   /* Saved so the phis after the if below can merge in the untouched
    * values for channels that skip the high division.
    */
   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   /* Re-pack and run the low-dword long division on full 64-bit values;
    * after the loop n holds the remainder.
    */
   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}
573b8e80941Smrg
574b8e80941Smrgstatic nir_ssa_def *
575b8e80941Smrglower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
576b8e80941Smrg{
577b8e80941Smrg   nir_ssa_def *q, *r;
578b8e80941Smrg   lower_udiv64_mod64(b, n, d, &q, &r);
579b8e80941Smrg   return q;
580b8e80941Smrg}
581b8e80941Smrg
static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   /* Signed division via the unsigned helper: divide |n| by |d| and negate
    * the quotient when the operands' signs differ.  The sign bit lives in
    * the high dword of each value, so only those halves are tested.
    */
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   /* ine of the two sign tests == XOR: negate iff exactly one is negative. */
   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}
594b8e80941Smrg
595b8e80941Smrgstatic nir_ssa_def *
596b8e80941Smrglower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
597b8e80941Smrg{
598b8e80941Smrg   nir_ssa_def *q, *r;
599b8e80941Smrg   lower_udiv64_mod64(b, n, d, &q, &r);
600b8e80941Smrg   return r;
601b8e80941Smrg}
602b8e80941Smrg
static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   /* Signed modulo whose non-zero result takes the sign of the divisor d:
    * compute |n| % |d|, give it the sign of n, then — when the operand
    * signs differ and the remainder is non-zero — add d to move the result
    * into the divisor's sign range.
    */
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   /* |n| % |d| carrying the numerator's sign. */
   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   /* r == 0 is special-cased so the sign adjustment never turns an exact
    * multiple into d.
    */
   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
          nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                       nir_iadd(b, rem, d)));
}
620b8e80941Smrg
static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   /* Truncated-division remainder: |n| % |d| carrying the sign of the
    * numerator n (the sign bit lives in n's high dword).
    */
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}
631b8e80941Smrg
static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   /* Lower extract_[ui](8|16) on a 64-bit source: select the dword that
    * contains the requested chunk, run the 32-bit extract on it, then widen
    * the result back to 64 bits with matching signedness.
    */
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   /* The chunk index is read as a compile-time constant. */
   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      /* Chunk lives in the low dword. */
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                   nir_imm_int(b, chunk),
                                   NULL, NULL);
   } else {
      /* Chunk lives in the high dword; rebase the index. */
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                   nir_imm_int(b, chunk - num_chunks_in_32),
                                   NULL, NULL);
   }

   /* Signed extracts sign-extend to 64 bits; unsigned ones zero-extend. */
   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}
659b8e80941Smrg
660b8e80941Smrgnir_lower_int64_options
661b8e80941Smrgnir_lower_int64_op_to_options_mask(nir_op opcode)
662b8e80941Smrg{
663b8e80941Smrg   switch (opcode) {
664b8e80941Smrg   case nir_op_imul:
665b8e80941Smrg      return nir_lower_imul64;
666b8e80941Smrg   case nir_op_imul_2x32_64:
667b8e80941Smrg   case nir_op_umul_2x32_64:
668b8e80941Smrg      return nir_lower_imul_2x32_64;
669b8e80941Smrg   case nir_op_imul_high:
670b8e80941Smrg   case nir_op_umul_high:
671b8e80941Smrg      return nir_lower_imul_high64;
672b8e80941Smrg   case nir_op_isign:
673b8e80941Smrg      return nir_lower_isign64;
674b8e80941Smrg   case nir_op_udiv:
675b8e80941Smrg   case nir_op_idiv:
676b8e80941Smrg   case nir_op_umod:
677b8e80941Smrg   case nir_op_imod:
678b8e80941Smrg   case nir_op_irem:
679b8e80941Smrg      return nir_lower_divmod64;
680b8e80941Smrg   case nir_op_b2i64:
681b8e80941Smrg   case nir_op_i2b1:
682b8e80941Smrg   case nir_op_i2i32:
683b8e80941Smrg   case nir_op_i2i64:
684b8e80941Smrg   case nir_op_u2u32:
685b8e80941Smrg   case nir_op_u2u64:
686b8e80941Smrg   case nir_op_bcsel:
687b8e80941Smrg      return nir_lower_mov64;
688b8e80941Smrg   case nir_op_ieq:
689b8e80941Smrg   case nir_op_ine:
690b8e80941Smrg   case nir_op_ult:
691b8e80941Smrg   case nir_op_ilt:
692b8e80941Smrg   case nir_op_uge:
693b8e80941Smrg   case nir_op_ige:
694b8e80941Smrg      return nir_lower_icmp64;
695b8e80941Smrg   case nir_op_iadd:
696b8e80941Smrg   case nir_op_isub:
697b8e80941Smrg      return nir_lower_iadd64;
698b8e80941Smrg   case nir_op_imin:
699b8e80941Smrg   case nir_op_imax:
700b8e80941Smrg   case nir_op_umin:
701b8e80941Smrg   case nir_op_umax:
702b8e80941Smrg      return nir_lower_minmax64;
703b8e80941Smrg   case nir_op_iabs:
704b8e80941Smrg      return nir_lower_iabs64;
705b8e80941Smrg   case nir_op_ineg:
706b8e80941Smrg      return nir_lower_ineg64;
707b8e80941Smrg   case nir_op_iand:
708b8e80941Smrg   case nir_op_ior:
709b8e80941Smrg   case nir_op_ixor:
710b8e80941Smrg   case nir_op_inot:
711b8e80941Smrg      return nir_lower_logic64;
712b8e80941Smrg   case nir_op_ishl:
713b8e80941Smrg   case nir_op_ishr:
714b8e80941Smrg   case nir_op_ushr:
715b8e80941Smrg      return nir_lower_shift64;
716b8e80941Smrg   case nir_op_extract_u8:
717b8e80941Smrg   case nir_op_extract_i8:
718b8e80941Smrg   case nir_op_extract_u16:
719b8e80941Smrg   case nir_op_extract_i16:
720b8e80941Smrg      return nir_lower_extract64;
721b8e80941Smrg   default:
722b8e80941Smrg      return 0;
723b8e80941Smrg   }
724b8e80941Smrg}
725b8e80941Smrg
726b8e80941Smrgstatic nir_ssa_def *
727b8e80941Smrglower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
728b8e80941Smrg{
729b8e80941Smrg   nir_ssa_def *src[4];
730b8e80941Smrg   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
731b8e80941Smrg      src[i] = nir_ssa_for_alu_src(b, alu, i);
732b8e80941Smrg
733b8e80941Smrg   switch (alu->op) {
734b8e80941Smrg   case nir_op_imul:
735b8e80941Smrg      return lower_imul64(b, src[0], src[1]);
736b8e80941Smrg   case nir_op_imul_2x32_64:
737b8e80941Smrg      return lower_mul_2x32_64(b, src[0], src[1], true);
738b8e80941Smrg   case nir_op_umul_2x32_64:
739b8e80941Smrg      return lower_mul_2x32_64(b, src[0], src[1], false);
740b8e80941Smrg   case nir_op_imul_high:
741b8e80941Smrg      return lower_mul_high64(b, src[0], src[1], true);
742b8e80941Smrg   case nir_op_umul_high:
743b8e80941Smrg      return lower_mul_high64(b, src[0], src[1], false);
744b8e80941Smrg   case nir_op_isign:
745b8e80941Smrg      return lower_isign64(b, src[0]);
746b8e80941Smrg   case nir_op_udiv:
747b8e80941Smrg      return lower_udiv64(b, src[0], src[1]);
748b8e80941Smrg   case nir_op_idiv:
749b8e80941Smrg      return lower_idiv64(b, src[0], src[1]);
750b8e80941Smrg   case nir_op_umod:
751b8e80941Smrg      return lower_umod64(b, src[0], src[1]);
752b8e80941Smrg   case nir_op_imod:
753b8e80941Smrg      return lower_imod64(b, src[0], src[1]);
754b8e80941Smrg   case nir_op_irem:
755b8e80941Smrg      return lower_irem64(b, src[0], src[1]);
756b8e80941Smrg   case nir_op_b2i64:
757b8e80941Smrg      return lower_b2i64(b, src[0]);
758b8e80941Smrg   case nir_op_i2b1:
759b8e80941Smrg      return lower_i2b(b, src[0]);
760b8e80941Smrg   case nir_op_i2i8:
761b8e80941Smrg      return lower_i2i8(b, src[0]);
762b8e80941Smrg   case nir_op_i2i16:
763b8e80941Smrg      return lower_i2i16(b, src[0]);
764b8e80941Smrg   case nir_op_i2i32:
765b8e80941Smrg      return lower_i2i32(b, src[0]);
766b8e80941Smrg   case nir_op_i2i64:
767b8e80941Smrg      return lower_i2i64(b, src[0]);
768b8e80941Smrg   case nir_op_u2u8:
769b8e80941Smrg      return lower_u2u8(b, src[0]);
770b8e80941Smrg   case nir_op_u2u16:
771b8e80941Smrg      return lower_u2u16(b, src[0]);
772b8e80941Smrg   case nir_op_u2u32:
773b8e80941Smrg      return lower_u2u32(b, src[0]);
774b8e80941Smrg   case nir_op_u2u64:
775b8e80941Smrg      return lower_u2u64(b, src[0]);
776b8e80941Smrg   case nir_op_bcsel:
777b8e80941Smrg      return lower_bcsel64(b, src[0], src[1], src[2]);
778b8e80941Smrg   case nir_op_ieq:
779b8e80941Smrg   case nir_op_ine:
780b8e80941Smrg   case nir_op_ult:
781b8e80941Smrg   case nir_op_ilt:
782b8e80941Smrg   case nir_op_uge:
783b8e80941Smrg   case nir_op_ige:
784b8e80941Smrg      return lower_int64_compare(b, alu->op, src[0], src[1]);
785b8e80941Smrg   case nir_op_iadd:
786b8e80941Smrg      return lower_iadd64(b, src[0], src[1]);
787b8e80941Smrg   case nir_op_isub:
788b8e80941Smrg      return lower_isub64(b, src[0], src[1]);
789b8e80941Smrg   case nir_op_imin:
790b8e80941Smrg      return lower_imin64(b, src[0], src[1]);
791b8e80941Smrg   case nir_op_imax:
792b8e80941Smrg      return lower_imax64(b, src[0], src[1]);
793b8e80941Smrg   case nir_op_umin:
794b8e80941Smrg      return lower_umin64(b, src[0], src[1]);
795b8e80941Smrg   case nir_op_umax:
796b8e80941Smrg      return lower_umax64(b, src[0], src[1]);
797b8e80941Smrg   case nir_op_iabs:
798b8e80941Smrg      return lower_iabs64(b, src[0]);
799b8e80941Smrg   case nir_op_ineg:
800b8e80941Smrg      return lower_ineg64(b, src[0]);
801b8e80941Smrg   case nir_op_iand:
802b8e80941Smrg      return lower_iand64(b, src[0], src[1]);
803b8e80941Smrg   case nir_op_ior:
804b8e80941Smrg      return lower_ior64(b, src[0], src[1]);
805b8e80941Smrg   case nir_op_ixor:
806b8e80941Smrg      return lower_ixor64(b, src[0], src[1]);
807b8e80941Smrg   case nir_op_inot:
808b8e80941Smrg      return lower_inot64(b, src[0]);
809b8e80941Smrg   case nir_op_ishl:
810b8e80941Smrg      return lower_ishl64(b, src[0], src[1]);
811b8e80941Smrg   case nir_op_ishr:
812b8e80941Smrg      return lower_ishr64(b, src[0], src[1]);
813b8e80941Smrg   case nir_op_ushr:
814b8e80941Smrg      return lower_ushr64(b, src[0], src[1]);
815b8e80941Smrg   case nir_op_extract_u8:
816b8e80941Smrg   case nir_op_extract_i8:
817b8e80941Smrg   case nir_op_extract_u16:
818b8e80941Smrg   case nir_op_extract_i16:
819b8e80941Smrg      return lower_extract(b, alu->op, src[0], src[1]);
820b8e80941Smrg   default:
821b8e80941Smrg      unreachable("Invalid ALU opcode to lower");
822b8e80941Smrg   }
823b8e80941Smrg}
824b8e80941Smrg
825b8e80941Smrgstatic bool
826b8e80941Smrglower_int64_impl(nir_function_impl *impl, nir_lower_int64_options options)
827b8e80941Smrg{
828b8e80941Smrg   nir_builder b;
829b8e80941Smrg   nir_builder_init(&b, impl);
830b8e80941Smrg
831b8e80941Smrg   bool progress = false;
832b8e80941Smrg   nir_foreach_block(block, impl) {
833b8e80941Smrg      nir_foreach_instr_safe(instr, block) {
834b8e80941Smrg         if (instr->type != nir_instr_type_alu)
835b8e80941Smrg            continue;
836b8e80941Smrg
837b8e80941Smrg         nir_alu_instr *alu = nir_instr_as_alu(instr);
838b8e80941Smrg         switch (alu->op) {
839b8e80941Smrg         case nir_op_i2b1:
840b8e80941Smrg         case nir_op_i2i32:
841b8e80941Smrg         case nir_op_u2u32:
842b8e80941Smrg            assert(alu->src[0].src.is_ssa);
843b8e80941Smrg            if (alu->src[0].src.ssa->bit_size != 64)
844b8e80941Smrg               continue;
845b8e80941Smrg            break;
846b8e80941Smrg         case nir_op_bcsel:
847b8e80941Smrg            assert(alu->src[1].src.is_ssa);
848b8e80941Smrg            assert(alu->src[2].src.is_ssa);
849b8e80941Smrg            assert(alu->src[1].src.ssa->bit_size ==
850b8e80941Smrg                   alu->src[2].src.ssa->bit_size);
851b8e80941Smrg            if (alu->src[1].src.ssa->bit_size != 64)
852b8e80941Smrg               continue;
853b8e80941Smrg            break;
854b8e80941Smrg         case nir_op_ieq:
855b8e80941Smrg         case nir_op_ine:
856b8e80941Smrg         case nir_op_ult:
857b8e80941Smrg         case nir_op_ilt:
858b8e80941Smrg         case nir_op_uge:
859b8e80941Smrg         case nir_op_ige:
860b8e80941Smrg            assert(alu->src[0].src.is_ssa);
861b8e80941Smrg            assert(alu->src[1].src.is_ssa);
862b8e80941Smrg            assert(alu->src[0].src.ssa->bit_size ==
863b8e80941Smrg                   alu->src[1].src.ssa->bit_size);
864b8e80941Smrg            if (alu->src[0].src.ssa->bit_size != 64)
865b8e80941Smrg               continue;
866b8e80941Smrg            break;
867b8e80941Smrg         default:
868b8e80941Smrg            assert(alu->dest.dest.is_ssa);
869b8e80941Smrg            if (alu->dest.dest.ssa.bit_size != 64)
870b8e80941Smrg               continue;
871b8e80941Smrg            break;
872b8e80941Smrg         }
873b8e80941Smrg
874b8e80941Smrg         if (!(options & nir_lower_int64_op_to_options_mask(alu->op)))
875b8e80941Smrg            continue;
876b8e80941Smrg
877b8e80941Smrg         b.cursor = nir_before_instr(instr);
878b8e80941Smrg
879b8e80941Smrg         nir_ssa_def *lowered = lower_int64_alu_instr(&b, alu);
880b8e80941Smrg         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa,
881b8e80941Smrg                                  nir_src_for_ssa(lowered));
882b8e80941Smrg         nir_instr_remove(&alu->instr);
883b8e80941Smrg         progress = true;
884b8e80941Smrg      }
885b8e80941Smrg   }
886b8e80941Smrg
887b8e80941Smrg   if (progress) {
888b8e80941Smrg      nir_metadata_preserve(impl, nir_metadata_none);
889b8e80941Smrg   } else {
890b8e80941Smrg#ifndef NDEBUG
891b8e80941Smrg      impl->valid_metadata &= ~nir_metadata_not_properly_reset;
892b8e80941Smrg#endif
893b8e80941Smrg   }
894b8e80941Smrg
895b8e80941Smrg   return progress;
896b8e80941Smrg}
897b8e80941Smrg
898b8e80941Smrgbool
899b8e80941Smrgnir_lower_int64(nir_shader *shader, nir_lower_int64_options options)
900b8e80941Smrg{
901b8e80941Smrg   bool progress = false;
902b8e80941Smrg
903b8e80941Smrg   nir_foreach_function(function, shader) {
904b8e80941Smrg      if (function->impl)
905b8e80941Smrg         progress |= lower_int64_impl(function->impl, options);
906b8e80941Smrg   }
907b8e80941Smrg
908b8e80941Smrg   return progress;
909b8e80941Smrg}
910