1/*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#include "si_shader_internal.h"
26#include "si_pipe.h"
27#include "sid.h"
28#include "tgsi/tgsi_build.h"
29#include "tgsi/tgsi_util.h"
30#include "ac_llvm_util.h"
31
32static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
33			   struct lp_build_emit_data *emit_data,
34			   LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
35			   LLVMValueRef *fmask_ptr);
36
37/**
38 * Given a v8i32 resource descriptor for a buffer, extract the size of the
39 * buffer in number of elements and return it as an i32.
40 */
41static LLVMValueRef get_buffer_size(
42	struct lp_build_tgsi_context *bld_base,
43	LLVMValueRef descriptor)
44{
45	struct si_shader_context *ctx = si_shader_context(bld_base);
46	LLVMBuilderRef builder = ctx->ac.builder;
47	LLVMValueRef size =
48		LLVMBuildExtractElement(builder, descriptor,
49					LLVMConstInt(ctx->i32, 2, 0), "");
50
51	if (ctx->screen->info.chip_class == VI) {
52		/* On VI, the descriptor contains the size in bytes,
53		 * but TXQ must return the size in elements.
54		 * The stride is always non-zero for resources using TXQ.
55		 */
56		LLVMValueRef stride =
57			LLVMBuildExtractElement(builder, descriptor,
58						ctx->i32_1, "");
59		stride = LLVMBuildLShr(builder, stride,
60				       LLVMConstInt(ctx->i32, 16, 0), "");
61		stride = LLVMBuildAnd(builder, stride,
62				      LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
63
64		size = LLVMBuildUDiv(builder, size, stride, "");
65	}
66
67	return size;
68}
69
70static LLVMValueRef
71shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
72			 const struct tgsi_full_src_register *reg,
73			 bool ubo)
74{
75	LLVMValueRef index;
76
77	if (!reg->Register.Indirect) {
78		index = LLVMConstInt(ctx->i32, reg->Register.Index, false);
79	} else {
80		index = si_get_indirect_index(ctx, &reg->Indirect,
81					      1, reg->Register.Index);
82	}
83
84	if (ubo)
85		return ctx->abi.load_ubo(&ctx->abi, index);
86	else
87		return ctx->abi.load_ssbo(&ctx->abi, index, false);
88}
89
90static enum ac_image_dim
91ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
92{
93	switch (target) {
94	case TGSI_TEXTURE_1D:
95	case TGSI_TEXTURE_SHADOW1D:
96		if (screen->info.chip_class >= GFX9)
97			return ac_image_2d;
98		return ac_image_1d;
99	case TGSI_TEXTURE_2D:
100	case TGSI_TEXTURE_SHADOW2D:
101	case TGSI_TEXTURE_RECT:
102	case TGSI_TEXTURE_SHADOWRECT:
103		return ac_image_2d;
104	case TGSI_TEXTURE_3D:
105		return ac_image_3d;
106	case TGSI_TEXTURE_CUBE:
107	case TGSI_TEXTURE_SHADOWCUBE:
108	case TGSI_TEXTURE_CUBE_ARRAY:
109	case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
110		return ac_image_cube;
111	case TGSI_TEXTURE_1D_ARRAY:
112	case TGSI_TEXTURE_SHADOW1D_ARRAY:
113		if (screen->info.chip_class >= GFX9)
114			return ac_image_2darray;
115		return ac_image_1darray;
116	case TGSI_TEXTURE_2D_ARRAY:
117	case TGSI_TEXTURE_SHADOW2D_ARRAY:
118		return ac_image_2darray;
119	case TGSI_TEXTURE_2D_MSAA:
120		return ac_image_2dmsaa;
121	case TGSI_TEXTURE_2D_ARRAY_MSAA:
122		return ac_image_2darraymsaa;
123	default:
124		unreachable("unhandled texture type");
125	}
126}
127
128static enum ac_image_dim
129ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
130{
131	enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target);
132
133	/* Match the resource type set in the descriptor. */
134	if (dim == ac_image_cube ||
135	    (screen->info.chip_class <= VI && dim == ac_image_3d))
136		dim = ac_image_2darray;
137	else if (target == TGSI_TEXTURE_2D && screen->info.chip_class >= GFX9) {
138		/* When a single layer of a 3D texture is bound, the shader
139		 * will refer to a 2D target, but the descriptor has a 3D type.
140		 * Since the HW ignores BASE_ARRAY in this case, we need to
141		 * send 3 coordinates. This doesn't hurt when the underlying
142		 * texture is non-3D.
143		 */
144		dim = ac_image_3d;
145	}
146
147	return dim;
148}
149
150/**
151 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
152 *
153 * At least on Tonga, executing image stores on images with DCC enabled and
154 * non-trivial can eventually lead to lockups. This can occur when an
155 * application binds an image as read-only but then uses a shader that writes
156 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
157 * program termination) in this case, but it doesn't cost much to be a bit
158 * nicer: disabling DCC in the shader still leads to undefined results but
159 * avoids the lockup.
160 */
161static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
162				  LLVMValueRef rsrc)
163{
164	if (ctx->screen->info.chip_class <= CIK) {
165		return rsrc;
166	} else {
167		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
168		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
169		LLVMValueRef tmp;
170
171		tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
172		tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
173		return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
174	}
175}
176
177LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
178				LLVMValueRef list, LLVMValueRef index,
179				enum ac_descriptor_type desc_type, bool dcc_off,
180				bool bindless)
181{
182	LLVMBuilderRef builder = ctx->ac.builder;
183	LLVMValueRef rsrc;
184
185	if (desc_type == AC_DESC_BUFFER) {
186		index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
187				      ctx->i32_1);
188		list = LLVMBuildPointerCast(builder, list,
189					    ac_array_in_const32_addr_space(ctx->v4i32), "");
190	} else {
191		assert(desc_type == AC_DESC_IMAGE);
192	}
193
194	if (bindless)
195		rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
196	else
197		rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
198
199	if (desc_type == AC_DESC_IMAGE && dcc_off)
200		rsrc = force_dcc_off(ctx, rsrc);
201	return rsrc;
202}
203
204/**
205 * Load the resource descriptor for \p image.
206 */
207static void
208image_fetch_rsrc(
209	struct lp_build_tgsi_context *bld_base,
210	const struct tgsi_full_src_register *image,
211	bool is_store, unsigned target,
212	LLVMValueRef *rsrc)
213{
214	struct si_shader_context *ctx = si_shader_context(bld_base);
215	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
216					     ctx->param_samplers_and_images);
217	LLVMValueRef index;
218	bool dcc_off = is_store;
219
220	if (!image->Register.Indirect) {
221		const struct tgsi_shader_info *info = bld_base->info;
222		unsigned images_writemask = info->images_store |
223					    info->images_atomic;
224
225		index = LLVMConstInt(ctx->i32,
226				     si_get_image_slot(image->Register.Index), 0);
227
228		if (images_writemask & (1 << image->Register.Index))
229			dcc_off = true;
230	} else {
231		/* From the GL_ARB_shader_image_load_store extension spec:
232		 *
233		 *    If a shader performs an image load, store, or atomic
234		 *    operation using an image variable declared as an array,
235		 *    and if the index used to select an individual element is
236		 *    negative or greater than or equal to the size of the
237		 *    array, the results of the operation are undefined but may
238		 *    not lead to termination.
239		 */
240		index = si_get_bounded_indirect_index(ctx, &image->Indirect,
241						      image->Register.Index,
242						      ctx->num_images);
243		index = LLVMBuildSub(ctx->ac.builder,
244				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
245				     index, "");
246	}
247
248	bool bindless = false;
249
250	if (image->Register.File != TGSI_FILE_IMAGE) {
251		/* Bindless descriptors are accessible from a different pair of
252		 * user SGPR indices.
253		 */
254		rsrc_ptr = LLVMGetParam(ctx->main_fn,
255					ctx->param_bindless_samplers_and_images);
256		index = lp_build_emit_fetch_src(bld_base, image,
257						TGSI_TYPE_UNSIGNED, 0);
258
259		/* For simplicity, bindless image descriptors use fixed
260		 * 16-dword slots for now.
261		 */
262		index = LLVMBuildMul(ctx->ac.builder, index,
263				     LLVMConstInt(ctx->i32, 2, 0), "");
264		bindless = true;
265	}
266
267	*rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
268				   target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
269				   dcc_off, bindless);
270}
271
/**
 * Fetch the coordinate operands (plus the sample index for MSAA targets)
 * for an image instruction from TGSI source \p src into \p coords, as
 * integers.
 *
 * \param desc  the image descriptor; on GFX9 it is read to apply the
 *              BASE_ARRAY workaround for 2D targets
 */
static void image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src, LLVMValueRef desc,
		LLVMValueRef *coords)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->ac.builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef tmp;
	int chan;

	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		/* Need the sample index as well. */
		num_coords++;
	}

	/* Fetch each coordinate channel and convert it to an integer. */
	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		tmp = ac_to_integer(&ctx->ac, tmp);
		coords[chan] = tmp;
	}

	if (ctx->screen->info.chip_class >= GFX9) {
		/* 1D textures are allocated and used as 2D on GFX9. */
		if (target == TGSI_TEXTURE_1D) {
			coords[1] = ctx->i32_0;
		} else if (target == TGSI_TEXTURE_1D_ARRAY) {
			/* Shift the layer into the 3rd coordinate and insert
			 * a zero Y coordinate. */
			coords[2] = coords[1];
			coords[1] = ctx->i32_0;
		} else if (target == TGSI_TEXTURE_2D) {
			/* The hw can't bind a slice of a 3D image as a 2D
			 * image, because it ignores BASE_ARRAY if the target
			 * is 3D. The workaround is to read BASE_ARRAY and set
			 * it as the 3rd address operand for all 2D images.
			 */
			LLVMValueRef first_layer, const5, mask;

			/* BASE_ARRAY lives in dword 5 of the descriptor. */
			const5 = LLVMConstInt(ctx->i32, 5, 0);
			mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
			first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
			first_layer = LLVMBuildAnd(builder, first_layer, mask, "");

			coords[2] = first_layer;
		}
	}
}
321
322static unsigned get_cache_policy(struct si_shader_context *ctx,
323				 const struct tgsi_full_instruction *inst,
324				 bool atomic, bool may_store_unaligned,
325				 bool writeonly_memory)
326{
327	unsigned cache_policy = 0;
328
329	if (!atomic &&
330	    /* SI has a TC L1 bug causing corruption of 8bit/16bit stores.
331	     * All store opcodes not aligned to a dword are affected.
332	     * The only way to get unaligned stores in radeonsi is through
333	     * shader images. */
334	    ((may_store_unaligned && ctx->screen->info.chip_class == SI) ||
335	     /* If this is write-only, don't keep data in L1 to prevent
336	      * evicting L1 cache lines that may be needed by other
337	      * instructions. */
338	     writeonly_memory ||
339	     inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE)))
340		cache_policy |= ac_glc;
341
342	if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
343		cache_policy |= ac_slc;
344
345	return cache_policy;
346}
347
348static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
349                                   const struct tgsi_full_instruction *inst,
350                                   LLVMTypeRef type, int arg)
351{
352	LLVMBuilderRef builder = ctx->ac.builder;
353	LLVMValueRef offset, ptr;
354	int addr_space;
355
356	offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
357	offset = ac_to_integer(&ctx->ac, offset);
358
359	ptr = ctx->ac.lds;
360	ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
361	addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
362	ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
363
364	return ptr;
365}
366
367static void load_emit_memory(
368		struct si_shader_context *ctx,
369		struct lp_build_emit_data *emit_data)
370{
371	const struct tgsi_full_instruction *inst = emit_data->inst;
372	unsigned writemask = inst->Dst[0].Register.WriteMask;
373	LLVMValueRef channels[4], ptr, derived_ptr, index;
374	int chan;
375
376	ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
377
378	for (chan = 0; chan < 4; ++chan) {
379		if (!(writemask & (1 << chan))) {
380			channels[chan] = LLVMGetUndef(ctx->f32);
381			continue;
382		}
383
384		index = LLVMConstInt(ctx->i32, chan, 0);
385		derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
386		channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
387	}
388	emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
389}
390
391/**
392 * Return true if the memory accessed by a LOAD or STORE instruction is
393 * read-only or write-only, respectively.
394 *
395 * \param shader_buffers_reverse_access_mask
396 *	For LOAD, set this to (store | atomic) slot usage in the shader.
397 *	For STORE, set this to (load | atomic) slot usage in the shader.
398 * \param images_reverse_access_mask  Same as above, but for images.
399 * \param bindless_buffer_reverse_access_mask  Same as above, but for bindless image buffers.
400 * \param bindless_image_reverse_access_mask   Same as above, but for bindless images.
401 */
402static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
403				  const struct tgsi_shader_info *info,
404				  unsigned shader_buffers_reverse_access_mask,
405				  unsigned images_reverse_access_mask,
406				  bool bindless_buffer_reverse_access_mask,
407				  bool bindless_image_reverse_access_mask)
408{
409	enum tgsi_file_type resource_file;
410	unsigned resource_index;
411	bool resource_indirect;
412
413	if (inst->Instruction.Opcode == TGSI_OPCODE_STORE) {
414		resource_file = inst->Dst[0].Register.File;
415		resource_index = inst->Dst[0].Register.Index;
416		resource_indirect = inst->Dst[0].Register.Indirect;
417	} else {
418		resource_file = inst->Src[0].Register.File;
419		resource_index = inst->Src[0].Register.Index;
420		resource_indirect = inst->Src[0].Register.Indirect;
421	}
422
423	assert(resource_file == TGSI_FILE_BUFFER ||
424	       resource_file == TGSI_FILE_IMAGE ||
425	       /* bindless image */
426	       resource_file == TGSI_FILE_INPUT ||
427	       resource_file == TGSI_FILE_OUTPUT ||
428	       resource_file == TGSI_FILE_CONSTANT ||
429	       resource_file == TGSI_FILE_TEMPORARY ||
430	       resource_file == TGSI_FILE_IMMEDIATE);
431
432	assert(resource_file != TGSI_FILE_BUFFER ||
433	       inst->Memory.Texture == TGSI_TEXTURE_BUFFER);
434
435	bool bindless = resource_file != TGSI_FILE_BUFFER &&
436			resource_file != TGSI_FILE_IMAGE;
437
438	/* RESTRICT means NOALIAS.
439	 * If there are no writes, we can assume the accessed memory is read-only.
440	 * If there are no reads, we can assume the accessed memory is write-only.
441	 */
442	if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT && !bindless) {
443		unsigned reverse_access_mask;
444
445		if (resource_file == TGSI_FILE_BUFFER) {
446			reverse_access_mask = shader_buffers_reverse_access_mask;
447		} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
448			reverse_access_mask = info->images_buffers &
449					      images_reverse_access_mask;
450		} else {
451			reverse_access_mask = ~info->images_buffers &
452					      images_reverse_access_mask;
453		}
454
455		if (resource_indirect) {
456			if (!reverse_access_mask)
457				return true;
458		} else {
459			if (!(reverse_access_mask &
460			      (1u << resource_index)))
461				return true;
462		}
463	}
464
465	/* If there are no buffer writes (for both shader buffers & image
466	 * buffers), it implies that buffer memory is read-only.
467	 * If there are no buffer reads (for both shader buffers & image
468	 * buffers), it implies that buffer memory is write-only.
469	 *
470	 * Same for the case when there are no writes/reads for non-buffer
471	 * images.
472	 */
473	if (resource_file == TGSI_FILE_BUFFER ||
474	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
475		if (!shader_buffers_reverse_access_mask &&
476		    !(info->images_buffers & images_reverse_access_mask) &&
477		    !bindless_buffer_reverse_access_mask)
478			return true;
479	} else {
480		if (!(~info->images_buffers & images_reverse_access_mask) &&
481		    !bindless_image_reverse_access_mask)
482			return true;
483	}
484	return false;
485}
486
/**
 * Emit a TGSI LOAD from shared memory, a constant/shader buffer, an image
 * buffer, or an image, storing the result in emit_data->output.
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction * inst = emit_data->inst;
	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	bool can_speculate = false;
	LLVMValueRef vindex = ctx->i32_0;
	LLVMValueRef voffset = ctx->i32_0;
	struct ac_image_args args = {};

	/* Shared (LDS) memory takes a separate path. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Fetch the resource descriptor and the address/coordinates. */
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
		bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF;
		args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo);
		voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
	} else {
		unsigned target = inst->Memory.Texture;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &args.resource);
		image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
		vindex = args.coords[0]; /* for buffers only */
	}

	/* Constant buffers return early: no cache-policy or speculation
	 * analysis is needed for them. */
	if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
		emit_data->output[emit_data->chan] =
			ac_build_buffer_load(&ctx->ac, args.resource,
					     util_last_bit(inst->Dst[0].Register.WriteMask),
					     NULL, voffset, NULL, 0, 0, 0, true, true);
		return;
	}

	/* Volatile loads wait for outstanding VMEM accesses first. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		ac_build_waitcnt(&ctx->ac, VM_CNT);

	/* A load may be speculated if nothing in the shader can write the
	 * accessed memory (i.e. it is effectively read-only). */
	can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
			  is_oneway_access_only(inst, info,
						info->shader_buffers_store |
						info->shader_buffers_atomic,
						info->images_store |
						info->images_atomic,
						info->uses_bindless_buffer_store |
						info->uses_bindless_buffer_atomic,
						info->uses_bindless_image_store |
						info->uses_bindless_image_atomic);
	args.cache_policy = get_cache_policy(ctx, inst, false, false, false);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Don't use SMEM for shader buffer loads, because LLVM doesn't
		 * select SMEM for SI.load.const with a non-constant offset, and
		 * constant offsets practically don't exist with shader buffers.
		 *
		 * Also, SI.load.const doesn't use inst_offset when it's lowered
		 * to VMEM, so we just end up with more VALU instructions in the end
		 * and no benefit.
		 *
		 * TODO: Remove this line once LLVM can select SMEM with a non-constant
		 *       offset, and can derive inst_offset when VMEM is selected.
		 *       After that, si_memory_barrier should invalidate sL1 for shader
		 *       buffers.
		 */
		emit_data->output[emit_data->chan] =
			ac_build_buffer_load(&ctx->ac, args.resource,
					     util_last_bit(inst->Dst[0].Register.WriteMask),
					     NULL, voffset, NULL, 0,
					     !!(args.cache_policy & ac_glc),
					     !!(args.cache_policy & ac_slc),
					     can_speculate, false);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* Image buffer: typed buffer load, expanded to vec4. */
		unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
		LLVMValueRef result =
			ac_build_buffer_load_format(&ctx->ac,
						    args.resource,
						    vindex,
						    ctx->i32_0,
						    num_channels,
						    !!(args.cache_policy & ac_glc),
						    can_speculate);
		emit_data->output[emit_data->chan] =
			ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
	} else {
		/* Regular image: emit an image load. */
		args.opcode = ac_image_load;
		args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
		args.attributes = ac_get_load_intr_attribs(can_speculate);
		args.dmask = 0xf;

		emit_data->output[emit_data->chan] =
			ac_build_image_opcode(&ctx->ac, &args);
	}
}
587
/**
 * Emit the stores for a TGSI STORE to a shader buffer.
 *
 * \p writemask is split into runs of consecutive channels, and one
 * llvm.amdgcn.buffer.store.{f32,v2f32,v4f32} intrinsic is emitted per run.
 */
static void store_emit_buffer(struct si_shader_context *ctx,
			      LLVMValueRef resource,
			      unsigned writemask,
			      LLVMValueRef value,
			      LLVMValueRef voffset,
			      unsigned cache_policy,
			      bool writeonly_memory)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef base_data = value;
	LLVMValueRef base_offset = voffset;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data, voff;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Re-queue the third channel for the next iteration. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		/* Extract the channels for this run from the vec4 value. */
		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMValueRef values[2] = {
				LLVMBuildExtractElement(builder, base_data,
							LLVMConstInt(ctx->i32, start, 0), ""),
				LLVMBuildExtractElement(builder, base_data,
							LLVMConstInt(ctx->i32, start + 1, 0), ""),
			};

			data = ac_build_gather_values(&ctx->ac, values, 2);
			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				LLVMConstInt(ctx->i32, start, 0), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset by the position of the run's
		 * first channel (4 bytes per channel). */
		voff = base_offset;
		if (start != 0) {
			voff = LLVMBuildAdd(
				builder, voff,
				LLVMConstInt(ctx->i32, start * 4, 0), "");
		}

		LLVMValueRef args[] = {
			data,
			resource,
			ctx->i32_0, /* vindex */
			voff,
			LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
			LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0),
		};
		ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->voidt, args, 6,
				   ac_get_store_intr_attribs(writeonly_memory));
	}
}
654
655static void store_emit_memory(
656		struct si_shader_context *ctx,
657		struct lp_build_emit_data *emit_data)
658{
659	const struct tgsi_full_instruction *inst = emit_data->inst;
660	LLVMBuilderRef builder = ctx->ac.builder;
661	unsigned writemask = inst->Dst[0].Register.WriteMask;
662	LLVMValueRef ptr, derived_ptr, data, index;
663	int chan;
664
665	ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
666
667	for (chan = 0; chan < 4; ++chan) {
668		if (!(writemask & (1 << chan))) {
669			continue;
670		}
671		data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
672		index = LLVMConstInt(ctx->i32, chan, 0);
673		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
674		LLVMBuildStore(builder, data, derived_ptr);
675	}
676}
677
678static void store_emit(
679		const struct lp_build_tgsi_action *action,
680		struct lp_build_tgsi_context *bld_base,
681		struct lp_build_emit_data *emit_data)
682{
683	struct si_shader_context *ctx = si_shader_context(bld_base);
684	const struct tgsi_full_instruction * inst = emit_data->inst;
685	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
686	struct tgsi_full_src_register resource_reg =
687		tgsi_full_src_register_from_dst(&inst->Dst[0]);
688	unsigned target = inst->Memory.Texture;
689
690	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
691		store_emit_memory(ctx, emit_data);
692		return;
693	}
694
695	bool writeonly_memory = is_oneway_access_only(inst, info,
696						      info->shader_buffers_load |
697						      info->shader_buffers_atomic,
698						      info->images_load |
699						      info->images_atomic,
700						      info->uses_bindless_buffer_load |
701						      info->uses_bindless_buffer_atomic,
702						      info->uses_bindless_image_load |
703						      info->uses_bindless_image_atomic);
704	LLVMValueRef chans[4];
705	LLVMValueRef vindex = ctx->i32_0;
706	LLVMValueRef voffset = ctx->i32_0;
707	struct ac_image_args args = {};
708
709	for (unsigned chan = 0; chan < 4; ++chan)
710		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
711
712	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
713		args.resource = shader_buffer_fetch_rsrc(ctx, &resource_reg, false);
714		voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 0, 0));
715	} else {
716		image_fetch_rsrc(bld_base, &resource_reg, true, target, &args.resource);
717		image_fetch_coords(bld_base, inst, 0, args.resource, args.coords);
718		vindex = args.coords[0]; /* for buffers only */
719	}
720
721	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
722		ac_build_waitcnt(&ctx->ac, VM_CNT);
723
724	bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
725	args.cache_policy = get_cache_policy(ctx, inst,
726					     false, /* atomic */
727					     is_image, /* may_store_unaligned */
728					     writeonly_memory);
729
730	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
731		store_emit_buffer(ctx, args.resource, inst->Dst[0].Register.WriteMask,
732				  ac_build_gather_values(&ctx->ac, chans, 4),
733				  voffset, args.cache_policy, writeonly_memory);
734		return;
735	}
736
737	if (target == TGSI_TEXTURE_BUFFER) {
738		unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
739		num_channels = util_next_power_of_two(num_channels);
740
741		LLVMValueRef buf_args[6] = {
742			ac_build_gather_values(&ctx->ac, chans, 4),
743			args.resource,
744			vindex,
745			ctx->i32_0, /* voffset */
746		};
747
748		if (HAVE_LLVM >= 0x0800) {
749			buf_args[4] = ctx->i32_0; /* soffset */
750			buf_args[5] = LLVMConstInt(ctx->i1, args.cache_policy, 0);
751		} else {
752			buf_args[4] = LLVMConstInt(ctx->i1, !!(args.cache_policy & ac_glc), 0);
753			buf_args[5] = LLVMConstInt(ctx->i1, !!(args.cache_policy & ac_slc), 0);
754		}
755
756		const char *types[] = { "f32", "v2f32", "v4f32" };
757		char name[128];
758
759		snprintf(name, sizeof(name), "%s.%s",
760			 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.struct.buffer.store.format" :
761					       "llvm.amdgcn.buffer.store.format",
762			 types[CLAMP(num_channels, 1, 3) - 1]);
763
764		emit_data->output[emit_data->chan] = ac_build_intrinsic(
765			&ctx->ac,
766			name,
767			ctx->voidt, buf_args, 6,
768			ac_get_store_intr_attribs(writeonly_memory));
769	} else {
770		args.opcode = ac_image_store;
771		args.data[0] = ac_build_gather_values(&ctx->ac, chans, 4);
772		args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
773		args.attributes = ac_get_store_intr_attribs(writeonly_memory);
774		args.dmask = 0xf;
775
776		emit_data->output[emit_data->chan] =
777			ac_build_image_opcode(&ctx->ac, &args);
778	}
779}
780
781static void atomic_emit_memory(struct si_shader_context *ctx,
782                               struct lp_build_emit_data *emit_data) {
783	LLVMBuilderRef builder = ctx->ac.builder;
784	const struct tgsi_full_instruction * inst = emit_data->inst;
785	LLVMValueRef ptr, result, arg;
786	const char *sync_scope = HAVE_LLVM >= 0x0900 ? "workgroup-one-as" : "workgroup";
787
788	ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
789
790	arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
791	arg = ac_to_integer(&ctx->ac, arg);
792
793	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
794		LLVMValueRef new_data;
795		new_data = lp_build_emit_fetch(&ctx->bld_base,
796		                               inst, 3, 0);
797
798		new_data = ac_to_integer(&ctx->ac, new_data);
799
800		result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, new_data,
801						  sync_scope);
802		result = LLVMBuildExtractValue(builder, result, 0, "");
803	} else {
804		LLVMAtomicRMWBinOp op;
805
806		switch(inst->Instruction.Opcode) {
807			case TGSI_OPCODE_ATOMUADD:
808				op = LLVMAtomicRMWBinOpAdd;
809				break;
810			case TGSI_OPCODE_ATOMXCHG:
811				op = LLVMAtomicRMWBinOpXchg;
812				break;
813			case TGSI_OPCODE_ATOMAND:
814				op = LLVMAtomicRMWBinOpAnd;
815				break;
816			case TGSI_OPCODE_ATOMOR:
817				op = LLVMAtomicRMWBinOpOr;
818				break;
819			case TGSI_OPCODE_ATOMXOR:
820				op = LLVMAtomicRMWBinOpXor;
821				break;
822			case TGSI_OPCODE_ATOMUMIN:
823				op = LLVMAtomicRMWBinOpUMin;
824				break;
825			case TGSI_OPCODE_ATOMUMAX:
826				op = LLVMAtomicRMWBinOpUMax;
827				break;
828			case TGSI_OPCODE_ATOMIMIN:
829				op = LLVMAtomicRMWBinOpMin;
830				break;
831			case TGSI_OPCODE_ATOMIMAX:
832				op = LLVMAtomicRMWBinOpMax;
833				break;
834			default:
835				unreachable("unknown atomic opcode");
836		}
837
838		result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, sync_scope);
839	}
840	emit_data->output[emit_data->chan] =
841		LLVMBuildBitCast(builder, result, ctx->f32, "");
842}
843
/* Emit a TGSI atomic opcode (ATOMUADD, ATOMCAS, ATOMAND, ...) on a buffer,
 * image, or shared-memory resource.
 *
 * Buffer targets are lowered to llvm.amdgcn.[struct.]buffer.atomic.*
 * intrinsics; image targets go through ac_build_image_opcode. The i32
 * result is bitcast back to float via ac_to_float because TGSI outputs
 * are stored as f32 values.
 */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct ac_image_args args = {};
	unsigned num_data = 0;
	LLVMValueRef vindex = ctx->i32_0;
	LLVMValueRef voffset = ctx->i32_0;

	/* Atomics on shared (LDS) memory take a completely different path. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
		 * of arguments, which is reversed relative to TGSI (and GLSL)
		 */
		args.data[num_data++] =
			ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0));
	}

	/* The value to combine with memory (src2 in TGSI). */
	args.data[num_data++] =
		ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
	args.cache_policy = get_cache_policy(ctx, inst, true, false, false);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Shader storage buffer: src1 is the byte offset. */
		args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
		voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
	} else {
		/* Image: src1 holds the coordinates. */
		image_fetch_rsrc(bld_base, &inst->Src[0], true,
				inst->Memory.Texture, &args.resource);
		image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
		vindex = args.coords[0]; /* for buffers only */
	}

	/* LLVM >= 8: image buffers use the "struct" buffer atomic intrinsics,
	 * which take separate vindex/voffset/soffset and an i32 cache policy.
	 */
	if (HAVE_LLVM >= 0x0800 &&
	    inst->Src[0].Register.File != TGSI_FILE_BUFFER &&
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		LLVMValueRef buf_args[7];
		unsigned num_args = 0;

		buf_args[num_args++] = args.data[0];
		if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
			buf_args[num_args++] = args.data[1];

		buf_args[num_args++] = args.resource;
		buf_args[num_args++] = vindex;
		buf_args[num_args++] = voffset;
		buf_args[num_args++] = ctx->i32_0; /* soffset */
		buf_args[num_args++] = LLVMConstInt(ctx->i32, args.cache_policy & ac_slc, 0);

		char intrinsic_name[64];
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.struct.buffer.atomic.%s", action->intr_name);
		emit_data->output[emit_data->chan] =
			ac_to_float(&ctx->ac,
				    ac_build_intrinsic(&ctx->ac, intrinsic_name,
						       ctx->i32, buf_args, num_args, 0));
		return;
	}

	/* SSBOs always, and image buffers on older LLVM, use the legacy
	 * buffer atomic intrinsics with an i1 SLC argument.
	 */
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (HAVE_LLVM < 0x0800 &&
	     inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		LLVMValueRef buf_args[7];
		unsigned num_args = 0;

		buf_args[num_args++] = args.data[0];
		if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
			buf_args[num_args++] = args.data[1];

		buf_args[num_args++] = args.resource;
		buf_args[num_args++] = vindex;
		buf_args[num_args++] = voffset;
		buf_args[num_args++] = args.cache_policy & ac_slc ? ctx->i1true : ctx->i1false;

		char intrinsic_name[40];
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
		emit_data->output[emit_data->chan] =
			ac_to_float(&ctx->ac,
				    ac_build_intrinsic(&ctx->ac, intrinsic_name,
						       ctx->i32, buf_args, num_args, 0));
	} else {
		/* Non-buffer images: map the TGSI opcode to the common
		 * image-atomic description and emit an image opcode. */
		if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
			args.opcode = ac_image_atomic_cmpswap;
		} else {
			args.opcode = ac_image_atomic;
			switch (inst->Instruction.Opcode) {
			case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break;
			case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break;
			case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break;
			case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break;
			case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break;
			case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break;
			case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break;
			case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break;
			case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break;
			default: unreachable("unhandled image atomic");
			}
		}

		args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
		emit_data->output[emit_data->chan] =
			ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args));
	}
}
955
956static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
957				unsigned target, LLVMValueRef out)
958{
959	LLVMBuilderRef builder = ctx->ac.builder;
960
961	/* 1D textures are allocated and used as 2D on GFX9. */
962        if (ctx->screen->info.chip_class >= GFX9 &&
963	    (target == TGSI_TEXTURE_1D_ARRAY ||
964	     target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
965		LLVMValueRef layers =
966			LLVMBuildExtractElement(builder, out,
967						LLVMConstInt(ctx->i32, 2, 0), "");
968		out = LLVMBuildInsertElement(builder, out, layers,
969					     ctx->i32_1, "");
970	}
971
972	/* Divide the number of layers by 6 to get the number of cubes. */
973	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
974	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
975		LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
976
977		LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
978		z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
979
980		out = LLVMBuildInsertElement(builder, out, z, imm2, "");
981	}
982	return out;
983}
984
/* Emit TGSI_OPCODE_RESQ (image/SSBO size query) and TGSI_OPCODE_TXQ
 * (texture size query). Buffer targets read the size straight out of
 * the descriptor; everything else issues a resinfo image opcode and
 * fixes up the result with fix_resinfo().
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->ac.builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	/* TXQ keeps the resource in Src[1] (Src[0] is the LOD);
	 * RESQ keeps it in Src[0]. */
	const struct tgsi_full_src_register *reg =
		&inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0];

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false);

		/* The size is stored in dword 2 of the buffer descriptor. */
		emit_data->output[emit_data->chan] =
			LLVMBuildExtractElement(builder, rsrc,
						LLVMConstInt(ctx->i32, 2, 0), "");
		return;
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
	    inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
		LLVMValueRef rsrc;

		tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
		/* Read the size from the buffer descriptor directly. */
		emit_data->output[emit_data->chan] =
			get_buffer_size(bld_base, rsrc);
		return;
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		LLVMValueRef rsrc;

		/* Image buffers also answer the query from the descriptor. */
		image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc);
		emit_data->output[emit_data->chan] =
			get_buffer_size(bld_base, rsrc);
		return;
	}

	unsigned target;

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
		target = inst->Texture.Texture;
	} else {
		/* Query 3D images as 2D arrays, so the depth comes back in
		 * the layer component (presumably matching how image access
		 * treats 3D slices here — see image_fetch_rsrc users). */
		if (inst->Memory.Texture == TGSI_TEXTURE_3D)
			target = TGSI_TEXTURE_2D_ARRAY;
		else
			target = inst->Memory.Texture;
	}

	struct ac_image_args args = {};
	args.opcode = ac_image_get_resinfo;
	args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
	args.dmask = 0xf;
	args.attributes = AC_FUNC_ATTR_READNONE;

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
		/* TXQ takes an explicit LOD in src0.x. */
		tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL);
		args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
	} else {
		/* RESQ has no LOD; images always query level 0. */
		image_fetch_rsrc(bld_base, reg, false, target, &args.resource);
		args.lod = ctx->i32_0;
	}

	emit_data->output[emit_data->chan] =
		fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args));
}
1054
1055/**
1056 * Load an image view, fmask view. or sampler state descriptor.
1057 */
1058LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
1059				  LLVMValueRef list, LLVMValueRef index,
1060				  enum ac_descriptor_type type)
1061{
1062	LLVMBuilderRef builder = ctx->ac.builder;
1063
1064	switch (type) {
1065	case AC_DESC_IMAGE:
1066		/* The image is at [0:7]. */
1067		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1068		break;
1069	case AC_DESC_BUFFER:
1070		/* The buffer is in [4:7]. */
1071		index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1072				      ctx->i32_1);
1073		list = LLVMBuildPointerCast(builder, list,
1074					    ac_array_in_const32_addr_space(ctx->v4i32), "");
1075		break;
1076	case AC_DESC_FMASK:
1077		/* The FMASK is at [8:15]. */
1078		index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
1079				      ctx->i32_1);
1080		break;
1081	case AC_DESC_SAMPLER:
1082		/* The sampler state is at [12:15]. */
1083		index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1084				      LLVMConstInt(ctx->i32, 3, 0));
1085		list = LLVMBuildPointerCast(builder, list,
1086					    ac_array_in_const32_addr_space(ctx->v4i32), "");
1087		break;
1088	case AC_DESC_PLANE_0:
1089	case AC_DESC_PLANE_1:
1090	case AC_DESC_PLANE_2:
1091		/* Only used for the multiplane image support for Vulkan. Should
1092		 * never be reached in radeonsi.
1093		 */
1094		unreachable("Plane descriptor requested in radeonsi.");
1095	}
1096
1097	return ac_build_load_to_sgpr(&ctx->ac, list, index);
1098}
1099
1100/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
1101 *
1102 * SI-CI:
1103 *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
1104 *   filtering manually. The driver sets img7 to a mask clearing
1105 *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
1106 *     s_and_b32 samp0, samp0, img7
1107 *
1108 * VI:
1109 *   The ANISO_OVERRIDE sampler field enables this fix in TA.
1110 */
1111static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
1112					   LLVMValueRef res, LLVMValueRef samp)
1113{
1114	LLVMValueRef img7, samp0;
1115
1116	if (ctx->screen->info.chip_class >= VI)
1117		return samp;
1118
1119	img7 = LLVMBuildExtractElement(ctx->ac.builder, res,
1120				       LLVMConstInt(ctx->i32, 7, 0), "");
1121	samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp,
1122					ctx->i32_0, "");
1123	samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, "");
1124	return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0,
1125				      ctx->i32_0, "");
1126}
1127
/* Fetch the resource, sampler-state, and (for MSAA targets) FMASK
 * descriptors for a texture instruction.
 *
 * The texture/sampler register is always the last source operand. For
 * bindless handles (non-SAMPLER file) the descriptor list comes from a
 * different user SGPR pair. *samp_ptr/*fmask_ptr are set to NULL when
 * not applicable to the target.
 */
static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
			   struct lp_build_emit_data *emit_data,
			   LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
			   LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	LLVMValueRef index;

	/* The sampler register is the last source of the instruction. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	reg = &emit_data->inst->Src[sampler_src];

	if (reg->Register.Indirect) {
		/* Clamp the dynamic index, then offset it past the image
		 * slots of the combined samplers+images list (presumably
		 * mirroring si_get_sampler_slot's layout — see below). */
		index = si_get_bounded_indirect_index(ctx,
						      &reg->Indirect,
						      reg->Register.Index,
						      ctx->num_samplers);
		index = LLVMBuildAdd(ctx->ac.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
	} else {
		index = LLVMConstInt(ctx->i32,
				     si_get_sampler_slot(reg->Register.Index), 0);
	}

	if (reg->Register.File != TGSI_FILE_SAMPLER) {
		/* Bindless descriptors are accessible from a different pair of
		 * user SGPR indices.
		 */
		list = LLVMGetParam(ctx->main_fn,
				    ctx->param_bindless_samplers_and_images);
		index = lp_build_emit_fetch_src(bld_base, reg,
						TGSI_TYPE_UNSIGNED, 0);

		/* Since bindless handle arithmetic can contain an unsigned integer
		 * wraparound and si_load_sampler_desc assumes there isn't any,
		 * use GEP without "inbounds" (inside ac_build_pointer_add)
		 * to prevent incorrect code generation and hangs.
		 */
		index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		list = ac_build_pointer_add(&ctx->ac, list, index);
		index = ctx->i32_0;
	}

	/* Buffer textures use a 4-dword buffer descriptor instead of the
	 * 8-dword image descriptor. */
	if (target == TGSI_TEXTURE_BUFFER)
		*res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
	else
		*res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);

	if (samp_ptr)
		*samp_ptr = NULL;
	if (fmask_ptr)
		*fmask_ptr = NULL;

	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		/* MSAA targets need the FMASK but no sampler state. */
		if (fmask_ptr)
			*fmask_ptr = si_load_sampler_desc(ctx, list, index,
						          AC_DESC_FMASK);
	} else if (target != TGSI_TEXTURE_BUFFER) {
		if (samp_ptr) {
			*samp_ptr = si_load_sampler_desc(ctx, list, index,
						         AC_DESC_SAMPLER);
			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
		}
	}
}
1198
/* Gather4 should follow the same rules as bilinear filtering, but the hardware
 * incorrectly forces nearest filtering if the texture format is integer.
 * The only effect it has on Gather4, which always returns 4 texels for
 * bilinear filtering, is that the final coordinates are off by 0.5 of
 * the texel size.
 *
 * The workaround is to subtract 0.5 from the unnormalized coordinates,
 * or (0.5 / size) from the normalized coordinates.
 *
 * However, cube textures with 8_8_8_8 data formats require a different
 * workaround of overriding the num format to USCALED/SSCALED. This would lose
 * precision in 32-bit data formats, so it needs to be applied dynamically at
 * runtime. In this case, return an i1 value that indicates whether the
 * descriptor was overridden (and hence a fixup of the sampler result is needed).
 */
static LLVMValueRef
si_lower_gather4_integer(struct si_shader_context *ctx,
			 struct ac_image_args *args,
			 unsigned target,
			 enum tgsi_return_type return_type)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef wa_8888 = NULL;
	LLVMValueRef half_texel[2];

	assert(return_type == TGSI_RETURN_TYPE_SINT ||
	       return_type == TGSI_RETURN_TYPE_UINT);

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY) {
		LLVMValueRef formats;
		LLVMValueRef data_format;
		LLVMValueRef wa_formats;

		/* Extract DATA_FORMAT (6 bits starting at bit 20) from
		 * dword 1 of the image descriptor. */
		formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");

		data_format = LLVMBuildLShr(builder, formats,
					    LLVMConstInt(ctx->i32, 20, false), "");
		data_format = LLVMBuildAnd(builder, data_format,
					   LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
		wa_8888 = LLVMBuildICmp(
			builder, LLVMIntEQ, data_format,
			LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
			"");

		/* For 8_8_8_8, override NUM_FORMAT to USCALED/SSCALED
		 * depending on the signedness of the return type. */
		uint32_t wa_num_format =
			return_type == TGSI_RETURN_TYPE_UINT ?
			S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_USCALED) :
			S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_SSCALED);
		wa_formats = LLVMBuildAnd(builder, formats,
					  LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false),
					  "");
		wa_formats = LLVMBuildOr(builder, wa_formats,
					LLVMConstInt(ctx->i32, wa_num_format, false), "");

		/* Patch the descriptor dynamically only where needed. */
		formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
		args->resource = LLVMBuildInsertElement(
			builder, args->resource, formats, ctx->i32_1, "");
	}

	if (target == TGSI_TEXTURE_RECT ||
	    target == TGSI_TEXTURE_SHADOWRECT) {
		/* RECT coordinates are unnormalized: the offset is a
		 * constant -0.5 texel (see the comment above). */
		assert(!wa_8888);
		half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
	} else {
		struct ac_image_args resinfo = {};
		struct lp_build_if_state if_ctx;

		if (wa_8888) {
			/* Skip the texture size query entirely if we don't need it. */
			lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
		}

		/* Query the texture size. */
		resinfo.opcode = ac_image_get_resinfo;
		resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
		resinfo.resource = args->resource;
		resinfo.sampler = args->sampler;
		resinfo.lod = ctx->ac.i32_0;
		resinfo.dmask = 0xf;
		resinfo.attributes = AC_FUNC_ATTR_READNONE;

		LLVMValueRef texsize =
			fix_resinfo(ctx, target,
				    ac_build_image_opcode(&ctx->ac, &resinfo));

		/* Compute -0.5 / size. */
		for (unsigned c = 0; c < 2; c++) {
			half_texel[c] =
				LLVMBuildExtractElement(builder, texsize,
							LLVMConstInt(ctx->i32, c, 0), "");
			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
			half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
						      LLVMConstReal(ctx->f32, -0.5), "");
		}

		if (wa_8888) {
			lp_build_endif(&if_ctx);

			/* Merge -0.5/size with 0.0 (the descriptor-override
			 * path applies no coordinate offset). */
			LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };

			for (unsigned c = 0; c < 2; c++) {
				LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
				half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
							     values, bb);
			}
		}
	}

	/* Apply the coordinate offset to x and y. */
	for (unsigned c = 0; c < 2; c++) {
		LLVMValueRef tmp;
		tmp = ac_to_float(&ctx->ac, args->coords[c]);
		tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
		args->coords[c] = ac_to_integer(&ctx->ac, tmp);
	}

	return wa_8888;
}
1318
1319/* The second half of the cube texture 8_8_8_8 integer workaround: adjust the
1320 * result after the gather operation.
1321 */
1322static LLVMValueRef
1323si_fix_gather4_integer_result(struct si_shader_context *ctx,
1324			   LLVMValueRef result,
1325			   enum tgsi_return_type return_type,
1326			   LLVMValueRef wa)
1327{
1328	LLVMBuilderRef builder = ctx->ac.builder;
1329
1330	assert(return_type == TGSI_RETURN_TYPE_SINT ||
1331	       return_type == TGSI_RETURN_TYPE_UINT);
1332
1333	for (unsigned chan = 0; chan < 4; ++chan) {
1334		LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false);
1335		LLVMValueRef value;
1336		LLVMValueRef wa_value;
1337
1338		value = LLVMBuildExtractElement(builder, result, chanv, "");
1339
1340		if (return_type == TGSI_RETURN_TYPE_UINT)
1341			wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, "");
1342		else
1343			wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, "");
1344		wa_value = ac_to_float(&ctx->ac, wa_value);
1345		value = LLVMBuildSelect(builder, wa, wa_value, value, "");
1346
1347		result = LLVMBuildInsertElement(builder, result, value, chanv, "");
1348	}
1349
1350	return result;
1351}
1352
/* Emit a TGSI texture opcode (TEX, TXB, TXL, TXD, TXF, TG4, LODQ, ...) by
 * packing its operands into ac_image_args and emitting an image intrinsic.
 * Buffer textures are handled separately with a typed buffer load.
 */
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	struct ac_image_args args = {};
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned chan;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef fmask_ptr = NULL;

	tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr);

	/* Buffer textures: no sampling, just a format-converting buffer load
	 * indexed by src0.x, expanded to a vec4. */
	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
		unsigned num_channels =
			util_last_bit(inst->Dst[0].Register.WriteMask);
		LLVMValueRef result =
			ac_build_buffer_load_format(&ctx->ac,
						    args.resource,
						    vindex,
						    ctx->i32_0,
						    num_channels, false, true);
		emit_data->output[emit_data->chan] =
			ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
		return;
	}

	/* Fetch and project texture coordinates */
	args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++) {
		args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
		if (opcode == TGSI_OPCODE_TXP)
			args.coords[chan] = ac_build_fdiv(&ctx->ac,
				args.coords[chan], args.coords[3]);
	}

	/* After projection, w is 1.0 by definition. */
	if (opcode == TGSI_OPCODE_TXP)
		args.coords[3] = ctx->ac.f32_1;

	/* Pack offsets. */
	if (has_offset &&
	    opcode != TGSI_OPCODE_TXF &&
	    opcode != TGSI_OPCODE_TXF_LZ) {
		/* The offsets are six-bit signed integers packed like this:
		 *   X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan);
			offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
						    LLVMConstInt(ctx->i32, 0x3f, 0), "");
			if (chan)
				offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
							    LLVMConstInt(ctx->i32, chan*8, 0), "");
		}

		pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
		args.offset = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		args.bias = args.coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		LLVMValueRef z;

		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Cube shadow arrays carry the reference in src1.x. */
			z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			z = args.coords[ref_pos];
		}

		/* Section 8.23.1 (Depth Texture Comparison Mode) of the
		 * OpenGL 4.5 spec says:
		 *
		 *    "If the texture’s internal format indicates a fixed-point
		 *     depth texture, then D_t and D_ref are clamped to the
		 *     range [0, 1]; otherwise no clamping is performed."
		 *
		 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
		 * so the depth comparison value isn't clamped for Z16 and
		 * Z24 anymore. Do it manually here.
		 */
		if (ctx->screen->info.chip_class >= VI) {
			LLVMValueRef upgraded;
			LLVMValueRef clamped;
			/* Bit 29 of sampler dword 3 flags an "upgraded"
			 * depth format that needs the manual clamp. */
			upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
							   LLVMConstInt(ctx->i32, 3, false), "");
			upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded,
						 LLVMConstInt(ctx->i32, 29, false), "");
			upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, "");
			clamped = ac_build_clamp(&ctx->ac, z);
			z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, "");
		}

		args.compare = z;
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels, num_dst_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_dst_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_dst_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_dst_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;

			/* 1D textures are allocated and used as 2D on GFX9. */
			if (ctx->screen->info.chip_class >= GFX9) {
				num_dst_deriv_channels = 2;
			} else {
				num_dst_deriv_channels = 1;
			}
			break;
		default:
			unreachable("invalid target");
		}

		for (param = 0; param < 2; param++) {
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				args.derivs[param * num_dst_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);

			/* Fill in the rest with zeros. */
			for (chan = num_src_deriv_channels;
			     chan < num_dst_deriv_channels; chan++)
				args.derivs[param * num_dst_deriv_channels + chan] =
					ctx->ac.f32_0;
		}
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		ac_prepare_cube_coords(&ctx->ac,
				       opcode == TGSI_OPCODE_TXD,
				       target == TGSI_TEXTURE_CUBE_ARRAY ||
				       target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
				       opcode == TGSI_OPCODE_LODQ,
				       args.coords, args.derivs);
	} else if (tgsi_is_array_sampler(target) &&
		   opcode != TGSI_OPCODE_TXF &&
		   opcode != TGSI_OPCODE_TXF_LZ &&
		   ctx->screen->info.chip_class <= VI) {
		/* On <= VI the array layer coordinate must be rounded
		 * explicitly (except for integer-coordinate fetches). */
		unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2;
		args.coords[array_coord] = ac_build_round(&ctx->ac, args.coords[array_coord]);
	}

	/* 1D textures are allocated and used as 2D on GFX9. */
	if (ctx->screen->info.chip_class >= GFX9) {
		LLVMValueRef filler;

		/* Use 0.5, so that we don't sample the border color. */
		if (opcode == TGSI_OPCODE_TXF ||
		    opcode == TGSI_OPCODE_TXF_LZ)
			filler = ctx->i32_0;
		else
			filler = LLVMConstReal(ctx->f32, 0.5);

		if (target == TGSI_TEXTURE_1D ||
		    target == TGSI_TEXTURE_SHADOW1D) {
			args.coords[1] = filler;
		} else if (target == TGSI_TEXTURE_1D_ARRAY ||
			   target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
			/* Move the layer to slot 2 and fill slot 1. */
			args.coords[2] = args.coords[1];
			args.coords[1] = filler;
		}
	}

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL)
		args.lod = args.coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
	else if (opcode == TGSI_OPCODE_TXF) {
		if (target == TGSI_TEXTURE_2D_MSAA) {
			/* No LOD, but move sample index into the right place. */
			args.coords[2] = args.coords[3];
		} else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
			args.lod = args.coords[3];
		}
	}

	/* Translate the sample index through FMASK for MSAA targets. */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords,
					 target == TGSI_TEXTURE_2D_ARRAY_MSAA);
	}

	/* For integer-coordinate fetches, fold constant texel offsets
	 * directly into the coordinates. */
	if (opcode == TGSI_OPCODE_TXF ||
	    opcode == TGSI_OPCODE_TXF_LZ) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			switch (target) {
			case TGSI_TEXTURE_3D:
				args.coords[2] =
					LLVMBuildAdd(ctx->ac.builder, args.coords[2],
						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				args.coords[1] =
					LLVMBuildAdd(ctx->ac.builder, args.coords[1],
						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				args.coords[0] =
					LLVMBuildAdd(ctx->ac.builder, args.coords[0],
						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		args.dmask = 1 << gather_comp;
	} else {
		args.dmask = 0xf;
	}

	args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
	args.unorm = target == TGSI_TEXTURE_RECT ||
		     target == TGSI_TEXTURE_SHADOWRECT;
	args.opcode = ac_image_sample;

	/* Select the image opcode and LOD behavior per TGSI opcode. */
	switch (opcode) {
	case TGSI_OPCODE_TXF:
	case TGSI_OPCODE_TXF_LZ:
		args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
			      target == TGSI_TEXTURE_2D_MSAA ||
			      target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
				      ac_image_load : ac_image_load_mip;
		break;
	case TGSI_OPCODE_LODQ:
		args.opcode = ac_image_get_lod;
		break;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TEX2:
	case TGSI_OPCODE_TXP:
		/* Implicit derivatives only exist in fragment shaders;
		 * elsewhere sample level 0. */
		if (ctx->type != PIPE_SHADER_FRAGMENT)
			args.level_zero = true;
		break;
	case TGSI_OPCODE_TEX_LZ:
		args.level_zero = true;
		break;
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
		assert(ctx->type == PIPE_SHADER_FRAGMENT);
		break;
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXL2:
		break;
	case TGSI_OPCODE_TXD:
		break;
	case TGSI_OPCODE_TG4:
		args.opcode = ac_image_gather4;
		args.level_zero = true;
		break;
	default:
		assert(0);
		return;
	}

	/* The hardware needs special lowering for Gather4 with integer formats. */
	LLVMValueRef gather4_int_result_workaround = NULL;

	if (ctx->screen->info.chip_class <= VI &&
	    opcode == TGSI_OPCODE_TG4) {
		assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);

		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
			gather4_int_result_workaround =
				si_lower_gather4_integer(ctx, &args, target,
							 inst->Texture.ReturnType);
		}
	}

	args.attributes = AC_FUNC_ATTR_READNONE;
	LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);

	if (gather4_int_result_workaround) {
		result = si_fix_gather4_integer_result(ctx, result,
						       inst->Texture.ReturnType,
						       gather4_int_result_workaround);
	}

	emit_data->output[emit_data->chan] = result;
}
1713
1714static void si_llvm_emit_txqs(
1715	const struct lp_build_tgsi_action *action,
1716	struct lp_build_tgsi_context *bld_base,
1717	struct lp_build_emit_data *emit_data)
1718{
1719	struct si_shader_context *ctx = si_shader_context(bld_base);
1720	LLVMValueRef res, samples;
1721	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
1722
1723	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
1724
1725	/* Read the samples from the descriptor directly. */
1726	res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->v8i32, "");
1727	samples = LLVMBuildExtractElement(ctx->ac.builder, res,
1728					  LLVMConstInt(ctx->i32, 3, 0), "");
1729	samples = LLVMBuildLShr(ctx->ac.builder, samples,
1730				LLVMConstInt(ctx->i32, 16, 0), "");
1731	samples = LLVMBuildAnd(ctx->ac.builder, samples,
1732			       LLVMConstInt(ctx->i32, 0xf, 0), "");
1733	samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1,
1734			       samples, "");
1735
1736	emit_data->output[emit_data->chan] = samples;
1737}
1738
1739static void si_llvm_emit_fbfetch(const struct lp_build_tgsi_action *action,
1740				 struct lp_build_tgsi_context *bld_base,
1741				 struct lp_build_emit_data *emit_data)
1742{
1743	struct si_shader_context *ctx = si_shader_context(bld_base);
1744	struct ac_image_args args = {};
1745	LLVMValueRef ptr, image, fmask;
1746
1747	/* Ignore src0, because KHR_blend_func_extended disallows multiple render
1748	 * targets.
1749	 */
1750
1751	/* Load the image descriptor. */
1752	STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
1753	ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1754	ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
1755				   ac_array_in_const32_addr_space(ctx->v8i32), "");
1756	image = ac_build_load_to_sgpr(&ctx->ac, ptr,
1757			LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
1758
1759	unsigned chan = 0;
1760
1761	args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
1762
1763	if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1764		args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
1765
1766	/* Get the current render target layer index. */
1767	if (ctx->shader->key.mono.u.ps.fbfetch_layered)
1768		args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
1769
1770	if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1771		args.coords[chan++] = si_get_sample_id(ctx);
1772
1773	if (ctx->shader->key.mono.u.ps.fbfetch_msaa) {
1774		fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
1775			LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
1776
1777		ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
1778					 ctx->shader->key.mono.u.ps.fbfetch_layered);
1779	}
1780
1781	args.opcode = ac_image_load;
1782	args.resource = image;
1783	args.dmask = 0xf;
1784	args.attributes = AC_FUNC_ATTR_READNONE;
1785
1786	if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1787		args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1788			ac_image_2darraymsaa : ac_image_2dmsaa;
1789	else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1790		args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1791			ac_image_1darray : ac_image_1d;
1792	else
1793		args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1794			ac_image_2darray : ac_image_2d;
1795
1796	emit_data->output[emit_data->chan] =
1797		ac_build_image_opcode(&ctx->ac, &args);
1798}
1799
1800/**
1801 * Setup actions for TGSI memory opcode, including texture opcodes.
1802 */
1803void si_shader_context_init_mem(struct si_shader_context *ctx)
1804{
1805	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1806
1807	bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic;
1808	bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic;
1809	bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic;
1810	bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic;
1811	bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic;
1812	bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic;
1813	bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic;
1814	bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic;
1815	bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic;
1816	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic;
1817	bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic;
1818	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit;
1819	bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic;
1820	bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic;
1821	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
1822
1823	bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_llvm_emit_fbfetch;
1824
1825	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
1826	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
1827	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
1828
1829	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
1830	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
1831	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
1832	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
1833	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
1834	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
1835	bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
1836	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
1837	bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
1838	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
1839	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
1840	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
1841	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
1842	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
1843	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
1844	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
1845	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
1846	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
1847	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
1848	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
1849}
1850