r600_shader.c revision 01e04c3f
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_formats.h"
25#include "r600_opcodes.h"
26#include "r600_shader.h"
27#include "r600d.h"
28
29#include "sb/sb_public.h"
30
31#include "pipe/p_shader_tokens.h"
32#include "tgsi/tgsi_info.h"
33#include "tgsi/tgsi_parse.h"
34#include "tgsi/tgsi_scan.h"
35#include "tgsi/tgsi_dump.h"
36#include "util/u_bitcast.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63/* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66      .y = RelVertexID (??)
67      .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70      r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73       .y = RelPatchID (??)
74       .z = InvocationID
75       .w = tess factor base.
76
77 TES - .x = TessCoord.x
78     - .y = TessCoord.y
79     - .z = RelPatchID (??)
80     - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83      face_gpr.w = SampleID
84*/
85#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86static int r600_shader_from_tgsi(struct r600_context *rctx,
87				 struct r600_pipe_shader *pipeshader,
88				 union r600_shader_key key);
89
90static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91                           int size, unsigned comp_mask) {
92
93	if (!size)
94		return;
95
96	if (ps->num_arrays == ps->max_arrays) {
97		ps->max_arrays += 64;
98		ps->arrays = realloc(ps->arrays, ps->max_arrays *
99		                     sizeof(struct r600_shader_array));
100	}
101
102	int n = ps->num_arrays;
103	++ps->num_arrays;
104
105	ps->arrays[n].comp_mask = comp_mask;
106	ps->arrays[n].gpr_start = start_gpr;
107	ps->arrays[n].gpr_count = size;
108}
109
110static void r600_dump_streamout(struct pipe_stream_output_info *so)
111{
112	unsigned i;
113
114	fprintf(stderr, "STREAMOUT\n");
115	for (i = 0; i < so->num_outputs; i++) {
116		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117				so->output[i].start_component;
118		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119			i,
120			so->output[i].stream,
121			so->output[i].output_buffer,
122			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123			so->output[i].register_index,
124			mask & 1 ? "x" : "",
125		        mask & 2 ? "y" : "",
126		        mask & 4 ? "z" : "",
127		        mask & 8 ? "w" : "",
128			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129	}
130}
131
/* Upload the built bytecode into a GPU buffer object (shader->bo).
 * No-op if the buffer already exists. Returns 0 on success or -ENOMEM
 * if the buffer allocation fails. */
static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
		if (R600_BIG_ENDIAN) {
			/* byte-swap each dword on big-endian hosts */
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(shader->bo->buf);
	}

	return 0;
}
157
/* Compile a TGSI shader into r600 bytecode and build its hardware state.
 *
 * Steps: translate TGSI to bytecode, optionally run the SB optimizing
 * backend, upload the bytecode to a buffer object (including the GS copy
 * shader, if any) and emit the per-stage register state.
 *
 * Returns 0 on success or a negative errno; on failure the shader is
 * destroyed before returning. */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* the SB backend is not used for these stages */
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* likewise for atomics, images and helper-invocation reads */
	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Disassemble either through SB (when SB runs or DBG_SB_DISASM is
	 * set) or with the plain bytecode disassembler. */
	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		/* a GS always comes with a copy shader that feeds the VS path */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
298
/* Release everything owned by a compiled shader: the GPU buffer object,
 * the bytecode storage and the command buffer. The pipe context argument
 * is unused. */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
305
306/*
307 * tgsi -> r600 shader
308 */
309struct r600_shader_tgsi_instruction;
310
/* Decoded TGSI source operand: register select, per-channel swizzle,
 * modifiers, constant-cache addressing and per-channel literal values. */
struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;	/* negate modifier */
	unsigned				abs;	/* absolute-value modifier */
	unsigned				rel;	/* relative (indirect) addressing */
	unsigned				kc_bank;	/* constant-cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];	/* literal values, one per channel */
};
321
/* Per-interpolator state on evergreen: whether the interpolator is used
 * and which ij index was assigned to it. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
326
/* Transient state for one TGSI -> r600 bytecode translation run. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_array_info			*array_infos;
	/* flag for each tgsi temp array if its been spilled or not */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;	/* PIPE_SHADER_* stage being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT];	/* GPR base per TGSI register file */
	unsigned				temp_reg;	/* first driver-reserved temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];	/* decoded sources of the current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int					helper_invoc_reg;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;	/* running GS input ring offset */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess input offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
};
371
/* One TGSI opcode table entry: the r600 op it maps to and the callback
 * that emits bytecode for it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
376
377static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
378static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
379static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
380static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
381static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
382static int tgsi_else(struct r600_shader_ctx *ctx);
383static int tgsi_endif(struct r600_shader_ctx *ctx);
384static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
385static int tgsi_endloop(struct r600_shader_ctx *ctx);
386static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
387static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
388                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
389                                unsigned int dst_reg);
390static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
391			const struct r600_shader_src *shader_src,
392			unsigned chan);
393static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
394			       unsigned dst_reg, unsigned mask);
395
396static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
397{
398	if (ctx->bc->family == CHIP_HEMLOCK ||
399	    ctx->bc->family == CHIP_CYPRESS ||
400	    ctx->bc->family == CHIP_JUNIPER)
401		return false;
402	return true;
403}
404
/* Return the index of the highest channel set in a 4-bit writemask,
 * or 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
416
/* Reject TGSI instructions this backend cannot translate: multiple
 * destinations (except DFRACEXP), and two-dimensional register accesses
 * outside the files/stages that support them.
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
		   case TGSI_FILE_HW_ATOMIC:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == PIPE_SHADER_GEOMETRY ||
			       ctx->type == PIPE_SHADER_TESS_CTRL ||
			       ctx->type == PIPE_SHADER_TESS_EVAL)
				   break;
			   /* fallthrough: dimensioned inputs of other stages are
			    * unsupported */
		   case TGSI_FILE_OUTPUT:
			   if (ctx->type == PIPE_SHADER_TESS_CTRL)
				   break;
			   /* fallthrough */
		   default:
			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
				    i->Src[j].Register.File,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* TCS outputs may legitimately be 2D (per-vertex) */
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
464
465int eg_get_interpolator_index(unsigned interpolate, unsigned location)
466{
467	if (interpolate == TGSI_INTERPOLATE_COLOR ||
468		interpolate == TGSI_INTERPOLATE_LINEAR ||
469		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
470	{
471		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
472		int loc;
473
474		switch(location) {
475		case TGSI_INTERPOLATE_LOC_CENTER:
476			loc = 1;
477			break;
478		case TGSI_INTERPOLATE_LOC_CENTROID:
479			loc = 2;
480			break;
481		case TGSI_INTERPOLATE_LOC_SAMPLE:
482		default:
483			loc = 0; break;
484		}
485
486		return is_linear * 3 + loc;
487	}
488
489	return -1;
490}
491
/* Copy the ij index previously assigned to this input's
 * (interpolate, location) pair out of the eg_interpolators[] table. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
501
/* Emit the INTERP_ZW / INTERP_XY sequence that interpolates one input on
 * evergreen: eight ALU slots in two groups of four, where only slots
 * 2..5 write destination components. src0 alternates between the two
 * channels holding the ij pair; src1 is the parameter-cache entry for
 * this input. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* first group computes ZW, second computes XY */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only the middle four slots produce a result */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;	/* close each group of four */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
542
543static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
544{
545	int i, r;
546	struct r600_bytecode_alu alu;
547
548	for (i = 0; i < 4; i++) {
549		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
550
551		alu.op = ALU_OP1_INTERP_LOAD_P0;
552
553		alu.dst.sel = ctx->shader->input[input].gpr;
554		alu.dst.write = 1;
555
556		alu.dst.chan = i;
557
558		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
559		alu.src[0].chan = i;
560
561		if (i == 3)
562			alu.last = 1;
563		r = r600_bytecode_add_alu(ctx->bc, &alu);
564		if (r)
565			return r;
566	}
567	return 0;
568}
569
570/*
571 * Special export handling in shaders
572 *
573 * shader export ARRAY_BASE for EXPORT_POS:
574 * 60 is position
575 * 61 is misc vector
576 * 62, 63 are clip distance vectors
577 *
578 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
579 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
580 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
581 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
582 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
583 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
584 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
585 * exclusive from render target index)
586 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
587 *
588 *
589 * shader export ARRAY_BASE for EXPORT_PIXEL:
590 * 0-7 CB targets
591 * 61 computed Z vector
592 *
593 * The use of the values exported in the computed Z vector are controlled
594 * by DB_SHADER_CONTROL:
595 * Z_EXPORT_ENABLE - Z as a float in RED
596 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
597 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
598 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
599 * DB_SOURCE_FORMAT - export control restrictions
600 *
601 */
602
603
604/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
605static int r600_spi_sid(struct r600_shader_io * io)
606{
607	int index, name = io->name;
608
609	/* These params are handled differently, they don't need
610	 * semantic indices, so we'll use 0 for them.
611	 */
612	if (name == TGSI_SEMANTIC_POSITION ||
613	    name == TGSI_SEMANTIC_PSIZE ||
614	    name == TGSI_SEMANTIC_EDGEFLAG ||
615	    name == TGSI_SEMANTIC_FACE ||
616	    name == TGSI_SEMANTIC_SAMPLEMASK)
617		index = 0;
618	else {
619		if (name == TGSI_SEMANTIC_GENERIC) {
620			/* For generic params simply use sid from tgsi */
621			index = io->sid;
622		} else {
623			/* For non-generic params - pack name and sid into 8 bits */
624			index = 0x80 | (name<<3) | (io->sid);
625		}
626
627		/* Make sure that all really used indices have nonzero value, so
628		 * we can just compare it to 0 later instead of comparing the name
629		 * with different values to detect special cases. */
630		index++;
631	}
632
633	return index;
634};
635
636/* we need this to get a common lds index for vs/tcs/tes input/outputs */
637int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
638{
639	switch (semantic_name) {
640	case TGSI_SEMANTIC_POSITION:
641		return 0;
642	case TGSI_SEMANTIC_PSIZE:
643		return 1;
644	case TGSI_SEMANTIC_CLIPDIST:
645		assert(index <= 1);
646		return 2 + index;
647	case TGSI_SEMANTIC_GENERIC:
648		if (index <= 63-4)
649			return 4 + index - 9;
650		else
651			/* same explanation as in the default statement,
652			 * the only user hitting this is st/nine.
653			 */
654			return 0;
655
656	/* patch indices are completely separate and thus start from 0 */
657	case TGSI_SEMANTIC_TESSOUTER:
658		return 0;
659	case TGSI_SEMANTIC_TESSINNER:
660		return 1;
661	case TGSI_SEMANTIC_PATCH:
662		return 2 + index;
663
664	default:
665		/* Don't fail here. The result of this function is only used
666		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
667		 * occur, but this function is called for all vertex shaders
668		 * before it's known whether LS will be compiled or not.
669		 */
670		return 0;
671	}
672}
673
674/* turn input into interpolate on EG */
675static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
676{
677	int r = 0;
678
679	if (ctx->shader->input[index].spi_sid) {
680		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
681		if (ctx->shader->input[index].interpolate > 0) {
682			evergreen_interp_assign_ij_index(ctx, index);
683			r = evergreen_interp_alu(ctx, index);
684		} else {
685			r = evergreen_interp_flat(ctx, index);
686		}
687	}
688	return r;
689}
690
/* Two-sided lighting: per component, conditionally select the front or
 * back color based on the face register (CNDGT: src0 > 0 picks src1,
 * else src2). The result overwrites the front color's GPR. */
static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}
719
/* execute a single slot ALU calculation: dst.chan = op(src0, src1).
 * When a source sel is V_SQ_ALU_SRC_LITERAL, the corresponding
 * *_chan_val argument carries the 32-bit literal; otherwise it names the
 * source channel. On Cayman, MULLO_INT is a t-slot-only op implemented
 * across the vector slots (see the CAYMAN notes at the top of this
 * file), so it is emitted in all four slots with only dst_chan's write
 * enabled. */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;	/* only the requested channel lands */
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* normal single-slot path */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
775
776/* execute a single slot ALU calculation */
777static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
778			  int dst_sel, int dst_chan,
779			  int src0_sel, unsigned src0_chan_val,
780			  int src1_sel, unsigned src1_chan_val,
781			  int src2_sel, unsigned src2_chan_val)
782{
783	struct r600_bytecode_alu alu;
784	int r;
785
786	/* validate this for other ops */
787	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
788	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
789	alu.op = op;
790	alu.src[0].sel = src0_sel;
791	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
792		alu.src[0].value = src0_chan_val;
793	else
794		alu.src[0].chan = src0_chan_val;
795	alu.src[1].sel = src1_sel;
796	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
797		alu.src[1].value = src1_chan_val;
798	else
799		alu.src[1].chan = src1_chan_val;
800	alu.src[2].sel = src2_sel;
801	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
802		alu.src[2].value = src2_chan_val;
803	else
804		alu.src[2].chan = src2_chan_val;
805	alu.dst.sel = dst_sel;
806	alu.dst.chan = dst_chan;
807	alu.is_op3 = 1;
808	alu.last = 1;
809	r = r600_bytecode_add_alu(ctx->bc, &alu);
810	if (r)
811		return r;
812	return 0;
813}
814
/* Compute the LDS base offset for the current patch and put it in
 * temp_reg.x:
 *   temp.x = stride (input_vals.x) * rel_patch_id + base offset,
 * where is_patch_var selects the per-patch data offset (input_vals.w)
 * over the per-vertex patch offset (input_vals.z). */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
836
837static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
838{
839	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
840}
841
842static int r600_get_temp(struct r600_shader_ctx *ctx)
843{
844	return ctx->temp_reg + ctx->max_driver_temp_used++;
845}
846
847static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
848{
849	int i;
850	i = ctx->shader->noutput++;
851	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
852	ctx->shader->output[i].sid = 0;
853	ctx->shader->output[i].gpr = 0;
854	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
855	ctx->shader->output[i].write_mask = 0x4;
856	ctx->shader->output[i].spi_sid = prim_id_sid;
857
858	return 0;
859}
860
861static int tgsi_barrier(struct r600_shader_ctx *ctx)
862{
863	struct r600_bytecode_alu alu;
864	int r;
865
866	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
867	alu.op = ctx->inst_info->op;
868	alu.last = 1;
869
870	r = r600_bytecode_add_alu(ctx->bc, &alu);
871	if (r)
872		return r;
873	return 0;
874}
875
/* Reduce GPR pressure by spilling temp arrays to scratch.
 *
 * Greedily marks the largest not-yet-spilled array as spilled, repeating
 * until the register count (*regno) drops to the 124 limit or no arrays
 * remain. *scratch_space_needed accumulates the total spilled size. If
 * every array got spilled, there is no indirectly-addressed temp left in
 * GPRs, so the indirect-file bit for temporaries is cleared. */
static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; // assumed calloc:ed

	*scratch_space_needed = 0;
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		/* find the biggest remaining array */
		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		narrays_left --;
	}

	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}
908
/* Take spilled temp arrays into account when translating tgsi register
 * indexes into r600 gprs if spilled is false, or scratch array offset if
 * spilled is true.
 *
 * Relies on ctx->array_infos[] being ordered by range.First (the early
 * break below assumes it); spilled arrays leave "holes" in the GPR
 * numbering that are subtracted from regular indices. */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	unsigned spilled_size = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		/* arrays are sorted; no later array can contain this index */
		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}
943
/* look up spill area base offset and array size for a spilled temp array.
 * NOTE(review): if tgsi_reg_index falls inside no spilled array, the
 * outputs are left unmodified — callers appear to only pass indices of
 * spilled arrays; verify at call sites. */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
	unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0;	/* running vec4 offset of spilled arrays before this one */

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
}
966
/* Process one TGSI declaration token: record inputs/outputs in the shader
 * struct, set up GPR arrays for indirectly-addressed temporaries, register
 * hardware atomic ranges, and emit fetch code for some system values.
 *
 * Returns 0 on success, a negative error code on failure (unsupported
 * declaration file or bytecode emission error).
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	/* count covers declarations spanning a register range */
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		/* record one shader input per register in the declared range */
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* Evergreen needs explicit interpolation instructions per input */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		/* record one shader output per register in the declared range */
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* track which misc outputs this VS-like stage writes */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* only indirectly-addressed temp arrays need special handling;
		 * spilled arrays live in scratch memory instead of a GPR range */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		/* nothing to record at declaration time for these files */
		break;

	case TGSI_FILE_HW_ATOMIC:
		/* record the atomic counter range and its hardware slot */
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* tess factors live in LDS; fetch them now into a fixed
			 * GPR (3 for INNER, 2 for OUTER) */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			/* add the per-parameter byte offset within the patch data */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* Build the barycentric tess coord in R1 from the two
			 * hardware-provided coordinates in R0.xy:
			 * MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y  (third coord = 1 - x - y) */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1188
/* Scan the shader tokens for system values (SAMPLEMASK, SAMPLEID/SAMPLEPOS)
 * and interpolateAt* opcodes, then allocate input GPRs for the system values
 * that are needed, starting at gpr_offset (after the interpolator GPRs on
 * Evergreen+).
 *
 * Side effects: marks used eg_interpolators, appends entries to
 * ctx->shader->input[], and stores the allocated GPR numbers through
 * ctx->face_gpr / ctx->fixed_pt_position_gpr.
 *
 * Returns the first free GPR after the allocations.
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;               /* where to store the allocated gpr */
		unsigned name, alternate_name;  /* TGSI semantics that trigger this entry */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				/* map the interpolateAt* opcode to an interpolation location */
				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				/* enable the matching system-value input slot */
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* sample-rate interpolation of the sample mask needs the sample id too */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* each GPR holds two ij pairs, so round up to GPR count */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	/* allocate a GPR for each enabled system-value input */
	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1288
1289/*
1290 * for evergreen we need to scan the shader to find the number of GPRs we need to
1291 * reserve for interpolation and system values
1292 *
1293 * we need to know if we are going to emit any sample or centroid inputs
1294 * if perspective and linear are required
1295*/
1296static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1297{
1298	unsigned i;
1299
1300	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1301
1302	/*
1303	 * Could get this information from the shader info. But right now
1304	 * we interpolate all declared inputs, whereas the shader info will
1305	 * only contain the bits if the inputs are actually used, so it might
1306	 * not be safe...
1307	 */
1308	for (i = 0; i < ctx->info.num_inputs; i++) {
1309		int k;
1310		/* skip position/face/mask/sampleid */
1311		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1312		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1313		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1314		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1315			continue;
1316
1317		k = eg_get_interpolator_index(
1318			ctx->info.input_interpolate[i],
1319			ctx->info.input_interpolate_loc[i]);
1320		if (k >= 0)
1321			ctx->eg_interpolators[k].enabled = TRUE;
1322	}
1323
1324	/* XXX PULL MODEL and LINE STIPPLE */
1325
1326	return allocate_system_value_inputs(ctx, 0);
1327}
1328
1329/* sample_id_sel == NULL means fetch for current sample */
1330static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1331{
1332	struct r600_bytecode_vtx vtx;
1333	int r, t1;
1334
1335	t1 = r600_get_temp(ctx);
1336
1337	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1338	vtx.op = FETCH_OP_VFETCH;
1339	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1340	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1341	if (sample_id == NULL) {
1342		assert(ctx->fixed_pt_position_gpr != -1);
1343
1344		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1345		vtx.src_sel_x = 3;
1346	}
1347	else {
1348		struct r600_bytecode_alu alu;
1349
1350		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1351		alu.op = ALU_OP1_MOV;
1352		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1353		alu.dst.sel = t1;
1354		alu.dst.write = 1;
1355		alu.last = 1;
1356		r = r600_bytecode_add_alu(ctx->bc, &alu);
1357		if (r)
1358			return r;
1359
1360		vtx.src_gpr = t1;
1361		vtx.src_sel_x = 0;
1362	}
1363	vtx.mega_fetch_count = 16;
1364	vtx.dst_gpr = t1;
1365	vtx.dst_sel_x = 0;
1366	vtx.dst_sel_y = 1;
1367	vtx.dst_sel_z = 2;
1368	vtx.dst_sel_w = 3;
1369	vtx.data_format = FMT_32_32_32_32_FLOAT;
1370	vtx.num_format_all = 2;
1371	vtx.format_comp_all = 1;
1372	vtx.use_const_fields = 0;
1373	vtx.offset = 0;
1374	vtx.endian = r600_endian_swap(32);
1375	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1376
1377	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1378	if (r)
1379		return r;
1380
1381	return t1;
1382}
1383
1384static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
1385{
1386	int r;
1387	struct r600_bytecode_alu alu;
1388
1389	/* do a vtx fetch with wqm set on the vtx fetch */
1390	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1391	alu.op = ALU_OP1_MOV;
1392	alu.dst.sel = ctx->helper_invoc_reg;
1393	alu.dst.chan = 0;
1394	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1395	alu.src[0].value = 0xffffffff;
1396	alu.dst.write = 1;
1397	alu.last = 1;
1398	r = r600_bytecode_add_alu(ctx->bc, &alu);
1399	if (r)
1400		return r;
1401
1402	/* do a vtx fetch in VPM mode */
1403	struct r600_bytecode_vtx vtx;
1404	memset(&vtx, 0, sizeof(vtx));
1405	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
1406	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1407	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1408	vtx.src_gpr = 0;
1409	vtx.mega_fetch_count = 16; /* no idea here really... */
1410	vtx.dst_gpr = ctx->helper_invoc_reg;
1411	vtx.dst_sel_x = 4;
1412	vtx.dst_sel_y = 7;		/* SEL_Y */
1413	vtx.dst_sel_z = 7;		/* SEL_Z */
1414	vtx.dst_sel_w = 7;		/* SEL_W */
1415	vtx.data_format = FMT_32;
1416	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
1417		return r;
1418	ctx->bc->cf_last->vpm = 1;
1419	return 0;
1420}
1421
1422static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1423{
1424	int r;
1425	struct r600_bytecode_alu alu;
1426
1427	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1428	alu.op = ALU_OP1_MOV;
1429	alu.dst.sel = ctx->helper_invoc_reg;
1430	alu.dst.chan = 0;
1431	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1432	alu.src[0].value = 0xffffffff;
1433	alu.dst.write = 1;
1434	alu.last = 1;
1435	r = r600_bytecode_add_alu(ctx->bc, &alu);
1436	if (r)
1437		return r;
1438
1439	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1440	alu.op = ALU_OP1_MOV;
1441	alu.dst.sel = ctx->helper_invoc_reg;
1442	alu.dst.chan = 0;
1443	alu.src[0].sel = V_SQ_ALU_SRC_0;
1444	alu.dst.write = 1;
1445	alu.last = 1;
1446	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1447	if (r)
1448		return r;
1449
1450	return ctx->helper_invoc_reg;
1451}
1452
1453static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1454{
1455	struct r600_bytecode_vtx vtx;
1456	int r, t1;
1457
1458	if (ctx->cs_block_size_loaded)
1459		return ctx->cs_block_size_reg;
1460	if (ctx->cs_grid_size_loaded)
1461		return ctx->cs_grid_size_reg;
1462
1463	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1464	struct r600_bytecode_alu alu;
1465	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1466	alu.op = ALU_OP1_MOV;
1467	alu.src[0].sel = V_SQ_ALU_SRC_0;
1468	alu.dst.sel = t1;
1469	alu.dst.write = 1;
1470	alu.last = 1;
1471	r = r600_bytecode_add_alu(ctx->bc, &alu);
1472	if (r)
1473		return r;
1474
1475	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1476	vtx.op = FETCH_OP_VFETCH;
1477	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1478	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1479	vtx.src_gpr = t1;
1480	vtx.src_sel_x = 0;
1481
1482	vtx.mega_fetch_count = 16;
1483	vtx.dst_gpr = t1;
1484	vtx.dst_sel_x = 0;
1485	vtx.dst_sel_y = 1;
1486	vtx.dst_sel_z = 2;
1487	vtx.dst_sel_w = 7;
1488	vtx.data_format = FMT_32_32_32_32;
1489	vtx.num_format_all = 1;
1490	vtx.format_comp_all = 0;
1491	vtx.use_const_fields = 0;
1492	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1493	vtx.endian = r600_endian_swap(32);
1494	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1495
1496	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1497	if (r)
1498		return r;
1499
1500	if (load_block)
1501		ctx->cs_block_size_loaded = true;
1502	else
1503		ctx->cs_grid_size_loaded = true;
1504	return t1;
1505}
1506
/* Translate a TGSI source register into an r600_shader_src: resolve the
 * swizzle/negate/abs modifiers, map the register file to a GPR or inline
 * constant selector, and for spilled temp arrays emit the scratch-read
 * bytecode that materializes the value into a fresh temp GPR.
 *
 * NOTE(review): this function returns void, so bytecode emission failures
 * on the spill paths are silently dropped (see the `if (r) return;` below).
 */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			/* the temp lives in scratch memory: read it back into
			 * a fresh GPR before use */
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				/* pre-R700: scratch reads go through a
				 * CF_OP_MEM_SCRATCH export-read */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					/* indexed read: address comes from the AR register */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					/* direct read of a single element */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a vertex-fetch style
				 * FETCH_OP_READ_SCRATCH */
				struct r600_bytecode_vtx vtx;

				/* a pending WAIT_ACK must be flushed before reading
				 * back data spilled earlier in this invocation */
				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			if (r)
				return;
		}
		else {
			/* ordinary temp: direct GPR access, possibly AR-relative */
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* an immediate replicated across all channels may map to an
		 * inline hardware constant (0, 1, 0.5, ...) */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise emit the literal values directly */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* map each system value to the GPR/channel it was placed in */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TESS_CTRL variant of INVOCATIONID (the preceding branch
			 * excluded TESS_CTRL) */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* all other files: GPR index is file offset + register index */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer (kcache bank) index */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1719
1720static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1721                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1722                                unsigned int dst_reg)
1723{
1724	struct r600_bytecode_vtx vtx;
1725	unsigned int ar_reg;
1726	int r;
1727
1728	if (offset) {
1729		struct r600_bytecode_alu alu;
1730
1731		memset(&alu, 0, sizeof(alu));
1732
1733		alu.op = ALU_OP2_ADD_INT;
1734		alu.src[0].sel = ctx->bc->ar_reg;
1735		alu.src[0].chan = ar_chan;
1736
1737		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1738		alu.src[1].value = offset;
1739
1740		alu.dst.sel = dst_reg;
1741		alu.dst.chan = ar_chan;
1742		alu.dst.write = 1;
1743		alu.last = 1;
1744
1745		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1746			return r;
1747
1748		ar_reg = dst_reg;
1749	} else {
1750		ar_reg = ctx->bc->ar_reg;
1751	}
1752
1753	memset(&vtx, 0, sizeof(vtx));
1754	vtx.buffer_id = cb_idx;
1755	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1756	vtx.src_gpr = ar_reg;
1757	vtx.src_sel_x = ar_chan;
1758	vtx.mega_fetch_count = 16;
1759	vtx.dst_gpr = dst_reg;
1760	vtx.dst_sel_x = 0;		/* SEL_X */
1761	vtx.dst_sel_y = 1;		/* SEL_Y */
1762	vtx.dst_sel_z = 2;		/* SEL_Z */
1763	vtx.dst_sel_w = 3;		/* SEL_W */
1764	vtx.data_format = FMT_32_32_32_32_FLOAT;
1765	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1766	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1767	vtx.endian = r600_endian_swap(32);
1768	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1769
1770	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1771		return r;
1772
1773	return 0;
1774}
1775
/* Fetch a geometry-shader input vertex attribute from the ESGS ring into
 * dst_reg.xyzw, handling both direct and indirect addressing of the vertex
 * (Dimension) and of the attribute (Register).
 *
 * Returns 0 on success or a negative error code from bytecode emission.
 */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	/* the per-vertex ring offsets are packed two-per-GPR; see comment below */
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		/* indirect vertex index: gather the candidate ring offsets into a
		 * small GPR array and pick one via a relative register read */
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;	/* NOTE(review): shadows the outer r */
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* relative reads index off ar_reg; copy the address there */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;	/* skip R0.z (PrimitiveID) */
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* relative read: t2.x = treg[AR].x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		/* indirect attribute index: fold the attribute offset (16 bytes
		 * per attribute) into the ring offset */
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* t2.x = (addr + first) * 4 + ring_offset  (dwords -> ring addr) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1892
1893static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1894{
1895	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1896	unsigned i;
1897
1898	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1899		struct tgsi_full_src_register *src = &inst->Src[i];
1900
1901		if (src->Register.File == TGSI_FILE_INPUT) {
1902			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1903				/* primitive id is in R0.z */
1904				ctx->src[i].sel = 0;
1905				ctx->src[i].swizzle[0] = 2;
1906			}
1907		}
1908		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1909			int treg = r600_get_temp(ctx);
1910
1911			fetch_gs_input(ctx, src, treg);
1912			ctx->src[i].sel = treg;
1913			ctx->src[i].rel = 0;
1914		}
1915	}
1916	return 0;
1917}
1918
1919
1920/* Tessellation shaders pass outputs to the next shader using LDS.
1921 *
1922 * LS outputs = TCS(HS) inputs
1923 * TCS(HS) outputs = TES(DS) inputs
1924 *
1925 * The LDS layout is:
1926 * - TCS inputs for patch 0
1927 * - TCS inputs for patch 1
1928 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1929 * - ...
1930 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1931 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1932 * - TCS outputs for patch 1
1933 * - Per-patch TCS outputs for patch 1
1934 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1935 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1936 * - ...
1937 *
1938 * All three shaders VS(LS), TCS, TES share the same LDS space.
1939 */
/* this will return with the dw address in temp_reg.x */
/* Accumulate the LDS address of a tessellation input/output register
 * into temp_reg.x, which must already hold the base address of the
 * relevant LDS region (e.g. from get_lds_offset0()).
 *
 * Exactly one of dst/src is non-NULL and names the register being
 * addressed.  stride_bytes_reg/stride_bytes_chan select the ALU source
 * containing the per-vertex stride in bytes.
 * Returns 0 on success, negative on bytecode-emission failure. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			/* Constant vertex index: encode as an inline literal. */
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* Select the semantic tables that match the register file. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x += rel_index * 16 (each parameter slot is 16 bytes) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
2042
/* Read the components selected by mask from LDS into dst_reg.
 * On entry temp_reg.x holds the address of component 0; the other
 * component addresses are derived from it (+4 per 32-bit component). */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* Force a fresh CF clause when the current ALU clause is already
	 * large, presumably so the LDS reads and their matching OQ_A pops
	 * below stay in one clause -- NOTE(review): confirm the 0x60 slot
	 * threshold against the hardware clause size limit. */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	/* temp.chan = temp.x + 4 * chan for every selected channel > x. */
	lasti = tgsi_last_instruction(mask);
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	/* Issue one LDS read per selected channel. */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* Results come back in issue order through the LDS output queue;
	 * pop each one into the destination channel. */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2102
2103static int fetch_mask(struct tgsi_src_register *reg)
2104{
2105	int mask = 0;
2106	mask |= 1 << reg->SwizzleX;
2107	mask |= 1 << reg->SwizzleY;
2108	mask |= 1 << reg->SwizzleZ;
2109	mask |= 1 << reg->SwizzleW;
2110	return mask;
2111}
2112
2113static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2114{
2115	int r;
2116	unsigned temp_reg = r600_get_temp(ctx);
2117
2118	r = get_lds_offset0(ctx, 2, temp_reg,
2119			    src->Register.Dimension ? false : true);
2120	if (r)
2121		return r;
2122
2123	/* the base address is now in temp.x */
2124	r = r600_get_byte_address(ctx, temp_reg,
2125				  NULL, src, ctx->tess_output_info, 1);
2126	if (r)
2127		return r;
2128
2129	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2130	if (r)
2131		return r;
2132	return 0;
2133}
2134
2135static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2136{
2137	int r;
2138	unsigned temp_reg = r600_get_temp(ctx);
2139
2140	/* t.x = ips * r0.y */
2141	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2142			   temp_reg, 0,
2143			   ctx->tess_input_info, 0,
2144			   0, 1);
2145
2146	if (r)
2147		return r;
2148
2149	/* the base address is now in temp.x */
2150	r = r600_get_byte_address(ctx, temp_reg,
2151				  NULL, src, ctx->tess_input_info, 1);
2152	if (r)
2153		return r;
2154
2155	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2156	if (r)
2157		return r;
2158	return 0;
2159}
2160
2161static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2162{
2163	int r;
2164	unsigned temp_reg = r600_get_temp(ctx);
2165
2166	r = get_lds_offset0(ctx, 1, temp_reg,
2167			    src->Register.Dimension ? false : true);
2168	if (r)
2169		return r;
2170	/* the base address is now in temp.x */
2171	r = r600_get_byte_address(ctx, temp_reg,
2172				  NULL, src,
2173				  ctx->tess_output_info, 1);
2174	if (r)
2175		return r;
2176
2177	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2178	if (r)
2179		return r;
2180	return 0;
2181}
2182
2183static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2184{
2185	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2186	unsigned i;
2187
2188	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2189		struct tgsi_full_src_register *src = &inst->Src[i];
2190
2191		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2192			int treg = r600_get_temp(ctx);
2193			fetch_tes_input(ctx, src, treg);
2194			ctx->src[i].sel = treg;
2195			ctx->src[i].rel = 0;
2196		}
2197		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2198			int treg = r600_get_temp(ctx);
2199			fetch_tcs_input(ctx, src, treg);
2200			ctx->src[i].sel = treg;
2201			ctx->src[i].rel = 0;
2202		}
2203		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2204			int treg = r600_get_temp(ctx);
2205			fetch_tcs_output(ctx, src, treg);
2206			ctx->src[i].sel = treg;
2207			ctx->src[i].rel = 0;
2208		}
2209	}
2210	return 0;
2211}
2212
/* Translate all source operands (tgsi_src) and copy surplus
 * constant-file operands into temporaries, so that at most one
 * constant source remains on the instruction.  Relatively-indexed
 * constants are always resolved into a temp first. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* Count constant sources while translating every operand. */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts copies still needed; the final (j == 0) non-relative
	 * constant is allowed to stay in place. */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* Relative constant: fetch the addressed element
			 * (sel is biased by 512 for the constant file). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* Plain constant: MOV all four channels to a temp. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
2267
2268/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2269static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2270{
2271	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2272	struct r600_bytecode_alu alu;
2273	int i, j, k, nliteral, r;
2274
2275	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2276		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2277			nliteral++;
2278		}
2279	}
2280	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2281		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2282			int treg = r600_get_temp(ctx);
2283			for (k = 0; k < 4; k++) {
2284				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2285				alu.op = ALU_OP1_MOV;
2286				alu.src[0].sel = ctx->src[i].sel;
2287				alu.src[0].chan = k;
2288				alu.src[0].value = ctx->src[i].value[k];
2289				alu.dst.sel = treg;
2290				alu.dst.chan = k;
2291				alu.dst.write = 1;
2292				if (k == 3)
2293					alu.last = 1;
2294				r = r600_bytecode_add_alu(ctx->bc, &alu);
2295				if (r)
2296					return r;
2297			}
2298			ctx->src[i].sel = treg;
2299			j--;
2300		}
2301	}
2302	return 0;
2303}
2304
2305static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2306{
2307	int i, r, count = ctx->shader->ninput;
2308
2309	for (i = 0; i < count; i++) {
2310		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2311			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2312			if (r)
2313				return r;
2314		}
2315	}
2316	return 0;
2317}
2318
/* Emit the MEM_STREAM* CF instructions that write shader outputs to
 * the transform-feedback (stream-out) buffers.  stream == -1 writes
 * every output regardless of its stream id; otherwise only outputs
 * belonging to that stream are written.  Returns 0 or -EINVAL /
 * a bytecode-emission error. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* Shift the components down to start at channel 0. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			/* Evergreen+: per-stream/per-buffer CF opcodes; the
			 * STREAMn variants are 4 opcodes apart per stream. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* R600/R700: only stream 0, one opcode per buffer. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
					break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2440
2441static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2442{
2443	struct r600_bytecode_alu alu;
2444	unsigned reg;
2445
2446	if (!ctx->shader->vs_out_edgeflag)
2447		return;
2448
2449	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2450
2451	/* clamp(x, 0, 1) */
2452	memset(&alu, 0, sizeof(alu));
2453	alu.op = ALU_OP1_MOV;
2454	alu.src[0].sel = reg;
2455	alu.dst.sel = reg;
2456	alu.dst.write = 1;
2457	alu.dst.clamp = 1;
2458	alu.last = 1;
2459	r600_bytecode_add_alu(ctx->bc, &alu);
2460
2461	memset(&alu, 0, sizeof(alu));
2462	alu.op = ALU_OP1_FLT_TO_INT;
2463	alu.src[0].sel = reg;
2464	alu.dst.sel = reg;
2465	alu.dst.write = 1;
2466	alu.last = 1;
2467	r600_bytecode_add_alu(ctx->bc, &alu);
2468}
2469
/* Build the vertex "copy shader" that runs after a geometry shader:
 * it fetches each emitted vertex back from the GSVS ring buffer,
 * performs stream-out for the selected stream, and exports
 * position/parameters like a regular VS.  On entry R0.x carries the
 * ring read offset (low 30 bits) and bits 30-31 the stream id. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;	/* NOTE(review): returns success on OOM -- confirm intended */

	/* The copy shader exports exactly the GS's outputs. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff   (strip the stream id bits) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30   (extract the stream id) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;	/* one vec4 per output */

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* Emit one predicated stream-out section per ring, highest first;
	 * each section is guarded by "if (R0.y == ring)". */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			/* Close the previous ring's section: the prior JUMP
			 * targets just past this POP. */
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* Only stream-0 outputs are exported to the pipeline. */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* POS exports use array_base 60; 61+ carry the misc vector
		 * (psize/layer/viewport) and clip distances. */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The hardware requires at least one POS and one PARAM export;
	 * emit dummy (all-masked) ones if the loop produced none. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* The final export of each kind must be the _DONE variant. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* Close the last ring section and terminate the program. */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2780
2781static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2782{
2783	if (ind) {
2784		struct r600_bytecode_alu alu;
2785		int r;
2786
2787		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2788		alu.op = ALU_OP2_ADD_INT;
2789		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2790		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2791		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2792		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2793		alu.dst.write = 1;
2794		alu.last = 1;
2795		r = r600_bytecode_add_alu(ctx->bc, &alu);
2796		if (r)
2797			return r;
2798	}
2799	return 0;
2800}
2801
/* Write the current vertex's outputs to the GS output ring via
 * MEM_RING* CF instructions.  For an ES (VS feeding a GS) the ring
 * offsets are looked up from the GS's input layout; for a real GS
 * they are assigned sequentially.  'ind' selects indirect addressing
 * through the per-stream export offset register. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* Outputs the GS never reads are simply dropped. */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;	/* one vec4 per output */
			idx++;
		}

		/* Position is only written to stream 0. */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* One MEM_RING opcode per vertex stream. */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2872
2873
2874static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2875{
2876	int r;
2877	struct r600_bytecode_vtx vtx;
2878	int temp_val = ctx->temp_reg;
2879	/* need to store the TCS output somewhere */
2880	r = single_alu_op2(ctx, ALU_OP1_MOV,
2881			   temp_val, 0,
2882			   V_SQ_ALU_SRC_LITERAL, 0,
2883			   0, 0);
2884	if (r)
2885		return r;
2886
2887	/* used by VS/TCS */
2888	if (ctx->tess_input_info) {
2889		/* fetch tcs input values into resv space */
2890		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2891		vtx.op = FETCH_OP_VFETCH;
2892		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2893		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2894		vtx.mega_fetch_count = 16;
2895		vtx.data_format = FMT_32_32_32_32;
2896		vtx.num_format_all = 2;
2897		vtx.format_comp_all = 1;
2898		vtx.use_const_fields = 0;
2899		vtx.endian = r600_endian_swap(32);
2900		vtx.srf_mode_all = 1;
2901		vtx.offset = 0;
2902		vtx.dst_gpr = ctx->tess_input_info;
2903		vtx.dst_sel_x = 0;
2904		vtx.dst_sel_y = 1;
2905		vtx.dst_sel_z = 2;
2906		vtx.dst_sel_w = 3;
2907		vtx.src_gpr = temp_val;
2908		vtx.src_sel_x = 0;
2909
2910		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2911		if (r)
2912			return r;
2913	}
2914
2915	/* used by TCS/TES */
2916	if (ctx->tess_output_info) {
2917		/* fetch tcs output values into resv space */
2918		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2919		vtx.op = FETCH_OP_VFETCH;
2920		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2921		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2922		vtx.mega_fetch_count = 16;
2923		vtx.data_format = FMT_32_32_32_32;
2924		vtx.num_format_all = 2;
2925		vtx.format_comp_all = 1;
2926		vtx.use_const_fields = 0;
2927		vtx.endian = r600_endian_swap(32);
2928		vtx.srf_mode_all = 1;
2929		vtx.offset = 16;
2930		vtx.dst_gpr = ctx->tess_output_info;
2931		vtx.dst_sel_x = 0;
2932		vtx.dst_sel_y = 1;
2933		vtx.dst_sel_z = 2;
2934		vtx.dst_sel_w = 3;
2935		vtx.src_gpr = temp_val;
2936		vtx.src_sel_x = 0;
2937
2938		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2939		if (r)
2940			return r;
2941	}
2942	return 0;
2943}
2944
/* Write all LS/VS outputs into LDS so the TCS can read them as inputs.
 * Each output vec4 goes to stride * rel_id + param_index * 16, written
 * as two 64-bit LDS_WRITE_REL operations (channels xy then zw). */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base + param * 16 (skipped when param is 0,
		 * in which case temp.x already holds the address). */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second (zw) half, 8 bytes on. */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* j == 0 writes channels x/y at temp.(x|y); j == 1 writes
		 * channels z/w at temp.z. */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3010
/* Mirror a TCS instruction's write to an OUTPUT register into LDS, so that
 * other invocations (and the TES fetch path) can read it.  Called after each
 * processed instruction when ctx->type == PIPE_SHADER_TESS_CTRL.
 * Returns 0 on success or the bytecode emitter's error code.
 */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	/* only OUTPUT-file destinations are shadowed in LDS */
	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* temp.x = LDS base offset; non-dimensioned dsts are per-patch outputs */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	/* Compute per-channel byte addresses: temp.chan[i] = temp.x + 4*i.
	 * Channel 0 needs no add — temp.x already is its address — so the
	 * loop starts at 1. */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* When two adjacent channels (xy or zw) are both written,
		 * emit a single 2-dword LDS_WRITE_REL covering the pair. */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			/* address for the pair's first dword */
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			/* second dword of the pair */
			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			/* NOTE(review): lds_idx = 1 presumably encodes the
			 * +1-dword stride of the REL write — confirm */
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* the pair consumed channel i+1 as well */
			i += 1;
			continue;
		}
		/* single-channel fallback: plain one-dword LDS_WRITE */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3094
3095static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3096				 int output_idx, int nc)
3097{
3098	int param;
3099	unsigned temp_reg = r600_get_temp(ctx);
3100	unsigned name = ctx->shader->output[output_idx].name;
3101	int dreg = ctx->shader->output[output_idx].gpr;
3102	int r;
3103
3104	param = r600_get_lds_unique_index(name, 0);
3105	r = get_lds_offset0(ctx, 1, temp_reg, true);
3106	if (r)
3107		return r;
3108
3109	if (param) {
3110		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3111				   temp_reg, 0,
3112				   temp_reg, 0,
3113				   V_SQ_ALU_SRC_LITERAL, param * 16);
3114		if (r)
3115			return r;
3116	}
3117
3118	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3119	return 0;
3120}
3121
3122static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3123{
3124	int stride, outer_comps, inner_comps;
3125	int tessinner_idx = -1, tessouter_idx = -1;
3126	int i, r;
3127	unsigned j;
3128	int temp_reg = r600_get_temp(ctx);
3129	int treg[3] = {-1, -1, -1};
3130	struct r600_bytecode_alu alu;
3131	struct r600_bytecode_cf *cf_jump, *cf_pop;
3132
3133	/* only execute factor emission for invocation 0 */
3134	/* PRED_SETE_INT __, R0.x, 0 */
3135	memset(&alu, 0, sizeof(alu));
3136	alu.op = ALU_OP2_PRED_SETE_INT;
3137	alu.src[0].chan = 2;
3138	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3139	alu.execute_mask = 1;
3140	alu.update_pred = 1;
3141	alu.last = 1;
3142	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3143
3144	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3145	cf_jump = ctx->bc->cf_last;
3146
3147	treg[0] = r600_get_temp(ctx);
3148	switch (ctx->shader->tcs_prim_mode) {
3149	case PIPE_PRIM_LINES:
3150		stride = 8; /* 2 dwords, 1 vec2 store */
3151		outer_comps = 2;
3152		inner_comps = 0;
3153		break;
3154	case PIPE_PRIM_TRIANGLES:
3155		stride = 16; /* 4 dwords, 1 vec4 store */
3156		outer_comps = 3;
3157		inner_comps = 1;
3158		treg[1] = r600_get_temp(ctx);
3159		break;
3160	case PIPE_PRIM_QUADS:
3161		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3162		outer_comps = 4;
3163		inner_comps = 2;
3164		treg[1] = r600_get_temp(ctx);
3165		treg[2] = r600_get_temp(ctx);
3166		break;
3167	default:
3168		assert(0);
3169		return -1;
3170	}
3171
3172	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
3173	/* TF_WRITE takes index in R.x, value in R.y */
3174	for (j = 0; j < ctx->shader->noutput; j++) {
3175		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3176			tessinner_idx = j;
3177		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3178			tessouter_idx = j;
3179	}
3180
3181	if (tessouter_idx == -1)
3182		return -1;
3183
3184	if (tessinner_idx == -1 && inner_comps)
3185		return -1;
3186
3187	if (tessouter_idx != -1) {
3188		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
3189		if (r)
3190			return r;
3191	}
3192
3193	if (tessinner_idx != -1) {
3194		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
3195		if (r)
3196			return r;
3197	}
3198
3199	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
3200	/* r.x = relpatchid(r0.y) * tf_stride */
3201
3202	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
3203	/* add incoming r0.w to it: t.x = t.x + r0.w */
3204	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3205			   temp_reg, 0,
3206			   0, 1,
3207			   V_SQ_ALU_SRC_LITERAL, stride,
3208			   0, 3);
3209	if (r)
3210		return r;
3211
3212	for (i = 0; i < outer_comps + inner_comps; i++) {
3213		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
3214		int out_comp = i >= outer_comps ? i - outer_comps : i;
3215
3216		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
3217			if (out_comp == 1)
3218				out_comp = 0;
3219			else if (out_comp == 0)
3220				out_comp = 1;
3221		}
3222
3223		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3224				   treg[i / 2], (2 * (i % 2)),
3225				   temp_reg, 0,
3226				   V_SQ_ALU_SRC_LITERAL, 4 * i);
3227		if (r)
3228			return r;
3229		r = single_alu_op2(ctx, ALU_OP1_MOV,
3230				   treg[i / 2], 1 + (2 * (i%2)),
3231				   ctx->shader->output[out_idx].gpr, out_comp,
3232				   0, 0);
3233		if (r)
3234			return r;
3235	}
3236	for (i = 0; i < outer_comps + inner_comps; i++) {
3237		struct r600_bytecode_gds gds;
3238
3239		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3240		gds.src_gpr = treg[i / 2];
3241		gds.src_sel_x = 2 * (i % 2);
3242		gds.src_sel_y = 1 + (2 * (i % 2));
3243		gds.src_sel_z = 4;
3244		gds.dst_sel_x = 7;
3245		gds.dst_sel_y = 7;
3246		gds.dst_sel_z = 7;
3247		gds.dst_sel_w = 7;
3248		gds.op = FETCH_OP_TF_WRITE;
3249		r = r600_bytecode_add_gds(ctx->bc, &gds);
3250		if (r)
3251			return r;
3252	}
3253
3254	// Patch up jump label
3255	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3256	cf_pop = ctx->bc->cf_last;
3257
3258	cf_jump->cf_addr = cf_pop->id + 2;
3259	cf_jump->pop_count = 1;
3260	cf_pop->cf_addr = cf_pop->id + 2;
3261	cf_pop->pop_count = 1;
3262
3263	return 0;
3264}
3265
3266/*
3267 * We have to work out the thread ID for load and atomic
3268 * operations, which store the returned value to an index
3269 * in an intermediate buffer.
3270 * The index is calculated by taking the thread id,
3271 * calculated from the MBCNT instructions.
3272 * Then the shader engine ID is multiplied by 256,
3273 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
3275 * added.
3276 */
3277static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3278{
3279	struct r600_bytecode_alu alu;
3280	int r;
3281
3282	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3283	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3284	alu.dst.sel = ctx->temp_reg;
3285	alu.dst.chan = 0;
3286	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3287	alu.src[0].value = 0xffffffff;
3288	alu.dst.write = 1;
3289	r = r600_bytecode_add_alu(ctx->bc, &alu);
3290	if (r)
3291		return r;
3292
3293	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3294	alu.op = ALU_OP1_MBCNT_32HI_INT;
3295	alu.dst.sel = ctx->temp_reg;
3296	alu.dst.chan = 1;
3297	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3298	alu.src[0].value = 0xffffffff;
3299	alu.dst.write = 1;
3300	r = r600_bytecode_add_alu(ctx->bc, &alu);
3301	if (r)
3302		return r;
3303
3304	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3305	alu.op = ALU_OP3_MULADD_UINT24;
3306	alu.dst.sel = ctx->temp_reg;
3307	alu.dst.chan = 2;
3308	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3309	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3310	alu.src[1].value = 256;
3311	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3312	alu.dst.write = 1;
3313	alu.is_op3 = 1;
3314	alu.last = 1;
3315	r = r600_bytecode_add_alu(ctx->bc, &alu);
3316	if (r)
3317		return r;
3318
3319	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3320			   ctx->thread_id_gpr, 1,
3321			   ctx->temp_reg, 2,
3322			   V_SQ_ALU_SRC_LITERAL, 0x40,
3323			   ctx->temp_reg, 0);
3324	if (r)
3325		return r;
3326	return 0;
3327}
3328
3329static int r600_shader_from_tgsi(struct r600_context *rctx,
3330				 struct r600_pipe_shader *pipeshader,
3331				 union r600_shader_key key)
3332{
3333	struct r600_screen *rscreen = rctx->screen;
3334	struct r600_shader *shader = &pipeshader->shader;
3335	struct tgsi_token *tokens = pipeshader->selector->tokens;
3336	struct pipe_stream_output_info so = pipeshader->selector->so;
3337	struct tgsi_full_immediate *immediate;
3338	struct r600_shader_ctx ctx;
3339	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3340	unsigned output_done, noutput;
3341	unsigned opcode;
3342	int j, k, r = 0;
3343	unsigned i;
3344	int next_param_base = 0, next_clip_base;
3345	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3346	bool indirect_gprs;
3347	bool ring_outputs = false;
3348	bool lds_outputs = false;
3349	bool lds_inputs = false;
3350	bool pos_emitted = false;
3351
3352	ctx.bc = &shader->bc;
3353	ctx.shader = shader;
3354
3355	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3356			   rscreen->has_compressed_msaa_texturing);
3357	ctx.tokens = tokens;
3358	tgsi_scan_shader(tokens, &ctx.info);
3359	shader->indirect_files = ctx.info.indirect_files;
3360
3361	int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3362	ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3363	ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3364	tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3365
3366	shader->uses_helper_invocation = false;
3367	shader->uses_doubles = ctx.info.uses_doubles;
3368	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3369	shader->nsys_inputs = 0;
3370
3371	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3372		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3373	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3374	tgsi_parse_init(&ctx.parse, tokens);
3375	ctx.type = ctx.info.processor;
3376	shader->processor_type = ctx.type;
3377	ctx.bc->type = shader->processor_type;
3378
3379	switch (ctx.type) {
3380	case PIPE_SHADER_VERTEX:
3381		shader->vs_as_gs_a = key.vs.as_gs_a;
3382		shader->vs_as_es = key.vs.as_es;
3383		shader->vs_as_ls = key.vs.as_ls;
3384		shader->atomic_base = key.vs.first_atomic_counter;
3385		if (shader->vs_as_es)
3386			ring_outputs = true;
3387		if (shader->vs_as_ls)
3388			lds_outputs = true;
3389		break;
3390	case PIPE_SHADER_GEOMETRY:
3391		ring_outputs = true;
3392		shader->atomic_base = key.gs.first_atomic_counter;
3393		shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3394		break;
3395	case PIPE_SHADER_TESS_CTRL:
3396		shader->tcs_prim_mode = key.tcs.prim_mode;
3397		shader->atomic_base = key.tcs.first_atomic_counter;
3398		lds_outputs = true;
3399		lds_inputs = true;
3400		break;
3401	case PIPE_SHADER_TESS_EVAL:
3402		shader->tes_as_es = key.tes.as_es;
3403		shader->atomic_base = key.tes.first_atomic_counter;
3404		lds_inputs = true;
3405		if (shader->tes_as_es)
3406			ring_outputs = true;
3407		break;
3408	case PIPE_SHADER_FRAGMENT:
3409		shader->two_side = key.ps.color_two_side;
3410		shader->atomic_base = key.ps.first_atomic_counter;
3411		shader->rat_base = key.ps.nr_cbufs;
3412		shader->image_size_const_offset = key.ps.image_size_const_offset;
3413		break;
3414	case PIPE_SHADER_COMPUTE:
3415		shader->rat_base = 0;
3416		shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3417		break;
3418	default:
3419		break;
3420	}
3421
3422	if (shader->vs_as_es || shader->tes_as_es) {
3423		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3424	} else {
3425		ctx.gs_for_vs = NULL;
3426	}
3427
3428	ctx.next_ring_offset = 0;
3429	ctx.gs_out_ring_offset = 0;
3430	ctx.gs_next_vertex = 0;
3431	ctx.gs_stream_output_info = &so;
3432
3433	ctx.thread_id_gpr = -1;
3434	ctx.face_gpr = -1;
3435	ctx.fixed_pt_position_gpr = -1;
3436	ctx.fragcoord_input = -1;
3437	ctx.colors_used = 0;
3438	ctx.clip_vertex_write = 0;
3439
3440	ctx.helper_invoc_reg = -1;
3441	ctx.cs_block_size_reg = -1;
3442	ctx.cs_grid_size_reg = -1;
3443	ctx.cs_block_size_loaded = false;
3444	ctx.cs_grid_size_loaded = false;
3445
3446	shader->nr_ps_color_exports = 0;
3447	shader->nr_ps_max_color_exports = 0;
3448
3449
3450	/* register allocations */
3451	/* Values [0,127] correspond to GPR[0..127].
3452	 * Values [128,159] correspond to constant buffer bank 0
3453	 * Values [160,191] correspond to constant buffer bank 1
3454	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3455	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3456	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3457	 * Other special values are shown in the list below.
3458	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3459	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3460	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3461	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3462	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3463	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3464	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3465	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3466	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3467	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3468	 * 254	SQ_ALU_SRC_PV: previous vector result.
3469	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3470	 */
3471	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3472		ctx.file_offset[i] = 0;
3473	}
3474
3475	if (ctx.type == PIPE_SHADER_VERTEX)  {
3476
3477		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3478		if (ctx.info.num_inputs)
3479			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3480	}
3481	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3482		if (ctx.bc->chip_class >= EVERGREEN)
3483			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3484		else
3485			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3486
3487		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3488			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3489				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3490				shader->uses_helper_invocation = true;
3491			}
3492		}
3493	}
3494	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3495		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3496		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3497	}
3498	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3499		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3500	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3501		bool add_tesscoord = false, add_tess_inout = false;
3502		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3503		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3504			/* if we have tesscoord save one reg */
3505			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3506				add_tesscoord = true;
3507			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3508			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3509				add_tess_inout = true;
3510		}
3511		if (add_tesscoord || add_tess_inout)
3512			ctx.file_offset[TGSI_FILE_INPUT]++;
3513		if (add_tess_inout)
3514			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3515	}
3516	if (ctx.type == PIPE_SHADER_COMPUTE) {
3517		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3518		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3519			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3520				ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3521			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3522				ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3523		}
3524	}
3525
3526	ctx.file_offset[TGSI_FILE_OUTPUT] =
3527			ctx.file_offset[TGSI_FILE_INPUT] +
3528			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3529	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3530						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3531
3532	/* Outside the GPR range. This will be translated to one of the
3533	 * kcache banks later. */
3534	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3535	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3536
3537	pipeshader->scratch_space_needed = 0;
3538	int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3539			ctx.info.file_max[TGSI_FILE_TEMPORARY];
3540	if (regno > 124) {
3541		choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3542		shader->indirect_files = ctx.info.indirect_files;
3543	}
3544	shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3545
3546	ctx.bc->ar_reg = ++regno;
3547	ctx.bc->index_reg[0] = ++regno;
3548	ctx.bc->index_reg[1] = ++regno;
3549
3550	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3551		ctx.tess_input_info = ++regno;
3552		ctx.tess_output_info = ++regno;
3553	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3554		ctx.tess_input_info = ++regno;
3555		ctx.tess_output_info = ++regno;
3556	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3557		ctx.gs_export_gpr_tregs[0] = ++regno;
3558		ctx.gs_export_gpr_tregs[1] = ++regno;
3559		ctx.gs_export_gpr_tregs[2] = ++regno;
3560		ctx.gs_export_gpr_tregs[3] = ++regno;
3561		if (ctx.shader->gs_tri_strip_adj_fix) {
3562			ctx.gs_rotated_input[0] = ++regno;
3563			ctx.gs_rotated_input[1] = ++regno;
3564		} else {
3565			ctx.gs_rotated_input[0] = 0;
3566			ctx.gs_rotated_input[1] = 1;
3567		}
3568	}
3569
3570	if (shader->uses_images) {
3571		ctx.thread_id_gpr = ++regno;
3572	}
3573	ctx.temp_reg = ++regno;
3574
3575	shader->max_arrays = 0;
3576	shader->num_arrays = 0;
3577	if (indirect_gprs) {
3578
3579		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3580			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3581			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3582			                   ctx.file_offset[TGSI_FILE_INPUT],
3583			                   0x0F);
3584		}
3585		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3586			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3587			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3588			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3589			                   0x0F);
3590		}
3591	}
3592
3593	ctx.nliterals = 0;
3594	ctx.literals = NULL;
3595	ctx.max_driver_temp_used = 0;
3596
3597	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3598			       ctx.info.colors_written == 1;
3599	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3600	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3601
3602	if (ctx.type == PIPE_SHADER_VERTEX ||
3603	    ctx.type == PIPE_SHADER_GEOMETRY ||
3604	    ctx.type == PIPE_SHADER_TESS_EVAL) {
3605		shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3606					      ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3607		shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3608		shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3609	}
3610
3611	if (shader->vs_as_gs_a)
3612		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3613
3614	if (ctx.thread_id_gpr != -1) {
3615		r = load_thread_id_gpr(&ctx);
3616		if (r)
3617			return r;
3618	}
3619
3620	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3621		r600_fetch_tess_io_info(&ctx);
3622
3623	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3624		tgsi_parse_token(&ctx.parse);
3625		switch (ctx.parse.FullToken.Token.Type) {
3626		case TGSI_TOKEN_TYPE_IMMEDIATE:
3627			immediate = &ctx.parse.FullToken.FullImmediate;
3628			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3629			if(ctx.literals == NULL) {
3630				r = -ENOMEM;
3631				goto out_err;
3632			}
3633			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3634			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3635			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3636			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3637			ctx.nliterals++;
3638			break;
3639		case TGSI_TOKEN_TYPE_DECLARATION:
3640			r = tgsi_declaration(&ctx);
3641			if (r)
3642				goto out_err;
3643			break;
3644		case TGSI_TOKEN_TYPE_INSTRUCTION:
3645		case TGSI_TOKEN_TYPE_PROPERTY:
3646			break;
3647		default:
3648			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3649			r = -EINVAL;
3650			goto out_err;
3651		}
3652	}
3653
3654	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3655	shader->ring_item_sizes[1] = 0;
3656	shader->ring_item_sizes[2] = 0;
3657	shader->ring_item_sizes[3] = 0;
3658
3659	/* Process two side if needed */
3660	if (shader->two_side && ctx.colors_used) {
3661		int i, count = ctx.shader->ninput;
3662		unsigned next_lds_loc = ctx.shader->nlds;
3663
3664		/* additional inputs will be allocated right after the existing inputs,
3665		 * we won't need them after the color selection, so we don't need to
3666		 * reserve these gprs for the rest of the shader code and to adjust
3667		 * output offsets etc. */
3668		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3669				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3670
3671		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3672		if (ctx.face_gpr == -1) {
3673			i = ctx.shader->ninput++;
3674			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3675			ctx.shader->input[i].spi_sid = 0;
3676			ctx.shader->input[i].gpr = gpr++;
3677			ctx.face_gpr = ctx.shader->input[i].gpr;
3678		}
3679
3680		for (i = 0; i < count; i++) {
3681			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3682				int ni = ctx.shader->ninput++;
3683				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3684				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3685				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3686				ctx.shader->input[ni].gpr = gpr++;
3687				// TGSI to LLVM needs to know the lds position of inputs.
3688				// Non LLVM path computes it later (in process_twoside_color)
3689				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3690				ctx.shader->input[i].back_color_input = ni;
3691				if (ctx.bc->chip_class >= EVERGREEN) {
3692					if ((r = evergreen_interp_input(&ctx, ni)))
3693						return r;
3694				}
3695			}
3696		}
3697	}
3698
3699	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3700		shader->nr_ps_max_color_exports = 8;
3701
3702	if (ctx.shader->uses_helper_invocation) {
3703		if (ctx.bc->chip_class == CAYMAN)
3704			r = cm_load_helper_invocation(&ctx);
3705		else
3706			r = eg_load_helper_invocation(&ctx);
3707		if (r)
3708			return r;
3709	}
3710
3711	/*
3712	 * XXX this relies on fixed_pt_position_gpr only being present when
3713	 * this shader should be executed per sample. Should be the case for now...
3714	 */
3715	if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3716		/*
3717		 * Fix up sample mask. The hw always gives us coverage mask for
3718		 * the pixel. However, for per-sample shading, we need the
3719		 * coverage for the shader invocation only.
3720		 * Also, with disabled msaa, only the first bit should be set
3721		 * (luckily the same fixup works for both problems).
3722		 * For now, we can only do it if we know this shader is always
3723		 * executed per sample (due to usage of bits in the shader
3724		 * forcing per-sample execution).
3725		 * If the fb is not multisampled, we'd do unnecessary work but
3726		 * it should still be correct.
3727		 * It will however do nothing for sample shading according
3728		 * to MinSampleShading.
3729		 */
3730		struct r600_bytecode_alu alu;
3731		int tmp = r600_get_temp(&ctx);
3732		assert(ctx.face_gpr != -1);
3733		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3734
3735		alu.op = ALU_OP2_LSHL_INT;
3736		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3737		alu.src[0].value = 0x1;
3738		alu.src[1].sel = ctx.fixed_pt_position_gpr;
3739		alu.src[1].chan = 3;
3740		alu.dst.sel = tmp;
3741		alu.dst.chan = 0;
3742		alu.dst.write = 1;
3743		alu.last = 1;
3744		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3745			return r;
3746
3747		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3748		alu.op = ALU_OP2_AND_INT;
3749		alu.src[0].sel = tmp;
3750		alu.src[1].sel = ctx.face_gpr;
3751		alu.src[1].chan = 2;
3752		alu.dst.sel = ctx.face_gpr;
3753		alu.dst.chan = 2;
3754		alu.dst.write = 1;
3755		alu.last = 1;
3756		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3757			return r;
3758	}
3759
3760	if (ctx.fragcoord_input >= 0) {
3761		if (ctx.bc->chip_class == CAYMAN) {
3762			for (j = 0 ; j < 4; j++) {
3763				struct r600_bytecode_alu alu;
3764				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3765				alu.op = ALU_OP1_RECIP_IEEE;
3766				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3767				alu.src[0].chan = 3;
3768
3769				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3770				alu.dst.chan = j;
3771				alu.dst.write = (j == 3);
3772				alu.last = (j == 3);
3773				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3774					return r;
3775			}
3776		} else {
3777			struct r600_bytecode_alu alu;
3778			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3779			alu.op = ALU_OP1_RECIP_IEEE;
3780			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3781			alu.src[0].chan = 3;
3782
3783			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3784			alu.dst.chan = 3;
3785			alu.dst.write = 1;
3786			alu.last = 1;
3787			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3788				return r;
3789		}
3790	}
3791
3792	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3793		struct r600_bytecode_alu alu;
3794		int r;
3795
3796		/* GS thread with no output workaround - emit a cut at start of GS */
3797		if (ctx.bc->chip_class == R600)
3798			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3799
3800		for (j = 0; j < 4; j++) {
3801			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3802			alu.op = ALU_OP1_MOV;
3803			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3804			alu.src[0].value = 0;
3805			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3806			alu.dst.write = 1;
3807			alu.last = 1;
3808			r = r600_bytecode_add_alu(ctx.bc, &alu);
3809			if (r)
3810				return r;
3811		}
3812
3813		if (ctx.shader->gs_tri_strip_adj_fix) {
3814			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3815					   ctx.gs_rotated_input[0], 2,
3816					   0, 2,
3817					   V_SQ_ALU_SRC_LITERAL, 1);
3818			if (r)
3819				return r;
3820
3821			for (i = 0; i < 6; i++) {
3822				int rotated = (i + 4) % 6;
3823				int offset_reg = i / 3;
3824				int offset_chan = i % 3;
3825				int rotated_offset_reg = rotated / 3;
3826				int rotated_offset_chan = rotated % 3;
3827
3828				if (offset_reg == 0 && offset_chan == 2)
3829					offset_chan = 3;
3830				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3831					rotated_offset_chan = 3;
3832
3833				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3834						   ctx.gs_rotated_input[offset_reg], offset_chan,
3835						   ctx.gs_rotated_input[0], 2,
3836						   offset_reg, offset_chan,
3837						   rotated_offset_reg, rotated_offset_chan);
3838				if (r)
3839					return r;
3840			}
3841		}
3842	}
3843
3844	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3845		r600_fetch_tess_io_info(&ctx);
3846
3847	if (shader->two_side && ctx.colors_used) {
3848		if ((r = process_twoside_color_inputs(&ctx)))
3849			return r;
3850	}
3851
3852	tgsi_parse_init(&ctx.parse, tokens);
3853	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3854		tgsi_parse_token(&ctx.parse);
3855		switch (ctx.parse.FullToken.Token.Type) {
3856		case TGSI_TOKEN_TYPE_INSTRUCTION:
3857			r = tgsi_is_supported(&ctx);
3858			if (r)
3859				goto out_err;
3860			ctx.max_driver_temp_used = 0;
3861			/* reserve first tmp for everyone */
3862			r600_get_temp(&ctx);
3863
3864			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3865			if ((r = tgsi_split_constant(&ctx)))
3866				goto out_err;
3867			if ((r = tgsi_split_literal_constant(&ctx)))
3868				goto out_err;
3869			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3870				if ((r = tgsi_split_gs_inputs(&ctx)))
3871					goto out_err;
3872			} else if (lds_inputs) {
3873				if ((r = tgsi_split_lds_inputs(&ctx)))
3874					goto out_err;
3875			}
3876			if (ctx.bc->chip_class == CAYMAN)
3877				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3878			else if (ctx.bc->chip_class >= EVERGREEN)
3879				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3880			else
3881				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3882
3883			ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3884
3885			r = ctx.inst_info->process(&ctx);
3886			if (r)
3887				goto out_err;
3888
3889			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3890				r = r600_store_tcs_output(&ctx);
3891				if (r)
3892					goto out_err;
3893			}
3894			break;
3895		default:
3896			break;
3897		}
3898	}
3899
3900	/* Reset the temporary register counter. */
3901	ctx.max_driver_temp_used = 0;
3902
3903	noutput = shader->noutput;
3904
3905	if (!ring_outputs && ctx.clip_vertex_write) {
3906		unsigned clipdist_temp[2];
3907
3908		clipdist_temp[0] = r600_get_temp(&ctx);
3909		clipdist_temp[1] = r600_get_temp(&ctx);
3910
3911		/* need to convert a clipvertex write into clipdistance writes and not export
3912		   the clip vertex anymore */
3913
3914		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3915		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3916		shader->output[noutput].gpr = clipdist_temp[0];
3917		noutput++;
3918		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3919		shader->output[noutput].gpr = clipdist_temp[1];
3920		noutput++;
3921
3922		/* reset spi_sid for clipvertex output to avoid confusing spi */
3923		shader->output[ctx.cv_output].spi_sid = 0;
3924
3925		shader->clip_dist_write = 0xFF;
3926		shader->cc_dist_mask = 0xFF;
3927
3928		for (i = 0; i < 8; i++) {
3929			int oreg = i >> 2;
3930			int ochan = i & 3;
3931
3932			for (j = 0; j < 4; j++) {
3933				struct r600_bytecode_alu alu;
3934				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3935				alu.op = ALU_OP2_DOT4;
3936				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3937				alu.src[0].chan = j;
3938
3939				alu.src[1].sel = 512 + i;
3940				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3941				alu.src[1].chan = j;
3942
3943				alu.dst.sel = clipdist_temp[oreg];
3944				alu.dst.chan = j;
3945				alu.dst.write = (j == ochan);
3946				if (j == 3)
3947					alu.last = 1;
3948				r = r600_bytecode_add_alu(ctx.bc, &alu);
3949				if (r)
3950					return r;
3951			}
3952		}
3953	}
3954
3955	/* Add stream outputs. */
3956	if (so.num_outputs) {
3957		bool emit = false;
3958		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3959			emit = true;
3960		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3961			emit = true;
3962		if (emit)
3963			emit_streamout(&ctx, &so, -1, NULL);
3964	}
3965	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3966	convert_edgeflag_to_int(&ctx);
3967
3968	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3969		r600_emit_tess_factor(&ctx);
3970
3971	if (lds_outputs) {
3972		if (ctx.type == PIPE_SHADER_VERTEX) {
3973			if (ctx.shader->noutput)
3974				emit_lds_vs_writes(&ctx);
3975		}
3976	} else if (ring_outputs) {
3977		if (shader->vs_as_es || shader->tes_as_es) {
3978			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3979			ctx.gs_export_gpr_tregs[1] = -1;
3980			ctx.gs_export_gpr_tregs[2] = -1;
3981			ctx.gs_export_gpr_tregs[3] = -1;
3982
3983			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3984		}
3985	} else {
3986		/* Export output */
3987		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3988
3989		for (i = 0, j = 0; i < noutput; i++, j++) {
3990			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3991			output[j].gpr = shader->output[i].gpr;
3992			output[j].elem_size = 3;
3993			output[j].swizzle_x = 0;
3994			output[j].swizzle_y = 1;
3995			output[j].swizzle_z = 2;
3996			output[j].swizzle_w = 3;
3997			output[j].burst_count = 1;
3998			output[j].type = 0xffffffff;
3999			output[j].op = CF_OP_EXPORT;
4000			switch (ctx.type) {
4001			case PIPE_SHADER_VERTEX:
4002			case PIPE_SHADER_TESS_EVAL:
4003				switch (shader->output[i].name) {
4004				case TGSI_SEMANTIC_POSITION:
4005					output[j].array_base = 60;
4006					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4007					pos_emitted = true;
4008					break;
4009
4010				case TGSI_SEMANTIC_PSIZE:
4011					output[j].array_base = 61;
4012					output[j].swizzle_y = 7;
4013					output[j].swizzle_z = 7;
4014					output[j].swizzle_w = 7;
4015					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4016					pos_emitted = true;
4017					break;
4018				case TGSI_SEMANTIC_EDGEFLAG:
4019					output[j].array_base = 61;
4020					output[j].swizzle_x = 7;
4021					output[j].swizzle_y = 0;
4022					output[j].swizzle_z = 7;
4023					output[j].swizzle_w = 7;
4024					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4025					pos_emitted = true;
4026					break;
4027				case TGSI_SEMANTIC_LAYER:
4028					/* spi_sid is 0 for outputs that are
4029					 * not consumed by PS */
4030					if (shader->output[i].spi_sid) {
4031						output[j].array_base = next_param_base++;
4032						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4033						j++;
4034						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4035					}
4036					output[j].array_base = 61;
4037					output[j].swizzle_x = 7;
4038					output[j].swizzle_y = 7;
4039					output[j].swizzle_z = 0;
4040					output[j].swizzle_w = 7;
4041					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4042					pos_emitted = true;
4043					break;
4044				case TGSI_SEMANTIC_VIEWPORT_INDEX:
4045					/* spi_sid is 0 for outputs that are
4046					 * not consumed by PS */
4047					if (shader->output[i].spi_sid) {
4048						output[j].array_base = next_param_base++;
4049						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4050						j++;
4051						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4052					}
4053					output[j].array_base = 61;
4054					output[j].swizzle_x = 7;
4055					output[j].swizzle_y = 7;
4056					output[j].swizzle_z = 7;
4057					output[j].swizzle_w = 0;
4058					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4059					pos_emitted = true;
4060					break;
4061				case TGSI_SEMANTIC_CLIPVERTEX:
4062					j--;
4063					break;
4064				case TGSI_SEMANTIC_CLIPDIST:
4065					output[j].array_base = next_clip_base++;
4066					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4067					pos_emitted = true;
4068					/* spi_sid is 0 for clipdistance outputs that were generated
4069					 * for clipvertex - we don't need to pass them to PS */
4070					if (shader->output[i].spi_sid) {
4071						j++;
4072						/* duplicate it as PARAM to pass to the pixel shader */
4073						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4074						output[j].array_base = next_param_base++;
4075						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4076					}
4077					break;
4078				case TGSI_SEMANTIC_FOG:
4079					output[j].swizzle_y = 4; /* 0 */
4080					output[j].swizzle_z = 4; /* 0 */
4081					output[j].swizzle_w = 5; /* 1 */
4082					break;
4083				case TGSI_SEMANTIC_PRIMID:
4084					output[j].swizzle_x = 2;
4085					output[j].swizzle_y = 4; /* 0 */
4086					output[j].swizzle_z = 4; /* 0 */
4087					output[j].swizzle_w = 4; /* 0 */
4088					break;
4089				}
4090
4091				break;
4092			case PIPE_SHADER_FRAGMENT:
4093				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4094					/* never export more colors than the number of CBs */
4095					if (shader->output[i].sid >= max_color_exports) {
4096						/* skip export */
4097						j--;
4098						continue;
4099					}
4100					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4101					output[j].array_base = shader->output[i].sid;
4102					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4103					shader->nr_ps_color_exports++;
4104					shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4105
4106					/* If the i-th target format is set, all previous target formats must
4107					 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4108					 */
4109					if (shader->output[i].sid > 0)
4110						for (unsigned x = 0; x < shader->output[i].sid; x++)
4111							shader->ps_color_export_mask |= (1 << (x*4));
4112
4113					if (shader->output[i].sid > shader->ps_export_highest)
4114						shader->ps_export_highest = shader->output[i].sid;
4115					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4116						for (k = 1; k < max_color_exports; k++) {
4117							j++;
4118							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4119							output[j].gpr = shader->output[i].gpr;
4120							output[j].elem_size = 3;
4121							output[j].swizzle_x = 0;
4122							output[j].swizzle_y = 1;
4123							output[j].swizzle_z = 2;
4124							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4125							output[j].burst_count = 1;
4126							output[j].array_base = k;
4127							output[j].op = CF_OP_EXPORT;
4128							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4129							shader->nr_ps_color_exports++;
4130							if (k > shader->ps_export_highest)
4131								shader->ps_export_highest = k;
4132							shader->ps_color_export_mask |= (0xf << (j * 4));
4133						}
4134					}
4135				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4136					output[j].array_base = 61;
4137					output[j].swizzle_x = 2;
4138					output[j].swizzle_y = 7;
4139					output[j].swizzle_z = output[j].swizzle_w = 7;
4140					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4141				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4142					output[j].array_base = 61;
4143					output[j].swizzle_x = 7;
4144					output[j].swizzle_y = 1;
4145					output[j].swizzle_z = output[j].swizzle_w = 7;
4146					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4147				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4148					output[j].array_base = 61;
4149					output[j].swizzle_x = 7;
4150					output[j].swizzle_y = 7;
4151					output[j].swizzle_z = 0;
4152					output[j].swizzle_w = 7;
4153					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4154				} else {
4155					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4156					r = -EINVAL;
4157					goto out_err;
4158				}
4159				break;
4160			case PIPE_SHADER_TESS_CTRL:
4161				break;
4162			default:
4163				R600_ERR("unsupported processor type %d\n", ctx.type);
4164				r = -EINVAL;
4165				goto out_err;
4166			}
4167
4168			if (output[j].type == 0xffffffff) {
4169				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4170				output[j].array_base = next_param_base++;
4171			}
4172		}
4173
4174		/* add fake position export */
4175		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4176			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4177			output[j].gpr = 0;
4178			output[j].elem_size = 3;
4179			output[j].swizzle_x = 7;
4180			output[j].swizzle_y = 7;
4181			output[j].swizzle_z = 7;
4182			output[j].swizzle_w = 7;
4183			output[j].burst_count = 1;
4184			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4185			output[j].array_base = 60;
4186			output[j].op = CF_OP_EXPORT;
4187			j++;
4188		}
4189
4190		/* add fake param output for vertex shader if no param is exported */
4191		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4192			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4193			output[j].gpr = 0;
4194			output[j].elem_size = 3;
4195			output[j].swizzle_x = 7;
4196			output[j].swizzle_y = 7;
4197			output[j].swizzle_z = 7;
4198			output[j].swizzle_w = 7;
4199			output[j].burst_count = 1;
4200			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4201			output[j].array_base = 0;
4202			output[j].op = CF_OP_EXPORT;
4203			j++;
4204		}
4205
4206		/* add fake pixel export */
4207		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4208			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4209			output[j].gpr = 0;
4210			output[j].elem_size = 3;
4211			output[j].swizzle_x = 7;
4212			output[j].swizzle_y = 7;
4213			output[j].swizzle_z = 7;
4214			output[j].swizzle_w = 7;
4215			output[j].burst_count = 1;
4216			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4217			output[j].array_base = 0;
4218			output[j].op = CF_OP_EXPORT;
4219			j++;
4220			shader->nr_ps_color_exports++;
4221			shader->ps_color_export_mask = 0xf;
4222		}
4223
4224		noutput = j;
4225
4226		/* set export done on last export of each type */
4227		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4228			if (!(output_done & (1 << output[k].type))) {
4229				output_done |= (1 << output[k].type);
4230				output[k].op = CF_OP_EXPORT_DONE;
4231			}
4232		}
4233		/* add output to bytecode */
4234		for (i = 0; i < noutput; i++) {
4235			r = r600_bytecode_add_output(ctx.bc, &output[i]);
4236			if (r)
4237				goto out_err;
4238		}
4239	}
4240
4241	/* add program end */
4242	if (ctx.bc->chip_class == CAYMAN)
4243		cm_bytecode_add_cf_end(ctx.bc);
4244	else {
4245		const struct cf_op_info *last = NULL;
4246
4247		if (ctx.bc->cf_last)
4248			last = r600_isa_cf(ctx.bc->cf_last->op);
4249
4250		/* alu clause instructions don't have EOP bit, so add NOP */
4251		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4252			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4253
4254		ctx.bc->cf_last->end_of_program = 1;
4255	}
4256
4257	/* check GPR limit - we have 124 = 128 - 4
4258	 * (4 are reserved as alu clause temporary registers) */
4259	if (ctx.bc->ngpr > 124) {
4260		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4261		r = -ENOMEM;
4262		goto out_err;
4263	}
4264
4265	if (ctx.type == PIPE_SHADER_GEOMETRY) {
4266		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4267			return r;
4268	}
4269
4270	free(ctx.spilled_arrays);
4271	free(ctx.array_infos);
4272	free(ctx.literals);
4273	tgsi_parse_free(&ctx.parse);
4274	return 0;
4275out_err:
4276	free(ctx.spilled_arrays);
4277	free(ctx.array_infos);
4278	free(ctx.literals);
4279	tgsi_parse_free(&ctx.parse);
4280	return r;
4281}
4282
4283static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4284{
4285	const unsigned tgsi_opcode =
4286		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4287	R600_ERR("%s tgsi opcode unsupported\n",
4288		 tgsi_get_opcode_name(tgsi_opcode));
4289	return -EINVAL;
4290}
4291
/* Handler for TGSI_OPCODE_END: nothing to emit here — the end-of-program
 * marker is added separately after the token stream has been processed. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
4296
4297static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4298			const struct r600_shader_src *shader_src,
4299			unsigned chan)
4300{
4301	bc_src->sel = shader_src->sel;
4302	bc_src->chan = shader_src->swizzle[chan];
4303	bc_src->neg = shader_src->neg;
4304	bc_src->abs = shader_src->abs;
4305	bc_src->rel = shader_src->rel;
4306	bc_src->value = shader_src->value[bc_src->chan];
4307	bc_src->kc_bank = shader_src->kc_bank;
4308	bc_src->kc_rel = shader_src->kc_rel;
4309}
4310
/* Mark an ALU source as absolute-value; any pending negate is cleared so
 * the operand reads as |x| rather than -|x|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
4316
/* Flip the negate modifier on an ALU source operand. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}
4321
/* Fill in an ALU destination from a TGSI destination register.
 *
 * For TGSI temporaries that were spilled, the write is redirected to a
 * temp GPR and a MEM_SCRATCH output is queued as pending (pending
 * outputs are cleared after each instruction group, per the comment
 * below).  Otherwise the destination maps directly to a GPR, with the
 * per-file register offset, saturate-to-clamp and relative addressing
 * applied.
 *
 * NOTE(review): the closing braces after the Indirect check below are
 * misleadingly indented but correct — the "Add new outputs as pending"
 * block extends down to just before the first "return". */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);

		if (spilled) {
			struct r600_bytecode_output cf;
			int reg = 0;
			int r;
			bool add_pending_output = true;

			memset(&cf, 0, sizeof(struct r600_bytecode_output));
			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
				&cf.array_base, &cf.array_size);

			/* If no component has spilled, reserve a register and add the spill code
			 *  ctx->bc->n_pending_outputs is cleared after each instruction group */
			if (ctx->bc->n_pending_outputs == 0) {
				reg = r600_get_temp(ctx);
			} else {
				/* If we are already spilling and the output address is the
				 * same as before, just reuse the same slot */
				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
				if ((cf.array_base + idx == tmpl->array_base) ||
				    (cf.array_base == tmpl->array_base &&
				     tmpl->index_gpr == ctx->bc->ar_reg &&
				     tgsi_dst->Register.Indirect)) {
					/* reuse the GPR of the already-queued spill */
					reg = ctx->bc->pending_outputs[0].gpr;
					add_pending_output = false;
				} else {
					reg = r600_get_temp(ctx);
				}
			}

			r600_dst->sel = reg;
			r600_dst->chan = swizzle;
			r600_dst->write = 1;
			if (inst->Instruction.Saturate) {
				r600_dst->clamp = 1;
			}

			/* Add new outputs as pending */
			if (add_pending_output) {
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
				cf.mark = 1;
				cf.comp_mask = inst->Dst[0].Register.WriteMask;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				if (tgsi_dst->Register.Indirect) {
					if (ctx->bc->chip_class < R700)
						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
					else
						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
					cf.index_gpr = ctx->bc->ar_reg;
			}
			else {
				/* direct store: fold the component index into the base */
				cf.array_base += idx;
				cf.array_size = 0;
			}

			r = r600_bytecode_add_pending_output(ctx->bc, &cf);
			if (r)
				return;

			if (ctx->bc->chip_class >= R700)
				r600_bytecode_need_wait_ack(ctx->bc, true);
			}
			return;
		}
		else {
			r600_dst->sel = idx;
		}
	}
	else {
		r600_dst->sel = tgsi_dst->Register.Index;
		r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	}
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	/* TESS_CTRL outputs are stored via r600_store_tcs_output(), not via
	 * a GPR write, so skip the relative-addressing setup for them */
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}
4428
/* Common emitter for two-source 64-bit ALU ops.
 *
 * singledest:  the op produces a single double result; the scalar
 *              writemask is expanded below to the channel pair that
 *              actually holds the operand, and use_tmp records which
 *              temp channel carries the value for the final move.
 * swap:        exchange the two source operands.
 * dest_temp:   when non-zero, write the result to this temp GPR instead
 *              of the TGSI destination.
 * op_override: when non-zero, use this ALU opcode instead of the one
 *              from inst_info (see egcm_double_to_int). */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		/* expand a one-component mask to the xy (0x3) or zw (0xc) pair
		 * the source double lives in; use_tmp-1 is the temp channel the
		 * result is later moved from */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* odd (high) channels of the pair are computed but not
			 * written for single-dest results */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* NOTE: fp64_switch() remaps channels within a double
				 * pair for 64-bit operand encoding */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				/* abs applies to the high dword, which carries the sign */
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4555
4556static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4557{
4558	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4559	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4560	/* confirm writemasking */
4561	if ((write_mask & 0x3) != 0x3 &&
4562	    (write_mask & 0xc) != 0xc) {
4563		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4564		return -1;
4565	}
4566	return tgsi_op2_64_params(ctx, false, false, 0, 0);
4567}
4568
/* Two-source 64-bit op producing a single double result; the channel
 * pair actually computed is derived from the writemask and source
 * swizzle inside tgsi_op2_64_params(). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4573
/* Same as tgsi_op2_64_single_dest(), but with the two source operands
 * swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4578
4579static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4580{
4581	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4582	struct r600_bytecode_alu alu;
4583	int i, j, r;
4584	int lasti = 3;
4585	int tmp = r600_get_temp(ctx);
4586
4587	for (i = 0; i < lasti + 1; i++) {
4588
4589		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4590		alu.op = ctx->inst_info->op;
4591		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4592			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4593		}
4594
4595		if (inst->Dst[0].Register.WriteMask & (1 << i))
4596			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4597		else
4598			alu.dst.sel = tmp;
4599
4600		alu.dst.chan = i;
4601		alu.is_op3 = 1;
4602		if (i == lasti) {
4603			alu.last = 1;
4604		}
4605		r = r600_bytecode_add_alu(ctx->bc, &alu);
4606		if (r)
4607			return r;
4608	}
4609	return 0;
4610}
4611
4612static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4613{
4614	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4615	struct r600_bytecode_alu alu;
4616	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4617	int i, j, r, lasti = tgsi_last_instruction(write_mask);
4618	/* use temp register if trans_only and more than one dst component */
4619	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4620	unsigned op = ctx->inst_info->op;
4621
4622	if (op == ALU_OP2_MUL_IEEE &&
4623	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4624		op = ALU_OP2_MUL;
4625
4626	for (i = 0; i <= lasti; i++) {
4627		if (!(write_mask & (1 << i)))
4628			continue;
4629
4630		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4631		if (use_tmp) {
4632			alu.dst.sel = ctx->temp_reg;
4633			alu.dst.chan = i;
4634			alu.dst.write = 1;
4635		} else
4636			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4637
4638		alu.op = op;
4639		if (!swap) {
4640			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4641				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4642			}
4643		} else {
4644			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4645			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4646		}
4647		if (i == lasti || trans_only) {
4648			alu.last = 1;
4649		}
4650		r = r600_bytecode_add_alu(ctx->bc, &alu);
4651		if (r)
4652			return r;
4653	}
4654
4655	if (use_tmp) {
4656		/* move result from temp to dst */
4657		for (i = 0; i <= lasti; i++) {
4658			if (!(write_mask & (1 << i)))
4659				continue;
4660
4661			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4662			alu.op = ALU_OP1_MOV;
4663			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4664			alu.src[0].sel = ctx->temp_reg;
4665			alu.src[0].chan = i;
4666			alu.last = (i == lasti);
4667
4668			r = r600_bytecode_add_alu(ctx->bc, &alu);
4669			if (r)
4670				return r;
4671		}
4672	}
4673	return 0;
4674}
4675
/* Plain two-source ALU op: no operand swap, vector slots allowed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4680
/* Two-source ALU op with the source operands exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4685
/* Two-source ALU op restricted to the trans slot (each emitted
 * instruction closes its instruction group). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4690
4691static int tgsi_ineg(struct r600_shader_ctx *ctx)
4692{
4693	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4694	struct r600_bytecode_alu alu;
4695	int i, r;
4696	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4697
4698	for (i = 0; i < lasti + 1; i++) {
4699
4700		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4701			continue;
4702		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4703		alu.op = ctx->inst_info->op;
4704
4705		alu.src[0].sel = V_SQ_ALU_SRC_0;
4706
4707		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4708
4709		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4710
4711		if (i == lasti) {
4712			alu.last = 1;
4713		}
4714		r = r600_bytecode_add_alu(ctx->bc, &alu);
4715		if (r)
4716			return r;
4717	}
4718	return 0;
4719
4720}
4721
4722static int tgsi_dneg(struct r600_shader_ctx *ctx)
4723{
4724	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4725	struct r600_bytecode_alu alu;
4726	int i, r;
4727	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4728
4729	for (i = 0; i < lasti + 1; i++) {
4730
4731		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4732			continue;
4733		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4734		alu.op = ALU_OP1_MOV;
4735
4736		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4737
4738		if (i == 1 || i == 3)
4739			r600_bytecode_src_toggle_neg(&alu.src[0]);
4740		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4741
4742		if (i == lasti) {
4743			alu.last = 1;
4744		}
4745		r = r600_bytecode_add_alu(ctx->bc, &alu);
4746		if (r)
4747			return r;
4748	}
4749	return 0;
4750
4751}
4752
/* DFRACEXP: split a double into significand (Dst[0]) and exponent
 * (Dst[1]).  The raw op is run across all four channels into temp_reg,
 * then the results are distributed to the two destinations.
 *
 * NOTE(review): result layout in temp_reg is inferred from the moves
 * below — significand halves in channels 2/3, exponent in channel 1;
 * confirm against the ISA docs for the underlying opcode. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* run the op on every channel, collecting results in temp_reg */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* even dest channels read temp.z, odd ones temp.w */
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the exponent (temp.y) to the first written channel of Dst[1] */
	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
4815
4816
/* Convert a 32-bit (u)int source to 64-bit doubles (I2D/U2D), for
 * Evergreen/Cayman.
 *
 * Each converted component occupies a channel pair (xy and/or zw).
 * A 32-bit integer is split into two pieces (upper 24 bits and low
 * 8 bits), each piece converted to float, widened to FLT64, and the
 * two partial doubles summed with ADD_64 into the destination pair. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* split into a 24-bit piece (low byte masked off) in
			 * channel dchan and the low 8 bits in dchan+1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* convert each piece to float; the 24-bit piece uses the opcode's
	 * own conversion (signed or unsigned), the low byte is always
	 * non-negative so UINT_TO_FLT suffices */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->chip_class == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* widen both pieces to FLT64 (low dwords filled with a 0 literal),
	 * then add the two partial doubles into the destination pair */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return 0;
}
4928
/* Lower TGSI D2I/D2U (double -> int/uint) for Evergreen/Cayman.
 *
 * Done in two phases: first narrow the double operand to a 32-bit float
 * in a fresh temp, then run the chip's float->int conversion op
 * (ctx->inst_info->op) on each write-masked channel.
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
		inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	/* float -> int per enabled destination channel */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4962
/* Emit a scalar double-precision ALU op (e.g. RECIP_64) whose operand is
 * the double held in channels 0/1 of 'src'.
 *
 * The sources are swizzled for the 64-bit operand pairing: src[0] reads
 * channel 1, src[1] reads channel 0.  When 'abs' is set, the absolute
 * value of the second source is taken (used for DRSQ/DSQRT).
 * The op is issued over three slots but only the X/Y results are stored.
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		/* on non-Cayman this is a trans-only op, so every instruction
		 * closes its group; on Cayman only the final slot does */
		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4997
/* Emit a unary double-precision op (DSQRT/DRSQ/DRCP...) for exactly one
 * double at a time (write mask must be XY or ZW).
 *
 * The raw op leaves the 64-bit result in t1.xy; a MOV pass then copies
 * it to the real destination pair (dst chans 0/2 take t1.x, 1/3 take
 * t1.y).  For DRSQ/DSQRT the source's absolute value is used.
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* move the result pair from t1.xy into the destination channels */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5038
5039static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5040{
5041	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5042	int i, j, r;
5043	struct r600_bytecode_alu alu;
5044	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5045
5046	for (i = 0 ; i < last_slot; i++) {
5047		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5048		alu.op = ctx->inst_info->op;
5049		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5050			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5051
5052			/* RSQ should take the absolute value of src */
5053			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5054				r600_bytecode_src_set_abs(&alu.src[j]);
5055			}
5056		}
5057		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5058		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5059
5060		if (i == last_slot - 1)
5061			alu.last = 1;
5062		r = r600_bytecode_add_alu(ctx->bc, &alu);
5063		if (r)
5064			return r;
5065	}
5066	return 0;
5067}
5068
/* Emit an integer multiply (MULLO/MULHI variants) on Cayman, where these
 * former t-slot-only ops execute in all four vector slots.
 *
 * For every enabled destination channel k the op is issued in all four
 * slots with sources taken from channel k, but only slot k's result is
 * stored into t1.k.  A final MOV pass copies t1 into the real
 * destination.  Returns 0 or an r600_bytecode_add_alu() error code.
 */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		/* replicate the op over all four slots; keep only slot k */
		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the per-channel results from t1 to the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5116
5117
/* Emit DMUL (double multiply) on Cayman for exactly one double at a time
 * (write mask must be XY or ZW; k selects which pair).
 *
 * Source swizzle follows the 64-bit operand pairing: slots 0-2 read the
 * second dword of the pair (chan 2k+1) while slot 3 reads the first
 * (chan 2k).  The 64-bit result lands in the corresponding channels of
 * t1, which a MOV pass then copies to the real destination.
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the result pair from t1 into the destination channels */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5167
5168/*
5169 * Emit RECIP_64 + MUL_64 to implement division.
5170 */
/* Lower DDIV on Cayman as src0 * RECIP_64(src1).
 *
 * RECIP_64 leaves the reciprocal of src1 in t1.xy; the MUL_64 pass uses
 * the same 64-bit source pairing as DMUL (slot 3 reads the first dword,
 * slots 0-2 the second), and the quotient is finally moved from t1.xy
 * into the destination pair selected by the write mask (XY or ZW only).
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = 1.0 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * t1 (64-bit multiply) */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the quotient from t1.xy to the destination pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5224
5225/*
5226 * r600 - trunc to -PI..PI range
5227 * r700 - normalize by dividing by 2PI
5228 * see fdo bug 27901
5229 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * 1/(2*PI) + 0.5
	 * (angle in turns, biased by half a turn) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI); /* 1/(2*PI) */
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x), reducing the angle to [0, 1) turns */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* rescale and remove the bias:
	 * temp.x = temp.x * src1 + src2, with chip-dependent constants */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		/* R600: back to radians in [-PI, PI] */
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		/* R700 and later: normalized angle in [-0.5, 0.5] */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
5300
5301static int cayman_trig(struct r600_shader_ctx *ctx)
5302{
5303	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5304	struct r600_bytecode_alu alu;
5305	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5306	int i, r;
5307
5308	r = tgsi_setup_trig(ctx);
5309	if (r)
5310		return r;
5311
5312
5313	for (i = 0; i < last_slot; i++) {
5314		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5315		alu.op = ctx->inst_info->op;
5316		alu.dst.chan = i;
5317
5318		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5319		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5320
5321		alu.src[0].sel = ctx->temp_reg;
5322		alu.src[0].chan = 0;
5323		if (i == last_slot - 1)
5324			alu.last = 1;
5325		r = r600_bytecode_add_alu(ctx->bc, &alu);
5326		if (r)
5327			return r;
5328	}
5329	return 0;
5330}
5331
/* Emit SIN/COS on pre-Cayman chips: normalize the angle with
 * tgsi_setup_trig(), run the trig op once into temp_reg.x (trans slot),
 * then replicate the scalar result to every write-masked destination
 * channel.  Returns 0 or an r600_bytecode_add_alu() error code. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = trig(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* source channel stays 0: the scalar result is in temp.x */
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5374
5375static int tgsi_kill(struct r600_shader_ctx *ctx)
5376{
5377	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5378	struct r600_bytecode_alu alu;
5379	int i, r;
5380
5381	for (i = 0; i < 4; i++) {
5382		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5383		alu.op = ctx->inst_info->op;
5384
5385		alu.dst.chan = i;
5386
5387		alu.src[0].sel = V_SQ_ALU_SRC_0;
5388
5389		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5390			alu.src[1].sel = V_SQ_ALU_SRC_1;
5391			alu.src[1].neg = 1;
5392		} else {
5393			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5394		}
5395		if (i == 3) {
5396			alu.last = 1;
5397		}
5398		r = r600_bytecode_add_alu(ctx->bc, &alu);
5399		if (r)
5400			return r;
5401	}
5402
5403	/* kill must be last in ALU */
5404	ctx->bc->force_add_cf = 1;
5405	ctx->shader->uses_kill = TRUE;
5406	return 0;
5407}
5408
/* Emit TGSI LIT:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = exp2(MUL_LIT(log_clamped(max(src.y, 0.0)), src.w, src.x))
 *   dst.w = 1.0
 * MUL_LIT is the hw helper for the specular term; it presumably handles
 * the src.x <= 0 and exponent-clamp special cases (see the ISA docs).
 * On Cayman the trans-only LOG/EXP ops are replicated over three vector
 * slots with only the last slot's result stored.
 * Returns 0 on success or an r600_bytecode_add_alu() error code.
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the specular term is only needed when dst.z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the last replicated slot stores its result */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (chip-dependent) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only the last replicated slot stores its result */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
5558
5559static int tgsi_rsq(struct r600_shader_ctx *ctx)
5560{
5561	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5562	struct r600_bytecode_alu alu;
5563	int i, r;
5564
5565	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5566
5567	alu.op = ALU_OP1_RECIPSQRT_IEEE;
5568
5569	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5570		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5571		r600_bytecode_src_set_abs(&alu.src[i]);
5572	}
5573	alu.dst.sel = ctx->temp_reg;
5574	alu.dst.write = 1;
5575	alu.last = 1;
5576	r = r600_bytecode_add_alu(ctx->bc, &alu);
5577	if (r)
5578		return r;
5579	/* replicate result */
5580	return tgsi_helper_tempx_replicate(ctx);
5581}
5582
5583static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5584{
5585	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5586	struct r600_bytecode_alu alu;
5587	int i, r;
5588
5589	for (i = 0; i < 4; i++) {
5590		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5591		alu.src[0].sel = ctx->temp_reg;
5592		alu.op = ALU_OP1_MOV;
5593		alu.dst.chan = i;
5594		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5595		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5596		if (i == 3)
5597			alu.last = 1;
5598		r = r600_bytecode_add_alu(ctx->bc, &alu);
5599		if (r)
5600			return r;
5601	}
5602	return 0;
5603}
5604
5605static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5606{
5607	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5608	struct r600_bytecode_alu alu;
5609	int i, r;
5610
5611	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5612	alu.op = ctx->inst_info->op;
5613	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5614		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5615	}
5616	alu.dst.sel = ctx->temp_reg;
5617	alu.dst.write = 1;
5618	alu.last = 1;
5619	r = r600_bytecode_add_alu(ctx->bc, &alu);
5620	if (r)
5621		return r;
5622	/* replicate result */
5623	return tgsi_helper_tempx_replicate(ctx);
5624}
5625
/* Emit POW on Cayman as EXP2(b * LOG2(a)).
 *
 * LOG2 is a former t-slot op, so it is replicated over three vector
 * slots (all writing temp); the multiply runs once into temp.x; EXP2 is
 * then replicated per destination slot with only the write-masked
 * channels stored.  Returns 0 or an r600_bytecode_add_alu() error code.
 */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp = LOG2(a), replicated over three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5675
5676static int tgsi_pow(struct r600_shader_ctx *ctx)
5677{
5678	struct r600_bytecode_alu alu;
5679	int r;
5680
5681	/* LOG2(a) */
5682	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5683	alu.op = ALU_OP1_LOG_IEEE;
5684	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5685	alu.dst.sel = ctx->temp_reg;
5686	alu.dst.write = 1;
5687	alu.last = 1;
5688	r = r600_bytecode_add_alu(ctx->bc, &alu);
5689	if (r)
5690		return r;
5691	/* b * LOG2(a) */
5692	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5693	alu.op = ALU_OP2_MUL;
5694	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5695	alu.src[1].sel = ctx->temp_reg;
5696	alu.dst.sel = ctx->temp_reg;
5697	alu.dst.write = 1;
5698	alu.last = 1;
5699	r = r600_bytecode_add_alu(ctx->bc, &alu);
5700	if (r)
5701		return r;
5702	/* POW(a,b) = EXP2(b * LOG2(a))*/
5703	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5704	alu.op = ALU_OP1_EXP_IEEE;
5705	alu.src[0].sel = ctx->temp_reg;
5706	alu.dst.sel = ctx->temp_reg;
5707	alu.dst.write = 1;
5708	alu.last = 1;
5709	r = r600_bytecode_add_alu(ctx->bc, &alu);
5710	if (r)
5711		return r;
5712	return tgsi_helper_tempx_replicate(ctx);
5713}
5714
5715static int emit_mul_int_op(struct r600_bytecode *bc,
5716			   struct r600_bytecode_alu *alu_src)
5717{
5718	struct r600_bytecode_alu alu;
5719	int i, r;
5720	alu = *alu_src;
5721	if (bc->chip_class == CAYMAN) {
5722		for (i = 0; i < 4; i++) {
5723			alu.dst.chan = i;
5724			alu.dst.write = (i == alu_src->dst.chan);
5725			alu.last = (i == 3);
5726
5727			r = r600_bytecode_add_alu(bc, &alu);
5728			if (r)
5729				return r;
5730		}
5731	} else {
5732		alu.last = 1;
5733		r = r600_bytecode_add_alu(bc, &alu);
5734		if (r)
5735			return r;
5736	}
5737	return 0;
5738}
5739
5740static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5741{
5742	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5743	struct r600_bytecode_alu alu;
5744	int i, r, j;
5745	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5746	int lasti = tgsi_last_instruction(write_mask);
5747	int tmp0 = ctx->temp_reg;
5748	int tmp1 = r600_get_temp(ctx);
5749	int tmp2 = r600_get_temp(ctx);
5750	int tmp3 = r600_get_temp(ctx);
5751	int tmp4 = 0;
5752
5753	/* Use additional temp if dst register and src register are the same */
5754	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5755	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5756		tmp4 = r600_get_temp(ctx);
5757	}
5758
5759	/* Unsigned path:
5760	 *
5761	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5762	 *
5763	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5764	 * 2. tmp0.z = lo (tmp0.x * src2)
5765	 * 3. tmp0.w = -tmp0.z
5766	 * 4. tmp0.y = hi (tmp0.x * src2)
5767	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5768	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5769	 * 7. tmp1.x = tmp0.x - tmp0.w
5770	 * 8. tmp1.y = tmp0.x + tmp0.w
5771	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5772	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5773	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5774	 *
5775	 * 12. tmp0.w = src1 - tmp0.y       = r
5776	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5777	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5778	 *
5779	 * if DIV
5780	 *
5781	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5782	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5783	 *
5784	 * else MOD
5785	 *
5786	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5787	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5788	 *
5789	 * endif
5790	 *
5791	 * 17. tmp1.x = tmp1.x & tmp1.y
5792	 *
5793	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5794	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5795	 *
5796	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5797	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5798	 *
5799	 * Signed path:
5800	 *
5801	 * Same as unsigned, using abs values of the operands,
5802	 * and fixing the sign of the result in the end.
5803	 */
5804
5805	for (i = 0; i < 4; i++) {
5806		if (!(write_mask & (1<<i)))
5807			continue;
5808
5809		if (signed_op) {
5810
5811			/* tmp2.x = -src0 */
5812			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5813			alu.op = ALU_OP2_SUB_INT;
5814
5815			alu.dst.sel = tmp2;
5816			alu.dst.chan = 0;
5817			alu.dst.write = 1;
5818
5819			alu.src[0].sel = V_SQ_ALU_SRC_0;
5820
5821			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5822
5823			alu.last = 1;
5824			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5825				return r;
5826
5827			/* tmp2.y = -src1 */
5828			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5829			alu.op = ALU_OP2_SUB_INT;
5830
5831			alu.dst.sel = tmp2;
5832			alu.dst.chan = 1;
5833			alu.dst.write = 1;
5834
5835			alu.src[0].sel = V_SQ_ALU_SRC_0;
5836
5837			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5838
5839			alu.last = 1;
5840			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5841				return r;
5842
5843			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5844			/* it will be a sign of the quotient */
5845			if (!mod) {
5846
5847				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5848				alu.op = ALU_OP2_XOR_INT;
5849
5850				alu.dst.sel = tmp2;
5851				alu.dst.chan = 2;
5852				alu.dst.write = 1;
5853
5854				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5855				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5856
5857				alu.last = 1;
5858				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5859					return r;
5860			}
5861
5862			/* tmp2.x = |src0| */
5863			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5864			alu.op = ALU_OP3_CNDGE_INT;
5865			alu.is_op3 = 1;
5866
5867			alu.dst.sel = tmp2;
5868			alu.dst.chan = 0;
5869			alu.dst.write = 1;
5870
5871			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5872			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5873			alu.src[2].sel = tmp2;
5874			alu.src[2].chan = 0;
5875
5876			alu.last = 1;
5877			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5878				return r;
5879
5880			/* tmp2.y = |src1| */
5881			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5882			alu.op = ALU_OP3_CNDGE_INT;
5883			alu.is_op3 = 1;
5884
5885			alu.dst.sel = tmp2;
5886			alu.dst.chan = 1;
5887			alu.dst.write = 1;
5888
5889			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5890			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5891			alu.src[2].sel = tmp2;
5892			alu.src[2].chan = 1;
5893
5894			alu.last = 1;
5895			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5896				return r;
5897
5898		}
5899
5900		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
5901		if (ctx->bc->chip_class == CAYMAN) {
5902			/* tmp3.x = u2f(src2) */
5903			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5904			alu.op = ALU_OP1_UINT_TO_FLT;
5905
5906			alu.dst.sel = tmp3;
5907			alu.dst.chan = 0;
5908			alu.dst.write = 1;
5909
5910			if (signed_op) {
5911				alu.src[0].sel = tmp2;
5912				alu.src[0].chan = 1;
5913			} else {
5914				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5915			}
5916
5917			alu.last = 1;
5918			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5919				return r;
5920
5921			/* tmp0.x = recip(tmp3.x) */
5922			for (j = 0 ; j < 3; j++) {
5923				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5924				alu.op = ALU_OP1_RECIP_IEEE;
5925
5926				alu.dst.sel = tmp0;
5927				alu.dst.chan = j;
5928				alu.dst.write = (j == 0);
5929
5930				alu.src[0].sel = tmp3;
5931				alu.src[0].chan = 0;
5932
5933				if (j == 2)
5934					alu.last = 1;
5935				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5936					return r;
5937			}
5938
5939			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5940			alu.op = ALU_OP2_MUL;
5941
5942			alu.src[0].sel = tmp0;
5943			alu.src[0].chan = 0;
5944
5945			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5946			alu.src[1].value = 0x4f800000;
5947
5948			alu.dst.sel = tmp3;
5949			alu.dst.write = 1;
5950			alu.last = 1;
5951			r = r600_bytecode_add_alu(ctx->bc, &alu);
5952			if (r)
5953				return r;
5954
5955			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5956			alu.op = ALU_OP1_FLT_TO_UINT;
5957
5958			alu.dst.sel = tmp0;
5959			alu.dst.chan = 0;
5960			alu.dst.write = 1;
5961
5962			alu.src[0].sel = tmp3;
5963			alu.src[0].chan = 0;
5964
5965			alu.last = 1;
5966			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5967				return r;
5968
5969		} else {
5970			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5971			alu.op = ALU_OP1_RECIP_UINT;
5972
5973			alu.dst.sel = tmp0;
5974			alu.dst.chan = 0;
5975			alu.dst.write = 1;
5976
5977			if (signed_op) {
5978				alu.src[0].sel = tmp2;
5979				alu.src[0].chan = 1;
5980			} else {
5981				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5982			}
5983
5984			alu.last = 1;
5985			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5986				return r;
5987		}
5988
5989		/* 2. tmp0.z = lo (tmp0.x * src2) */
5990		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5991		alu.op = ALU_OP2_MULLO_UINT;
5992
5993		alu.dst.sel = tmp0;
5994		alu.dst.chan = 2;
5995		alu.dst.write = 1;
5996
5997		alu.src[0].sel = tmp0;
5998		alu.src[0].chan = 0;
5999		if (signed_op) {
6000			alu.src[1].sel = tmp2;
6001			alu.src[1].chan = 1;
6002		} else {
6003			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6004		}
6005
6006		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6007			return r;
6008
6009		/* 3. tmp0.w = -tmp0.z */
6010		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6011		alu.op = ALU_OP2_SUB_INT;
6012
6013		alu.dst.sel = tmp0;
6014		alu.dst.chan = 3;
6015		alu.dst.write = 1;
6016
6017		alu.src[0].sel = V_SQ_ALU_SRC_0;
6018		alu.src[1].sel = tmp0;
6019		alu.src[1].chan = 2;
6020
6021		alu.last = 1;
6022		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6023			return r;
6024
6025		/* 4. tmp0.y = hi (tmp0.x * src2) */
6026		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6027		alu.op = ALU_OP2_MULHI_UINT;
6028
6029		alu.dst.sel = tmp0;
6030		alu.dst.chan = 1;
6031		alu.dst.write = 1;
6032
6033		alu.src[0].sel = tmp0;
6034		alu.src[0].chan = 0;
6035
6036		if (signed_op) {
6037			alu.src[1].sel = tmp2;
6038			alu.src[1].chan = 1;
6039		} else {
6040			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6041		}
6042
6043		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6044			return r;
6045
6046		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
6047		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6048		alu.op = ALU_OP3_CNDE_INT;
6049		alu.is_op3 = 1;
6050
6051		alu.dst.sel = tmp0;
6052		alu.dst.chan = 2;
6053		alu.dst.write = 1;
6054
6055		alu.src[0].sel = tmp0;
6056		alu.src[0].chan = 1;
6057		alu.src[1].sel = tmp0;
6058		alu.src[1].chan = 3;
6059		alu.src[2].sel = tmp0;
6060		alu.src[2].chan = 2;
6061
6062		alu.last = 1;
6063		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6064			return r;
6065
6066		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
6067		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6068		alu.op = ALU_OP2_MULHI_UINT;
6069
6070		alu.dst.sel = tmp0;
6071		alu.dst.chan = 3;
6072		alu.dst.write = 1;
6073
6074		alu.src[0].sel = tmp0;
6075		alu.src[0].chan = 2;
6076
6077		alu.src[1].sel = tmp0;
6078		alu.src[1].chan = 0;
6079
6080		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6081				return r;
6082
6083		/* 7. tmp1.x = tmp0.x - tmp0.w */
6084		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6085		alu.op = ALU_OP2_SUB_INT;
6086
6087		alu.dst.sel = tmp1;
6088		alu.dst.chan = 0;
6089		alu.dst.write = 1;
6090
6091		alu.src[0].sel = tmp0;
6092		alu.src[0].chan = 0;
6093		alu.src[1].sel = tmp0;
6094		alu.src[1].chan = 3;
6095
6096		alu.last = 1;
6097		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6098			return r;
6099
6100		/* 8. tmp1.y = tmp0.x + tmp0.w */
6101		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6102		alu.op = ALU_OP2_ADD_INT;
6103
6104		alu.dst.sel = tmp1;
6105		alu.dst.chan = 1;
6106		alu.dst.write = 1;
6107
6108		alu.src[0].sel = tmp0;
6109		alu.src[0].chan = 0;
6110		alu.src[1].sel = tmp0;
6111		alu.src[1].chan = 3;
6112
6113		alu.last = 1;
6114		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6115			return r;
6116
6117		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6118		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6119		alu.op = ALU_OP3_CNDE_INT;
6120		alu.is_op3 = 1;
6121
6122		alu.dst.sel = tmp0;
6123		alu.dst.chan = 0;
6124		alu.dst.write = 1;
6125
6126		alu.src[0].sel = tmp0;
6127		alu.src[0].chan = 1;
6128		alu.src[1].sel = tmp1;
6129		alu.src[1].chan = 1;
6130		alu.src[2].sel = tmp1;
6131		alu.src[2].chan = 0;
6132
6133		alu.last = 1;
6134		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6135			return r;
6136
6137		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
6138		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6139		alu.op = ALU_OP2_MULHI_UINT;
6140
6141		alu.dst.sel = tmp0;
6142		alu.dst.chan = 2;
6143		alu.dst.write = 1;
6144
6145		alu.src[0].sel = tmp0;
6146		alu.src[0].chan = 0;
6147
6148		if (signed_op) {
6149			alu.src[1].sel = tmp2;
6150			alu.src[1].chan = 0;
6151		} else {
6152			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6153		}
6154
6155		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6156			return r;
6157
6158		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
6159		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6160		alu.op = ALU_OP2_MULLO_UINT;
6161
6162		alu.dst.sel = tmp0;
6163		alu.dst.chan = 1;
6164		alu.dst.write = 1;
6165
6166		if (signed_op) {
6167			alu.src[0].sel = tmp2;
6168			alu.src[0].chan = 1;
6169		} else {
6170			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6171		}
6172
6173		alu.src[1].sel = tmp0;
6174		alu.src[1].chan = 2;
6175
6176		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6177			return r;
6178
6179		/* 12. tmp0.w = src1 - tmp0.y       = r */
6180		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6181		alu.op = ALU_OP2_SUB_INT;
6182
6183		alu.dst.sel = tmp0;
6184		alu.dst.chan = 3;
6185		alu.dst.write = 1;
6186
6187		if (signed_op) {
6188			alu.src[0].sel = tmp2;
6189			alu.src[0].chan = 0;
6190		} else {
6191			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6192		}
6193
6194		alu.src[1].sel = tmp0;
6195		alu.src[1].chan = 1;
6196
6197		alu.last = 1;
6198		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6199			return r;
6200
6201		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
6202		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6203		alu.op = ALU_OP2_SETGE_UINT;
6204
6205		alu.dst.sel = tmp1;
6206		alu.dst.chan = 0;
6207		alu.dst.write = 1;
6208
6209		alu.src[0].sel = tmp0;
6210		alu.src[0].chan = 3;
6211		if (signed_op) {
6212			alu.src[1].sel = tmp2;
6213			alu.src[1].chan = 1;
6214		} else {
6215			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6216		}
6217
6218		alu.last = 1;
6219		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6220			return r;
6221
6222		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
6223		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6224		alu.op = ALU_OP2_SETGE_UINT;
6225
6226		alu.dst.sel = tmp1;
6227		alu.dst.chan = 1;
6228		alu.dst.write = 1;
6229
6230		if (signed_op) {
6231			alu.src[0].sel = tmp2;
6232			alu.src[0].chan = 0;
6233		} else {
6234			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6235		}
6236
6237		alu.src[1].sel = tmp0;
6238		alu.src[1].chan = 1;
6239
6240		alu.last = 1;
6241		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6242			return r;
6243
6244		if (mod) { /* UMOD */
6245
6246			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
6247			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6248			alu.op = ALU_OP2_SUB_INT;
6249
6250			alu.dst.sel = tmp1;
6251			alu.dst.chan = 2;
6252			alu.dst.write = 1;
6253
6254			alu.src[0].sel = tmp0;
6255			alu.src[0].chan = 3;
6256
6257			if (signed_op) {
6258				alu.src[1].sel = tmp2;
6259				alu.src[1].chan = 1;
6260			} else {
6261				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6262			}
6263
6264			alu.last = 1;
6265			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6266				return r;
6267
6268			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
6269			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6270			alu.op = ALU_OP2_ADD_INT;
6271
6272			alu.dst.sel = tmp1;
6273			alu.dst.chan = 3;
6274			alu.dst.write = 1;
6275
6276			alu.src[0].sel = tmp0;
6277			alu.src[0].chan = 3;
6278			if (signed_op) {
6279				alu.src[1].sel = tmp2;
6280				alu.src[1].chan = 1;
6281			} else {
6282				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6283			}
6284
6285			alu.last = 1;
6286			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6287				return r;
6288
6289		} else { /* UDIV */
6290
6291			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
6292			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6293			alu.op = ALU_OP2_ADD_INT;
6294
6295			alu.dst.sel = tmp1;
6296			alu.dst.chan = 2;
6297			alu.dst.write = 1;
6298
6299			alu.src[0].sel = tmp0;
6300			alu.src[0].chan = 2;
6301			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6302
6303			alu.last = 1;
6304			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6305				return r;
6306
6307			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
6308			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6309			alu.op = ALU_OP2_ADD_INT;
6310
6311			alu.dst.sel = tmp1;
6312			alu.dst.chan = 3;
6313			alu.dst.write = 1;
6314
6315			alu.src[0].sel = tmp0;
6316			alu.src[0].chan = 2;
6317			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6318
6319			alu.last = 1;
6320			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6321				return r;
6322
6323		}
6324
6325		/* 17. tmp1.x = tmp1.x & tmp1.y */
6326		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6327		alu.op = ALU_OP2_AND_INT;
6328
6329		alu.dst.sel = tmp1;
6330		alu.dst.chan = 0;
6331		alu.dst.write = 1;
6332
6333		alu.src[0].sel = tmp1;
6334		alu.src[0].chan = 0;
6335		alu.src[1].sel = tmp1;
6336		alu.src[1].chan = 1;
6337
6338		alu.last = 1;
6339		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6340			return r;
6341
6342		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
6343		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
6344		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6345		alu.op = ALU_OP3_CNDE_INT;
6346		alu.is_op3 = 1;
6347
6348		alu.dst.sel = tmp0;
6349		alu.dst.chan = 2;
6350		alu.dst.write = 1;
6351
6352		alu.src[0].sel = tmp1;
6353		alu.src[0].chan = 0;
6354		alu.src[1].sel = tmp0;
6355		alu.src[1].chan = mod ? 3 : 2;
6356		alu.src[2].sel = tmp1;
6357		alu.src[2].chan = 2;
6358
6359		alu.last = 1;
6360		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6361			return r;
6362
6363		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6364		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6365		alu.op = ALU_OP3_CNDE_INT;
6366		alu.is_op3 = 1;
6367
6368		if (signed_op) {
6369			alu.dst.sel = tmp0;
6370			alu.dst.chan = 2;
6371			alu.dst.write = 1;
6372		} else {
6373			if (tmp4 > 0) {
6374				alu.dst.sel = tmp4;
6375				alu.dst.chan = i;
6376				alu.dst.write = 1;
6377			} else {
6378				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6379			}
6380		}
6381
6382		alu.src[0].sel = tmp1;
6383		alu.src[0].chan = 1;
6384		alu.src[1].sel = tmp1;
6385		alu.src[1].chan = 3;
6386		alu.src[2].sel = tmp0;
6387		alu.src[2].chan = 2;
6388
6389		alu.last = 1;
6390		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6391			return r;
6392
6393		if (signed_op) {
6394
6395			/* fix the sign of the result */
6396
6397			if (mod) {
6398
6399				/* tmp0.x = -tmp0.z */
6400				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6401				alu.op = ALU_OP2_SUB_INT;
6402
6403				alu.dst.sel = tmp0;
6404				alu.dst.chan = 0;
6405				alu.dst.write = 1;
6406
6407				alu.src[0].sel = V_SQ_ALU_SRC_0;
6408				alu.src[1].sel = tmp0;
6409				alu.src[1].chan = 2;
6410
6411				alu.last = 1;
6412				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6413					return r;
6414
6415				/* sign of the remainder is the same as the sign of src0 */
6416				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6417				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6418				alu.op = ALU_OP3_CNDGE_INT;
6419				alu.is_op3 = 1;
6420
6421				if (tmp4 > 0) {
6422					alu.dst.sel = tmp4;
6423					alu.dst.chan = i;
6424					alu.dst.write = 1;
6425				} else {
6426					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6427				}
6428
6429				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6430				alu.src[1].sel = tmp0;
6431				alu.src[1].chan = 2;
6432				alu.src[2].sel = tmp0;
6433				alu.src[2].chan = 0;
6434
6435				alu.last = 1;
6436				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6437					return r;
6438
6439			} else {
6440
6441				/* tmp0.x = -tmp0.z */
6442				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6443				alu.op = ALU_OP2_SUB_INT;
6444
6445				alu.dst.sel = tmp0;
6446				alu.dst.chan = 0;
6447				alu.dst.write = 1;
6448
6449				alu.src[0].sel = V_SQ_ALU_SRC_0;
6450				alu.src[1].sel = tmp0;
6451				alu.src[1].chan = 2;
6452
6453				alu.last = 1;
6454				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6455					return r;
6456
6457				/* fix the quotient sign (same as the sign of src0*src1) */
6458				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6459				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6460				alu.op = ALU_OP3_CNDGE_INT;
6461				alu.is_op3 = 1;
6462
6463				if (tmp4 > 0) {
6464					alu.dst.sel = tmp4;
6465					alu.dst.chan = i;
6466					alu.dst.write = 1;
6467				} else {
6468					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6469				}
6470
6471				alu.src[0].sel = tmp2;
6472				alu.src[0].chan = 2;
6473				alu.src[1].sel = tmp0;
6474				alu.src[1].chan = 2;
6475				alu.src[2].sel = tmp0;
6476				alu.src[2].chan = 0;
6477
6478				alu.last = 1;
6479				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6480					return r;
6481			}
6482		}
6483	}
6484
6485	if (tmp4 > 0) {
6486		for (i = 0; i <= lasti; ++i) {
6487			if (!(write_mask & (1<<i)))
6488				continue;
6489
6490			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6491			alu.op = ALU_OP1_MOV;
6492			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6493			alu.src[0].sel = tmp4;
6494			alu.src[0].chan = i;
6495
6496			if (i == lasti)
6497				alu.last = 1;
6498			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6499				return r;
6500		}
6501	}
6502
6503	return 0;
6504}
6505
/* TGSI UDIV: unsigned quotient (tgsi_divmod args: mod=0, signed_op=0). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
6510
/* TGSI UMOD: unsigned remainder (tgsi_divmod args: mod=1, signed_op=0). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
6515
/* TGSI IDIV: signed quotient (tgsi_divmod args: mod=0, signed_op=1;
 * the signed path does the unsigned divide on |operands| and fixes up
 * the result sign afterwards). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
6520
/* TGSI IMOD: signed remainder (tgsi_divmod args: mod=1, signed_op=1;
 * the remainder takes the sign of src0 in the signed fixup). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
6525
6526
/* Float -> integer conversion (e.g. TGSI F2I/F2U; the actual conversion
 * opcode comes from ctx->inst_info->op).
 *
 * Emitted as two passes over the written channels:
 *   1. TRUNC the float source into temp_reg (round toward zero);
 *   2. run the conversion op from temp_reg into the real destination.
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu.
 */
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* Pass 1: temp_reg.chan = trunc(src0.chan) for every written channel. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst.chan = convert(temp_reg.chan). */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		/* FLT_TO_UINT is forced to close the ALU group on every channel —
		 * presumably because it is a trans/t-slot-only op on some chips
		 * (see the CAYMAN notes at the top of this file). */
		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6575
/* Integer absolute value (TGSI IABS): dst = src >= 0 ? src : -src.
 *
 * There is no integer abs modifier, so negate into a temp first and then
 * select with CNDGE_INT (dst = src0 >= 0 ? src1 : src2).
 */
static int tgsi_iabs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = -src  (computed as 0 - src, since there is no integer neg modifier) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (src >= 0 ? src : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6631
/* Integer sign (TGSI ISSG): dst = src > 0 ? 1 : (src < 0 ? -1 : 0).
 *
 * Built from two conditional selects:
 *   tmp = src >= 0 ? src : -1   (CNDGE_INT: negatives collapse to -1)
 *   dst = tmp  > 0 ? 1   : tmp  (CNDGT_INT: positives collapse to 1,
 *                                0 and -1 pass through)
 */
static int tgsi_issg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = (src >= 0 ? src : -1) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (tmp > 0 ? 1 : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6692
6693
6694
/* Float sign (TGSI SSG): dst = src > 0 ? 1.0 : (src < 0 ? -1.0 : 0.0).
 *
 * Two float conditional selects (CNDGT: dst = src0 > 0 ? src1 : src2):
 *   tmp = src  > 0 ?  1.0 : src
 *   dst = -tmp > 0 ? -1.0 : tmp   (negation via the .neg source modifier)
 */
static int tgsi_ssg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);
	struct r600_bytecode_alu alu;
	int i, r;

	/* tmp = (src > 0 ? 1 : src) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;

		/* NOTE(review): dst.write is not set here, unlike the other
		 * emitters in this file — presumably op3 ALU encodings always
		 * write their destination; confirm against the ISA docs. */
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (-tmp > 0 ? -1 : tmp) */
	for (i = 0; i <= last_inst; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		alu.src[0].neg = 1;

		/* -1.0 expressed as the constant 1.0 with the neg modifier. */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[1].neg = 1;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6752
/* Bitfield insert (TGSI BFI): insert the low `bits` bits of src1 into src0
 * at bit position `offset`, where offset = src2 and bits = src3.
 *
 * Emission plan, one ALU group per step over the written channels:
 *   temp_reg = (src3 >= 32)                   -- width-overflow flag
 *   t1       = BFM(src3, src2)                -- mask of `bits` ones at `offset`
 *   t2       = src1 << src2                   -- insert value shifted into place
 *   dst      = BFI(t1, t2, src0)              -- (t2 & t1) | (src0 & ~t1)
 *   dst      = temp_reg == 0 ? dst : src1     -- bits >= 32 selects src1 whole
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (bits >= 32), used for the final select. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Overflow fixup: dst = (temp_reg == 0 ? dst : src1), i.e. when
	 * bits >= 32 the whole insert value replaces the result.  src[1]
	 * reads back the BFI result from the instruction's own destination. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6870
/* Find most-significant bit (TGSI IMSB/UMSB via FFBH_INT/FFBH_UINT).
 *
 * The hardware FFBH counts from the MSB side while TGSI wants the bit
 * index counted from the LSB, so the result is converted with 31 - t1.
 * A negative FFBH result (presumably the hardware's "no bit found"
 * marker, -1 — confirm against the ISA docs) is passed through
 * unchanged by the final CNDGE select.
 */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6956
/* Pixel-shader interpolation opcodes for Evergreen/Cayman
 * (TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE).
 *
 * Picks a pre-allocated i/j interpolator pair, optionally adjusts it by
 * an offset or sample position using screen-space gradients, then emits
 * the INTERP_ZW / INTERP_XY pairs and moves the swizzled result into the
 * destination (the INTERP ops themselves cannot swizzle their dst).
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* Two i/j pairs are packed per GPR; find which GPR/channel pair holds
	 * the interpolator for this input's mode+location. */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch d(ij)/dx and d(ij)/dy of the barycentrics via the
		 * texture unit's gradient instructions. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp = ij + gradH * offset.x  (offset.x from the sample
		 * position GPR for INTERP_SAMPLE, or from src1 directly). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp = temp + gradV * offset.y — adjusted ij now in temp_reg. */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Emit the interpolation itself: lanes 0-3 are INTERP_ZW (only
	 * chans 2-3 kept), lanes 4-7 are INTERP_XY (only chans 0-1 kept);
	 * the i/j source channel alternates per lane (1 - i%2). */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
7118
7119
7120static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7121{
7122	struct r600_bytecode_alu alu;
7123	int i, r;
7124
7125	for (i = 0; i < 4; i++) {
7126		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7127		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7128			alu.op = ALU_OP0_NOP;
7129			alu.dst.chan = i;
7130		} else {
7131			alu.op = ALU_OP1_MOV;
7132			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7133			alu.src[0].sel = ctx->temp_reg;
7134			alu.src[0].chan = i;
7135		}
7136		if (i == 3) {
7137			alu.last = 1;
7138		}
7139		r = r600_bytecode_add_alu(ctx->bc, &alu);
7140		if (r)
7141			return r;
7142	}
7143	return 0;
7144}
7145
/* Prepare the four per-channel bytecode sources for an op3 ALU instruction.
 *
 * Fills bc_src[0..3] with the per-channel reads of shader_src.  If the
 * source carries an abs modifier (checking bc_src[0] suffices since
 * r600_bytecode_src derives all four from the same shader_src), the
 * values are first copied through a fresh temp with MOV and the sources
 * are rewritten to read that temp — presumably because op3 encodings
 * cannot express abs on a source (TODO confirm against the ISA docs).
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu.
 */
static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
                                 unsigned writemask,
                                 struct r600_bytecode_alu_src *bc_src,
                                 const struct r600_shader_src *shader_src)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(writemask);
	int temp_reg = 0;

	r600_bytecode_src(&bc_src[0], shader_src, 0);
	r600_bytecode_src(&bc_src[1], shader_src, 1);
	r600_bytecode_src(&bc_src[2], shader_src, 2);
	r600_bytecode_src(&bc_src[3], shader_src, 3);

	if (bc_src->abs) {
		temp_reg = r600_get_temp(ctx);

		for (i = 0; i < lasti + 1; i++) {
			if (!(writemask & (1 << i)))
				continue;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
			/* the MOV applies the abs modifier carried by bc_src[i] */
			alu.src[0] = bc_src[i];
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* redirect this channel's source to the modifier-free temp */
			memset(&bc_src[i], 0, sizeof(*bc_src));
			bc_src[i].sel = temp_reg;
			bc_src[i].chan = i;
		}
	}
	return 0;
}
7186
/* Emit a generic three-operand ALU instruction (op from ctx->inst_info->op).
 *
 * dst selects the destination register: -1 means use the TGSI
 * instruction's own destination (via tgsi_dst), otherwise write to the
 * given register with the TGSI channel layout.
 *
 * MULADD_IEEE is demoted to MULADD when the shader sets the
 * MUL_ZERO_WINS property (0 * anything must be 0).
 */
static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src srcs[4][4];
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP3_MULADD_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP3_MULADD;

	/* Pre-resolve every source for every channel; this also strips abs
	 * modifiers through a temp where needed (see tgsi_make_src_for_op3). */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			alu.src[j] = srcs[j][i];
		}

		if (dst == -1) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		} else {
			alu.dst.sel = dst;
		}
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7234
/* Three-operand ALU op writing to the TGSI instruction's own destination. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
7239
7240static int tgsi_dp(struct r600_shader_ctx *ctx)
7241{
7242	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7243	struct r600_bytecode_alu alu;
7244	int i, j, r;
7245	unsigned op = ctx->inst_info->op;
7246	if (op == ALU_OP2_DOT4_IEEE &&
7247	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7248		op = ALU_OP2_DOT4;
7249
7250	for (i = 0; i < 4; i++) {
7251		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7252		alu.op = op;
7253		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7254			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7255		}
7256
7257		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7258		alu.dst.chan = i;
7259		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7260		/* handle some special cases */
7261		switch (inst->Instruction.Opcode) {
7262		case TGSI_OPCODE_DP2:
7263			if (i > 1) {
7264				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7265				alu.src[0].chan = alu.src[1].chan = 0;
7266			}
7267			break;
7268		case TGSI_OPCODE_DP3:
7269			if (i > 2) {
7270				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7271				alu.src[0].chan = alu.src[1].chan = 0;
7272			}
7273			break;
7274		default:
7275			break;
7276		}
7277		if (i == 3) {
7278			alu.last = 1;
7279		}
7280		r = r600_bytecode_add_alu(ctx->bc, &alu);
7281		if (r)
7282			return r;
7283	}
7284	return 0;
7285}
7286
7287static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7288						    unsigned index)
7289{
7290	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7291	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7292		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7293		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7294		ctx->src[index].neg || ctx->src[index].abs ||
7295		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7296}
7297
7298static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7299					unsigned index)
7300{
7301	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7302	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7303}
7304
7305static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7306{
7307	struct r600_bytecode_vtx vtx;
7308	struct r600_bytecode_alu alu;
7309	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7310	int src_gpr, r, i;
7311	int id = tgsi_tex_get_src_gpr(ctx, 1);
7312	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7313
7314	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7315	if (src_requires_loading) {
7316		for (i = 0; i < 4; i++) {
7317			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7318			alu.op = ALU_OP1_MOV;
7319			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7320			alu.dst.sel = ctx->temp_reg;
7321			alu.dst.chan = i;
7322			if (i == 3)
7323				alu.last = 1;
7324			alu.dst.write = 1;
7325			r = r600_bytecode_add_alu(ctx->bc, &alu);
7326			if (r)
7327				return r;
7328		}
7329		src_gpr = ctx->temp_reg;
7330	}
7331
7332	memset(&vtx, 0, sizeof(vtx));
7333	vtx.op = FETCH_OP_VFETCH;
7334	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7335	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7336	vtx.src_gpr = src_gpr;
7337	vtx.mega_fetch_count = 16;
7338	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7339	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
7340	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
7341	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
7342	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
7343	vtx.use_const_fields = 1;
7344	vtx.buffer_index_mode = sampler_index_mode;
7345
7346	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7347		return r;
7348
7349	if (ctx->bc->chip_class >= EVERGREEN)
7350		return 0;
7351
7352	for (i = 0; i < 4; i++) {
7353		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7354		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7355			continue;
7356
7357		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7358		alu.op = ALU_OP2_AND_INT;
7359
7360		alu.dst.chan = i;
7361		alu.dst.sel = vtx.dst_gpr;
7362		alu.dst.write = 1;
7363
7364		alu.src[0].sel = vtx.dst_gpr;
7365		alu.src[0].chan = i;
7366
7367		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7368		alu.src[1].sel += (id * 2);
7369		alu.src[1].chan = i % 4;
7370		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7371
7372		if (i == lasti)
7373			alu.last = 1;
7374		r = r600_bytecode_add_alu(ctx->bc, &alu);
7375		if (r)
7376			return r;
7377	}
7378
7379	if (inst->Dst[0].Register.WriteMask & 3) {
7380		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7381		alu.op = ALU_OP2_OR_INT;
7382
7383		alu.dst.chan = 3;
7384		alu.dst.sel = vtx.dst_gpr;
7385		alu.dst.write = 1;
7386
7387		alu.src[0].sel = vtx.dst_gpr;
7388		alu.src[0].chan = 3;
7389
7390		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7391		alu.src[1].chan = 0;
7392		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7393
7394		alu.last = 1;
7395		r = r600_bytecode_add_alu(ctx->bc, &alu);
7396		if (r)
7397			return r;
7398	}
7399	return 0;
7400}
7401
/* Emit a TXQ (size query) for a buffer texture.
 *
 * reg_idx selects the TGSI source register holding the resource; offset is
 * added to the resolved resource id; eg_buffer_base is the base resource
 * slot used on Evergreen+.
 *
 * Pre-Evergreen chips read the size from the driver-supplied buffer-info
 * constant buffer; Evergreen+ chips use a GET_BUFFER_RESINFO fetch.
 * Returns 0 on success or a negative error code.
 */
static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->bc->chip_class < EVERGREEN) {
		/* MOV the size constant straight into the destination. */
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* r600 we have them at channel 2 of the second dword */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		return 0;
	} else {
		/* Query the resource descriptor directly via a TC fetch. */
		struct r600_bytecode_vtx vtx;
		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
		vtx.buffer_id = id + eg_buffer_base;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.src_gpr = 0;
		vtx.mega_fetch_count = 16; /* no idea here really... */
		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
		/* X gets the size; Y/Z/W are forced to 0 (SEL 4) when written,
		 * masked out (SEL 7) otherwise. */
		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
		vtx.data_format = FMT_32_32_32_32;
		vtx.buffer_index_mode = sampler_index_mode;

		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
			return r;
		return 0;
	}
}
7445
7446
7447static int tgsi_tex(struct r600_shader_ctx *ctx)
7448{
7449	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7450	struct r600_bytecode_tex tex;
7451	struct r600_bytecode_tex grad_offs[3];
7452	struct r600_bytecode_alu alu;
7453	unsigned src_gpr;
7454	int r, i, j, n_grad_offs = 0;
7455	int opcode;
7456	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7457				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7458				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7459				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7460
7461	bool txf_add_offsets = inst->Texture.NumOffsets &&
7462			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7463			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7464
7465	/* Texture fetch instructions can only use gprs as source.
7466	 * Also they cannot negate the source or take the absolute value */
7467	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7468                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
7469					     read_compressed_msaa || txf_add_offsets;
7470
7471	boolean src_loaded = FALSE;
7472	unsigned sampler_src_reg = 1;
7473	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7474	boolean has_txq_cube_array_z = false;
7475	unsigned sampler_index_mode;
7476	int array_index_offset_channel = -1;
7477
7478	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7479	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7480	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7481		if (inst->Dst[0].Register.WriteMask & 4) {
7482			ctx->shader->has_txq_cube_array_z_comp = true;
7483			has_txq_cube_array_z = true;
7484		}
7485
7486	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7487	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7488	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7489	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7490		sampler_src_reg = 2;
7491
7492	/* TGSI moves the sampler to src reg 3 for TXD */
7493	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7494		sampler_src_reg = 3;
7495
7496	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7497
7498	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7499
7500	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7501		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7502			if (ctx->bc->chip_class < EVERGREEN)
7503				ctx->shader->uses_tex_buffers = true;
7504			return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7505		}
7506		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7507			if (ctx->bc->chip_class < EVERGREEN)
7508				ctx->shader->uses_tex_buffers = true;
7509			return do_vtx_fetch_inst(ctx, src_requires_loading);
7510		}
7511	}
7512
7513	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7514		int out_chan;
7515		/* Add perspective divide */
7516		if (ctx->bc->chip_class == CAYMAN) {
7517			out_chan = 2;
7518			for (i = 0; i < 3; i++) {
7519				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7520				alu.op = ALU_OP1_RECIP_IEEE;
7521				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7522
7523				alu.dst.sel = ctx->temp_reg;
7524				alu.dst.chan = i;
7525				if (i == 2)
7526					alu.last = 1;
7527				if (out_chan == i)
7528					alu.dst.write = 1;
7529				r = r600_bytecode_add_alu(ctx->bc, &alu);
7530				if (r)
7531					return r;
7532			}
7533
7534		} else {
7535			out_chan = 3;
7536			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7537			alu.op = ALU_OP1_RECIP_IEEE;
7538			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7539
7540			alu.dst.sel = ctx->temp_reg;
7541			alu.dst.chan = out_chan;
7542			alu.last = 1;
7543			alu.dst.write = 1;
7544			r = r600_bytecode_add_alu(ctx->bc, &alu);
7545			if (r)
7546				return r;
7547		}
7548
7549		for (i = 0; i < 3; i++) {
7550			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7551			alu.op = ALU_OP2_MUL;
7552			alu.src[0].sel = ctx->temp_reg;
7553			alu.src[0].chan = out_chan;
7554			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7555			alu.dst.sel = ctx->temp_reg;
7556			alu.dst.chan = i;
7557			alu.dst.write = 1;
7558			r = r600_bytecode_add_alu(ctx->bc, &alu);
7559			if (r)
7560				return r;
7561		}
7562		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7563		alu.op = ALU_OP1_MOV;
7564		alu.src[0].sel = V_SQ_ALU_SRC_1;
7565		alu.src[0].chan = 0;
7566		alu.dst.sel = ctx->temp_reg;
7567		alu.dst.chan = 3;
7568		alu.last = 1;
7569		alu.dst.write = 1;
7570		r = r600_bytecode_add_alu(ctx->bc, &alu);
7571		if (r)
7572			return r;
7573		src_loaded = TRUE;
7574		src_gpr = ctx->temp_reg;
7575	}
7576
7577
7578	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7579	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7580	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7581	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7582	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7583
7584		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7585		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7586
7587		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7588		for (i = 0; i < 4; i++) {
7589			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7590			alu.op = ALU_OP2_CUBE;
7591			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7592			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7593			alu.dst.sel = ctx->temp_reg;
7594			alu.dst.chan = i;
7595			if (i == 3)
7596				alu.last = 1;
7597			alu.dst.write = 1;
7598			r = r600_bytecode_add_alu(ctx->bc, &alu);
7599			if (r)
7600				return r;
7601		}
7602
7603		/* tmp1.z = RCP_e(|tmp1.z|) */
7604		if (ctx->bc->chip_class == CAYMAN) {
7605			for (i = 0; i < 3; i++) {
7606				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7607				alu.op = ALU_OP1_RECIP_IEEE;
7608				alu.src[0].sel = ctx->temp_reg;
7609				alu.src[0].chan = 2;
7610				alu.src[0].abs = 1;
7611				alu.dst.sel = ctx->temp_reg;
7612				alu.dst.chan = i;
7613				if (i == 2)
7614					alu.dst.write = 1;
7615				if (i == 2)
7616					alu.last = 1;
7617				r = r600_bytecode_add_alu(ctx->bc, &alu);
7618				if (r)
7619					return r;
7620			}
7621		} else {
7622			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7623			alu.op = ALU_OP1_RECIP_IEEE;
7624			alu.src[0].sel = ctx->temp_reg;
7625			alu.src[0].chan = 2;
7626			alu.src[0].abs = 1;
7627			alu.dst.sel = ctx->temp_reg;
7628			alu.dst.chan = 2;
7629			alu.dst.write = 1;
7630			alu.last = 1;
7631			r = r600_bytecode_add_alu(ctx->bc, &alu);
7632			if (r)
7633				return r;
7634		}
7635
7636		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
7637		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
7638		 * muladd has no writemask, have to use another temp
7639		 */
7640		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7641		alu.op = ALU_OP3_MULADD;
7642		alu.is_op3 = 1;
7643
7644		alu.src[0].sel = ctx->temp_reg;
7645		alu.src[0].chan = 0;
7646		alu.src[1].sel = ctx->temp_reg;
7647		alu.src[1].chan = 2;
7648
7649		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7650		alu.src[2].chan = 0;
7651		alu.src[2].value = u_bitcast_f2u(1.5f);
7652
7653		alu.dst.sel = ctx->temp_reg;
7654		alu.dst.chan = 0;
7655		alu.dst.write = 1;
7656
7657		r = r600_bytecode_add_alu(ctx->bc, &alu);
7658		if (r)
7659			return r;
7660
7661		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7662		alu.op = ALU_OP3_MULADD;
7663		alu.is_op3 = 1;
7664
7665		alu.src[0].sel = ctx->temp_reg;
7666		alu.src[0].chan = 1;
7667		alu.src[1].sel = ctx->temp_reg;
7668		alu.src[1].chan = 2;
7669
7670		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7671		alu.src[2].chan = 0;
7672		alu.src[2].value = u_bitcast_f2u(1.5f);
7673
7674		alu.dst.sel = ctx->temp_reg;
7675		alu.dst.chan = 1;
7676		alu.dst.write = 1;
7677
7678		alu.last = 1;
7679		r = r600_bytecode_add_alu(ctx->bc, &alu);
7680		if (r)
7681			return r;
7682		/* write initial compare value into Z component
7683		  - W src 0 for shadow cube
7684		  - X src 1 for shadow cube array */
7685		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7686		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7687			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7688			alu.op = ALU_OP1_MOV;
7689			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7690				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7691			else
7692				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7693			alu.dst.sel = ctx->temp_reg;
7694			alu.dst.chan = 2;
7695			alu.dst.write = 1;
7696			alu.last = 1;
7697			r = r600_bytecode_add_alu(ctx->bc, &alu);
7698			if (r)
7699				return r;
7700		}
7701
7702		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7703		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7704			if (ctx->bc->chip_class >= EVERGREEN) {
7705				int mytmp = r600_get_temp(ctx);
7706				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7707				alu.op = ALU_OP1_MOV;
7708				alu.src[0].sel = ctx->temp_reg;
7709				alu.src[0].chan = 3;
7710				alu.dst.sel = mytmp;
7711				alu.dst.chan = 0;
7712				alu.dst.write = 1;
7713				alu.last = 1;
7714				r = r600_bytecode_add_alu(ctx->bc, &alu);
7715				if (r)
7716					return r;
7717
7718				/* Evaluate the array index according to floor(idx + 0.5). This
7719				 * needs to be done before merging the face select value, because
7720				 * otherwise the fractional part of the array index will interfere
7721				 * with the face select value */
7722				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7723				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7724				alu.op = ALU_OP1_RNDNE;
7725				alu.dst.sel = ctx->temp_reg;
7726				alu.dst.chan = 3;
7727				alu.dst.write = 1;
7728				alu.last = 1;
7729				r = r600_bytecode_add_alu(ctx->bc, &alu);
7730				if (r)
7731					return r;
7732
7733				/* Because the array slice index and the cube face index are merged
7734				 * into one value we have to make sure the array slice index is >= 0,
7735				 * otherwise the face selection will fail */
7736				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7737				alu.op = ALU_OP2_MAX;
7738				alu.src[0].sel = ctx->temp_reg;
7739				alu.src[0].chan = 3;
7740				alu.src[1].sel = V_SQ_ALU_SRC_0;
7741				alu.dst.sel = ctx->temp_reg;
7742				alu.dst.chan = 3;
7743				alu.dst.write = 1;
7744				alu.last = 1;
7745				r = r600_bytecode_add_alu(ctx->bc, &alu);
7746				if (r)
7747					return r;
7748
7749				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7750				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7751				alu.op = ALU_OP3_MULADD;
7752				alu.is_op3 = 1;
7753				alu.src[0].sel = ctx->temp_reg;
7754				alu.src[0].chan = 3;
7755				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7756				alu.src[1].chan = 0;
7757				alu.src[1].value = u_bitcast_f2u(8.0f);
7758				alu.src[2].sel = mytmp;
7759				alu.src[2].chan = 0;
7760				alu.dst.sel = ctx->temp_reg;
7761				alu.dst.chan = 3;
7762				alu.dst.write = 1;
7763				alu.last = 1;
7764				r = r600_bytecode_add_alu(ctx->bc, &alu);
7765				if (r)
7766					return r;
7767			} else if (ctx->bc->chip_class < EVERGREEN) {
7768				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7769				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7770				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7771				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7772				tex.src_gpr = r600_get_temp(ctx);
7773				tex.src_sel_x = 0;
7774				tex.src_sel_y = 0;
7775				tex.src_sel_z = 0;
7776				tex.src_sel_w = 0;
7777				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7778				tex.coord_type_x = 1;
7779				tex.coord_type_y = 1;
7780				tex.coord_type_z = 1;
7781				tex.coord_type_w = 1;
7782				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7783				alu.op = ALU_OP1_MOV;
7784				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7785				alu.dst.sel = tex.src_gpr;
7786				alu.dst.chan = 0;
7787				alu.last = 1;
7788				alu.dst.write = 1;
7789				r = r600_bytecode_add_alu(ctx->bc, &alu);
7790				if (r)
7791					return r;
7792
7793				r = r600_bytecode_add_tex(ctx->bc, &tex);
7794				if (r)
7795					return r;
7796			}
7797
7798		}
7799
7800		/* for cube forms of lod and bias we need to route things */
7801		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7802		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7803		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7804		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7805			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7806			alu.op = ALU_OP1_MOV;
7807			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7808			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7809				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7810			else
7811				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7812			alu.dst.sel = ctx->temp_reg;
7813			alu.dst.chan = 2;
7814			alu.last = 1;
7815			alu.dst.write = 1;
7816			r = r600_bytecode_add_alu(ctx->bc, &alu);
7817			if (r)
7818				return r;
7819		}
7820
7821		src_loaded = TRUE;
7822		src_gpr = ctx->temp_reg;
7823	}
7824
7825	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7826		int temp_h = 0, temp_v = 0;
7827		int start_val = 0;
7828
7829		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7830		if (src_loaded == TRUE)
7831			start_val = 1;
7832		else
7833			src_loaded = TRUE;
7834		for (i = start_val; i < 3; i++) {
7835			int treg = r600_get_temp(ctx);
7836
7837			if (i == 0)
7838				src_gpr = treg;
7839			else if (i == 1)
7840				temp_h = treg;
7841			else
7842				temp_v = treg;
7843
7844			for (j = 0; j < 4; j++) {
7845				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7846				alu.op = ALU_OP1_MOV;
7847                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7848                                alu.dst.sel = treg;
7849                                alu.dst.chan = j;
7850                                if (j == 3)
7851                                   alu.last = 1;
7852                                alu.dst.write = 1;
7853                                r = r600_bytecode_add_alu(ctx->bc, &alu);
7854                                if (r)
7855                                    return r;
7856			}
7857		}
7858		for (i = 1; i < 3; i++) {
7859			/* set gradients h/v */
7860			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7861			memset(t, 0, sizeof(struct r600_bytecode_tex));
7862			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7863				FETCH_OP_SET_GRADIENTS_V;
7864			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7865			t->sampler_index_mode = sampler_index_mode;
7866			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7867			t->resource_index_mode = sampler_index_mode;
7868
7869			t->src_gpr = (i == 1) ? temp_h : temp_v;
7870			t->src_sel_x = 0;
7871			t->src_sel_y = 1;
7872			t->src_sel_z = 2;
7873			t->src_sel_w = 3;
7874
7875			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7876			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7877			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7878				t->coord_type_x = 1;
7879				t->coord_type_y = 1;
7880				t->coord_type_z = 1;
7881				t->coord_type_w = 1;
7882			}
7883		}
7884	}
7885
7886	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7887		/* Gather4 should follow the same rules as bilinear filtering, but the hardware
7888		 * incorrectly forces nearest filtering if the texture format is integer.
7889		 * The only effect it has on Gather4, which always returns 4 texels for
7890		 * bilinear filtering, is that the final coordinates are off by 0.5 of
7891		 * the texel size.
7892		 *
7893		 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7894		 * or (0.5 / size) from the normalized coordinates.
7895		 */
7896		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7897		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7898			int treg = r600_get_temp(ctx);
7899
7900			/* mov array and comparison oordinate to temp_reg if needed */
7901			if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7902			     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7903			     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7904				int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7905				for (i = 2; i <= end; i++) {
7906					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7907					alu.op = ALU_OP1_MOV;
7908					alu.dst.sel = ctx->temp_reg;
7909					alu.dst.chan = i;
7910					alu.dst.write = 1;
7911					alu.last = (i == end);
7912					r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7913					r = r600_bytecode_add_alu(ctx->bc, &alu);
7914					if (r)
7915						return r;
7916				}
7917			}
7918
7919			if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
7920			    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
7921				for (i = 0; i < 2; i++) {
7922					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7923					alu.op = ALU_OP2_ADD;
7924					alu.dst.sel = ctx->temp_reg;
7925					alu.dst.chan = i;
7926					alu.dst.write = 1;
7927					alu.last = i == 1;
7928					if (src_loaded) {
7929						alu.src[0].sel = ctx->temp_reg;
7930						alu.src[0].chan = i;
7931					} else
7932						r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7933					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7934					alu.src[1].neg = 1;
7935					r = r600_bytecode_add_alu(ctx->bc, &alu);
7936					if (r)
7937						return r;
7938				}
7939			} else {
7940				/* execute a TXQ */
7941				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7942				tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
7943				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7944				tex.sampler_index_mode = sampler_index_mode;
7945				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7946				tex.resource_index_mode = sampler_index_mode;
7947				tex.dst_gpr = treg;
7948				tex.src_sel_x = 4;
7949				tex.src_sel_y = 4;
7950				tex.src_sel_z = 4;
7951				tex.src_sel_w = 4;
7952				tex.dst_sel_x = 0;
7953				tex.dst_sel_y = 1;
7954				tex.dst_sel_z = 7;
7955				tex.dst_sel_w = 7;
7956				r = r600_bytecode_add_tex(ctx->bc, &tex);
7957				if (r)
7958					return r;
7959
7960				/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
7961				if (ctx->bc->chip_class == CAYMAN) {
7962					/* */
7963					for (i = 0; i < 2; i++) {
7964						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7965						alu.op = ALU_OP1_INT_TO_FLT;
7966						alu.dst.sel = treg;
7967						alu.dst.chan = i;
7968						alu.dst.write = 1;
7969						alu.src[0].sel = treg;
7970						alu.src[0].chan = i;
7971						alu.last = (i == 1) ? 1 : 0;
7972						r = r600_bytecode_add_alu(ctx->bc, &alu);
7973						if (r)
7974							return r;
7975					}
7976					for (j = 0; j < 2; j++) {
7977						for (i = 0; i < 3; i++) {
7978							memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7979							alu.op = ALU_OP1_RECIP_IEEE;
7980							alu.src[0].sel = treg;
7981							alu.src[0].chan = j;
7982							alu.dst.sel = treg;
7983							alu.dst.chan = i;
7984							if (i == 2)
7985								alu.last = 1;
7986							if (i == j)
7987								alu.dst.write = 1;
7988							r = r600_bytecode_add_alu(ctx->bc, &alu);
7989							if (r)
7990								return r;
7991						}
7992					}
7993				} else {
7994					for (i = 0; i < 2; i++) {
7995						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7996						alu.op = ALU_OP1_INT_TO_FLT;
7997						alu.dst.sel = treg;
7998						alu.dst.chan = i;
7999						alu.dst.write = 1;
8000						alu.src[0].sel = treg;
8001						alu.src[0].chan = i;
8002						alu.last = 1;
8003						r = r600_bytecode_add_alu(ctx->bc, &alu);
8004						if (r)
8005							return r;
8006					}
8007					for (i = 0; i < 2; i++) {
8008						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8009						alu.op = ALU_OP1_RECIP_IEEE;
8010						alu.src[0].sel = treg;
8011						alu.src[0].chan = i;
8012						alu.dst.sel = treg;
8013						alu.dst.chan = i;
8014						alu.last = 1;
8015						alu.dst.write = 1;
8016						r = r600_bytecode_add_alu(ctx->bc, &alu);
8017						if (r)
8018							return r;
8019					}
8020				}
8021				for (i = 0; i < 2; i++) {
8022					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8023					alu.op = ALU_OP3_MULADD;
8024					alu.is_op3 = 1;
8025					alu.dst.sel = ctx->temp_reg;
8026					alu.dst.chan = i;
8027					alu.dst.write = 1;
8028					alu.last = i == 1;
8029					alu.src[0].sel = treg;
8030					alu.src[0].chan = i;
8031					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8032					alu.src[1].neg = 1;
8033					if (src_loaded) {
8034						alu.src[2].sel = ctx->temp_reg;
8035						alu.src[2].chan = i;
8036					} else
8037						r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8038					r = r600_bytecode_add_alu(ctx->bc, &alu);
8039					if (r)
8040						return r;
8041				}
8042			}
8043			src_loaded = TRUE;
8044			src_gpr = ctx->temp_reg;
8045		}
8046	}
8047
8048	if (src_requires_loading && !src_loaded) {
8049		for (i = 0; i < 4; i++) {
8050			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8051			alu.op = ALU_OP1_MOV;
8052			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8053			alu.dst.sel = ctx->temp_reg;
8054			alu.dst.chan = i;
8055			if (i == 3)
8056				alu.last = 1;
8057			alu.dst.write = 1;
8058			r = r600_bytecode_add_alu(ctx->bc, &alu);
8059			if (r)
8060				return r;
8061		}
8062		src_loaded = TRUE;
8063		src_gpr = ctx->temp_reg;
8064	}
8065
8066	/* get offset values */
8067	if (inst->Texture.NumOffsets) {
8068		assert(inst->Texture.NumOffsets == 1);
8069
8070		/* The texture offset feature doesn't work with the TXF instruction
8071		 * and must be emulated by adding the offset to the texture coordinates. */
8072		if (txf_add_offsets) {
8073			const struct tgsi_texture_offset *off = inst->TexOffsets;
8074
8075			switch (inst->Texture.Texture) {
8076			case TGSI_TEXTURE_3D:
8077				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8078				alu.op = ALU_OP2_ADD_INT;
8079				alu.src[0].sel = src_gpr;
8080				alu.src[0].chan = 2;
8081				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8082				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8083				alu.dst.sel = src_gpr;
8084				alu.dst.chan = 2;
8085				alu.dst.write = 1;
8086				alu.last = 1;
8087				r = r600_bytecode_add_alu(ctx->bc, &alu);
8088				if (r)
8089					return r;
8090				/* fall through */
8091
8092			case TGSI_TEXTURE_2D:
8093			case TGSI_TEXTURE_SHADOW2D:
8094			case TGSI_TEXTURE_RECT:
8095			case TGSI_TEXTURE_SHADOWRECT:
8096			case TGSI_TEXTURE_2D_ARRAY:
8097			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8098				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8099				alu.op = ALU_OP2_ADD_INT;
8100				alu.src[0].sel = src_gpr;
8101				alu.src[0].chan = 1;
8102				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8103				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8104				alu.dst.sel = src_gpr;
8105				alu.dst.chan = 1;
8106				alu.dst.write = 1;
8107				alu.last = 1;
8108				r = r600_bytecode_add_alu(ctx->bc, &alu);
8109				if (r)
8110					return r;
8111				/* fall through */
8112
8113			case TGSI_TEXTURE_1D:
8114			case TGSI_TEXTURE_SHADOW1D:
8115			case TGSI_TEXTURE_1D_ARRAY:
8116			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8117				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8118				alu.op = ALU_OP2_ADD_INT;
8119				alu.src[0].sel = src_gpr;
8120				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8121				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8122				alu.dst.sel = src_gpr;
8123				alu.dst.write = 1;
8124				alu.last = 1;
8125				r = r600_bytecode_add_alu(ctx->bc, &alu);
8126				if (r)
8127					return r;
8128				break;
8129				/* texture offsets do not apply to other texture targets */
8130			}
8131		} else {
8132			switch (inst->Texture.Texture) {
8133			case TGSI_TEXTURE_3D:
8134				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8135				/* fallthrough */
8136			case TGSI_TEXTURE_2D:
8137			case TGSI_TEXTURE_SHADOW2D:
8138			case TGSI_TEXTURE_RECT:
8139			case TGSI_TEXTURE_SHADOWRECT:
8140			case TGSI_TEXTURE_2D_ARRAY:
8141			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8142				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8143				/* fallthrough */
8144			case TGSI_TEXTURE_1D:
8145			case TGSI_TEXTURE_SHADOW1D:
8146			case TGSI_TEXTURE_1D_ARRAY:
8147			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8148				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8149			}
8150		}
8151	}
8152
8153	/* Obtain the sample index for reading a compressed MSAA color texture.
8154	 * To read the FMASK, we use the ldfptr instruction, which tells us
8155	 * where the samples are stored.
8156	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8157	 * which is the identity mapping. Each nibble says which physical sample
8158	 * should be fetched to get that sample.
8159	 *
8160	 * Assume src.z contains the sample index. It should be modified like this:
8161	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8162	 * Then fetch the texel with src.
8163	 */
8164	if (read_compressed_msaa) {
8165		unsigned sample_chan = 3;
8166		unsigned temp = r600_get_temp(ctx);
8167		assert(src_loaded);
8168
8169		/* temp.w = ldfptr() */
8170		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8171		tex.op = FETCH_OP_LD;
8172		tex.inst_mod = 1; /* to indicate this is ldfptr */
8173		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8174		tex.sampler_index_mode = sampler_index_mode;
8175		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8176		tex.resource_index_mode = sampler_index_mode;
8177		tex.src_gpr = src_gpr;
8178		tex.dst_gpr = temp;
8179		tex.dst_sel_x = 7; /* mask out these components */
8180		tex.dst_sel_y = 7;
8181		tex.dst_sel_z = 7;
8182		tex.dst_sel_w = 0; /* store X */
8183		tex.src_sel_x = 0;
8184		tex.src_sel_y = 1;
8185		tex.src_sel_z = 2;
8186		tex.src_sel_w = 3;
8187		tex.offset_x = offset_x;
8188		tex.offset_y = offset_y;
8189		tex.offset_z = offset_z;
8190		r = r600_bytecode_add_tex(ctx->bc, &tex);
8191		if (r)
8192			return r;
8193
8194		/* temp.x = sample_index*4 */
8195		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8196		alu.op = ALU_OP2_MULLO_INT;
8197		alu.src[0].sel = src_gpr;
8198		alu.src[0].chan = sample_chan;
8199		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8200		alu.src[1].value = 4;
8201		alu.dst.sel = temp;
8202		alu.dst.chan = 0;
8203		alu.dst.write = 1;
8204		r = emit_mul_int_op(ctx->bc, &alu);
8205		if (r)
8206			return r;
8207
8208		/* sample_index = temp.w >> temp.x */
8209		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8210		alu.op = ALU_OP2_LSHR_INT;
8211		alu.src[0].sel = temp;
8212		alu.src[0].chan = 3;
8213		alu.src[1].sel = temp;
8214		alu.src[1].chan = 0;
8215		alu.dst.sel = src_gpr;
8216		alu.dst.chan = sample_chan;
8217		alu.dst.write = 1;
8218		alu.last = 1;
8219		r = r600_bytecode_add_alu(ctx->bc, &alu);
8220		if (r)
8221			return r;
8222
8223		/* sample_index & 0xF */
8224		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8225		alu.op = ALU_OP2_AND_INT;
8226		alu.src[0].sel = src_gpr;
8227		alu.src[0].chan = sample_chan;
8228		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8229		alu.src[1].value = 0xF;
8230		alu.dst.sel = src_gpr;
8231		alu.dst.chan = sample_chan;
8232		alu.dst.write = 1;
8233		alu.last = 1;
8234		r = r600_bytecode_add_alu(ctx->bc, &alu);
8235		if (r)
8236			return r;
8237#if 0
8238		/* visualize the FMASK */
8239		for (i = 0; i < 4; i++) {
8240			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8241			alu.op = ALU_OP1_INT_TO_FLT;
8242			alu.src[0].sel = src_gpr;
8243			alu.src[0].chan = sample_chan;
8244			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8245			alu.dst.chan = i;
8246			alu.dst.write = 1;
8247			alu.last = 1;
8248			r = r600_bytecode_add_alu(ctx->bc, &alu);
8249			if (r)
8250				return r;
8251		}
8252		return 0;
8253#endif
8254	}
8255
8256	/* does this shader want a num layers from TXQ for a cube array? */
8257	if (has_txq_cube_array_z) {
8258		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8259
8260		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8261		alu.op = ALU_OP1_MOV;
8262
8263		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8264		if (ctx->bc->chip_class >= EVERGREEN) {
8265			/* with eg each dword is number of cubes */
8266			alu.src[0].sel += id / 4;
8267			alu.src[0].chan = id % 4;
8268		} else {
8269			/* r600 we have them at channel 2 of the second dword */
8270			alu.src[0].sel += (id * 2) + 1;
8271			alu.src[0].chan = 2;
8272		}
8273		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8274		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8275		alu.last = 1;
8276		r = r600_bytecode_add_alu(ctx->bc, &alu);
8277		if (r)
8278			return r;
8279		/* disable writemask from texture instruction */
8280		inst->Dst[0].Register.WriteMask &= ~4;
8281	}
8282
8283	opcode = ctx->inst_info->op;
8284	if (opcode == FETCH_OP_GATHER4 &&
8285		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8286		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8287		struct r600_bytecode_tex *t;
8288		opcode = FETCH_OP_GATHER4_O;
8289
8290		/* GATHER4_O/GATHER4_C_O use offset values loaded by
8291		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
8292		   encoded in the instruction are ignored. */
8293		t = &grad_offs[n_grad_offs++];
8294		memset(t, 0, sizeof(struct r600_bytecode_tex));
8295		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8296		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8297		t->sampler_index_mode = sampler_index_mode;
8298		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8299		t->resource_index_mode = sampler_index_mode;
8300
8301		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8302		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8303		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8304		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8305			 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
8306			/* make sure array index selector is 0, this is just a safety
			 * precaution because TGSI seems to emit something strange here */
8308			t->src_sel_z = 4;
8309		else
8310			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8311
8312		t->src_sel_w = 4;
8313
8314		t->dst_sel_x = 7;
8315		t->dst_sel_y = 7;
8316		t->dst_sel_z = 7;
8317		t->dst_sel_w = 7;
8318	}
8319
8320	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8321	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8322	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8323	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8324	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8325	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8326	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8327		switch (opcode) {
8328		case FETCH_OP_SAMPLE:
8329			opcode = FETCH_OP_SAMPLE_C;
8330			break;
8331		case FETCH_OP_SAMPLE_L:
8332			opcode = FETCH_OP_SAMPLE_C_L;
8333			break;
8334		case FETCH_OP_SAMPLE_LB:
8335			opcode = FETCH_OP_SAMPLE_C_LB;
8336			break;
8337		case FETCH_OP_SAMPLE_G:
8338			opcode = FETCH_OP_SAMPLE_C_G;
8339			break;
8340		/* Texture gather variants */
8341		case FETCH_OP_GATHER4:
8342			opcode = FETCH_OP_GATHER4_C;
8343			break;
8344		case FETCH_OP_GATHER4_O:
8345			opcode = FETCH_OP_GATHER4_C_O;
8346			break;
8347		}
8348	}
8349
8350	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8351	tex.op = opcode;
8352
8353	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8354	tex.sampler_index_mode = sampler_index_mode;
8355	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8356	tex.resource_index_mode = sampler_index_mode;
8357	tex.src_gpr = src_gpr;
8358	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8359
8360	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8361		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8362		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8363	}
8364
8365	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8366		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8367		tex.inst_mod = texture_component_select;
8368
8369		if (ctx->bc->chip_class == CAYMAN) {
8370			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8371			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8372			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8373			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8374		} else {
8375			/* GATHER4 result order is different from TGSI TG4 */
8376			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8377			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8378			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8379			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8380		}
8381	}
8382	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8383		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8384		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8385		tex.dst_sel_z = 7;
8386		tex.dst_sel_w = 7;
8387	}
8388	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8389		tex.dst_sel_x = 3;
8390		tex.dst_sel_y = 7;
8391		tex.dst_sel_z = 7;
8392		tex.dst_sel_w = 7;
8393	}
8394	else {
8395		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8396		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8397		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8398		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8399	}
8400
8401
8402	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8403		tex.src_sel_x = 4;
8404		tex.src_sel_y = 4;
8405		tex.src_sel_z = 4;
8406		tex.src_sel_w = 4;
8407	} else if (src_loaded) {
8408		tex.src_sel_x = 0;
8409		tex.src_sel_y = 1;
8410		tex.src_sel_z = 2;
8411		tex.src_sel_w = 3;
8412	} else {
8413		tex.src_sel_x = ctx->src[0].swizzle[0];
8414		tex.src_sel_y = ctx->src[0].swizzle[1];
8415		tex.src_sel_z = ctx->src[0].swizzle[2];
8416		tex.src_sel_w = ctx->src[0].swizzle[3];
8417		tex.src_rel = ctx->src[0].rel;
8418	}
8419
8420	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8421	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8422	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8423	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8424		tex.src_sel_x = 1;
8425		tex.src_sel_y = 0;
8426		tex.src_sel_z = 3;
8427		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8428	}
8429
8430	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8431	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8432		tex.coord_type_x = 1;
8433		tex.coord_type_y = 1;
8434	}
8435	tex.coord_type_z = 1;
8436	tex.coord_type_w = 1;
8437
8438	tex.offset_x = offset_x;
8439	tex.offset_y = offset_y;
8440	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8441		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8442		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8443		tex.offset_z = 0;
8444	}
8445	else {
8446		tex.offset_z = offset_z;
8447	}
8448
8449	/* Put the depth for comparison in W.
8450	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8451	 * Some instructions expect the depth in Z. */
8452	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8453	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8454	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8455	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8456	    opcode != FETCH_OP_SAMPLE_C_L &&
8457	    opcode != FETCH_OP_SAMPLE_C_LB) {
8458		tex.src_sel_w = tex.src_sel_z;
8459	}
8460
8461	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8462	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8463		if (opcode == FETCH_OP_SAMPLE_C_L ||
8464		    opcode == FETCH_OP_SAMPLE_C_LB) {
8465			/* the array index is read from Y */
8466			tex.coord_type_y = 0;
8467			array_index_offset_channel = tex.src_sel_y;
8468		} else {
8469			/* the array index is read from Z */
8470			tex.coord_type_z = 0;
8471			tex.src_sel_z = tex.src_sel_y;
8472			array_index_offset_channel = tex.src_sel_z;
8473		}
8474	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8475		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8476		tex.coord_type_z = 0;
8477		array_index_offset_channel = tex.src_sel_z;
8478	} else if  ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8479		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8480		    (ctx->bc->chip_class >= EVERGREEN))
8481		/* the array index is read from Z, coordinate will be corrected elsewhere  */
8482		tex.coord_type_z = 0;
8483
8484	/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8485	 * evaluate the array index  */
8486	if (array_index_offset_channel >= 0 &&
8487		 opcode != FETCH_OP_LD &&
8488		 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8489		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8490		alu.src[0].sel =  tex.src_gpr;
8491		alu.src[0].chan =  array_index_offset_channel;
8492		alu.src[0].rel = tex.src_rel;
8493		alu.op = ALU_OP1_RNDNE;
8494		alu.dst.sel = tex.src_gpr;
8495		alu.dst.chan = array_index_offset_channel;
8496		alu.dst.rel = tex.src_rel;
8497		alu.dst.write = 1;
8498		alu.last = 1;
8499		r = r600_bytecode_add_alu(ctx->bc, &alu);
8500		if (r)
8501			return r;
8502	}
8503
8504	/* mask unused source components */
8505	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8506		switch (inst->Texture.Texture) {
8507		case TGSI_TEXTURE_2D:
8508		case TGSI_TEXTURE_RECT:
8509			tex.src_sel_z = 7;
8510			tex.src_sel_w = 7;
8511			break;
8512		case TGSI_TEXTURE_1D_ARRAY:
8513			tex.src_sel_y = 7;
8514			tex.src_sel_w = 7;
8515			break;
8516		case TGSI_TEXTURE_1D:
8517			tex.src_sel_y = 7;
8518			tex.src_sel_z = 7;
8519			tex.src_sel_w = 7;
8520			break;
8521		}
8522	}
8523
8524	/* Emit set gradient and offset instructions. */
8525	for (i = 0; i < n_grad_offs; ++i) {
8526		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8527		if (r)
8528			return r;
8529	}
8530
8531	r = r600_bytecode_add_tex(ctx->bc, &tex);
8532	if (r)
8533		return r;
8534
8535	/* add shadow ambient support  - gallium doesn't do it yet */
8536	return 0;
8537}
8538
8539static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8540				  struct tgsi_full_src_register *src)
8541{
8542	unsigned i;
8543
8544	if (src->Register.Indirect) {
8545		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8546			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8547				return ctx->shader->atomics[i].hw_idx;
8548		}
8549	} else {
8550		uint32_t index = src->Register.Index;
8551		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8552			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8553				continue;
8554			if (index > ctx->shader->atomics[i].end)
8555				continue;
8556			if (index < ctx->shader->atomics[i].start)
8557				continue;
8558			uint32_t offset = (index - ctx->shader->atomics[i].start);
8559			return ctx->shader->atomics[i].hw_idx + offset;
8560		}
8561	}
8562	assert(0);
8563	return -1;
8564}
8565
/* Resolve the GDS/atomic-counter address for inst->Src[0] ahead of a GDS op.
 *
 * Returns the counter's hw index in *uav_id_p and the CF index mode in
 * *uav_index_mode_p.  On Cayman the byte offset is computed into
 * ctx->temp_reg channel 0 instead of using uav_id/index_mode fields:
 *   indirect: temp.x = (addr_reg.x << 2) + uav_id * 4
 *   direct:   temp.x = uav_id * 4
 * On non-Cayman chips an indirect access just selects index mode 2. */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp.x = indirect index << 2 (counter index -> byte offset) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp.x += base byte offset of the counter */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* Direct access on Cayman: temp.x = constant byte offset. */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}
8613
/* Translate a TGSI LOAD from the HW_ATOMIC file into a GDS read.
 *
 * tgsi_set_gds_temp() resolves the counter and, on Cayman, leaves the byte
 * offset in ctx->temp_reg.x; on other chips the uav_id/uav_index_mode
 * fields address the counter directly.  The read returns one dword into
 * the destination register's X channel. */
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* Cayman addresses via src_gpr.x instead of the uav fields. */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_sel_x = (is_cm) ? 0 : 4; /* 4 = unused selector */
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	gds.dst_sel_x = 0;  /* write result to X ... */
	gds.dst_sel_y = 7;  /* ... and mask the other channels (7 = masked) */
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm; /* Cayman has no alloc/consume mode here */
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}
8649
/* Build the index/coordinate GPR for an image or buffer access.
 *
 * Copies the coordinate from ctx->src[src_index] into a fresh temp,
 * zero-filling the channels the target does not use, and fixing up 1D
 * arrays by routing the array slice (source component 1) into channel 2,
 * which is where the hardware expects it.  The temp register number is
 * returned through *idx_gpr. */
static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r, i;
	struct r600_bytecode_alu alu;
	int temp_reg = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		/* def_val: copy source component i; write_zero overrides it. */
		bool def_val = true, write_zero = false;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;

		switch (inst->Memory.Texture) {
		case TGSI_TEXTURE_BUFFER:
		case TGSI_TEXTURE_1D:
			/* only X is meaningful */
			if (i == 1 || i == 2 || i == 3) {
				write_zero = true;
			}
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			if (i == 1 || i == 3)
				write_zero = true;
			else if (i == 2) {
				/* array slice comes from source Y, goes to dest Z */
				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
				def_val = false;
			}
			break;
		case TGSI_TEXTURE_2D:
			if (i == 2 || i == 3)
				write_zero = true;
			break;
		default:
			/* 3D / 2D array / etc: only W is unused */
			if (i == 3)
				write_zero = true;
			break;
		}

		if (write_zero) {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
		} else if (def_val) {
			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
		}

		if (i == 3)
			alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	*idx_gpr = temp_reg;
	return 0;
}
8707
8708static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8709			     int temp_reg)
8710{
8711	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8712	int r;
8713	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8714		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8715		r = single_alu_op2(ctx, ALU_OP1_MOV,
8716				   temp_reg, 0,
8717				   V_SQ_ALU_SRC_LITERAL, value >> 2,
8718				   0, 0);
8719		if (r)
8720			return r;
8721	} else {
8722		struct r600_bytecode_alu alu;
8723		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8724		alu.op = ALU_OP2_LSHR_INT;
8725		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8726		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8727		alu.src[1].value = 2;
8728		alu.dst.sel = temp_reg;
8729		alu.dst.write = 1;
8730		alu.last = 1;
8731		r = r600_bytecode_add_alu(ctx->bc, &alu);
8732		if (r)
8733			return r;
8734	}
8735	return 0;
8736}
8737
8738static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8739{
8740	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8741	/* have to work out the offset into the RAT immediate return buffer */
8742	struct r600_bytecode_vtx vtx;
8743	struct r600_bytecode_cf *cf;
8744	int r;
8745	int temp_reg = r600_get_temp(ctx);
8746	unsigned rat_index_mode;
8747	unsigned base;
8748
8749	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8750	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8751
8752	r = load_buffer_coord(ctx, 1, temp_reg);
8753	if (r)
8754		return r;
8755	ctx->bc->cf_last->barrier = 1;
8756	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8757	vtx.op = FETCH_OP_VFETCH;
8758	vtx.buffer_id = inst->Src[0].Register.Index + base;
8759	vtx.buffer_index_mode = rat_index_mode;
8760	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8761	vtx.src_gpr = temp_reg;
8762	vtx.src_sel_x = 0;
8763	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8764	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
8765	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
8766	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
8767	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
8768	vtx.num_format_all = 1;
8769	vtx.format_comp_all = 1;
8770	vtx.srf_mode_all = 0;
8771
8772	if (inst->Dst[0].Register.WriteMask & 8) {
8773		vtx.data_format = FMT_32_32_32_32;
8774		vtx.use_const_fields = 0;
8775	} else if (inst->Dst[0].Register.WriteMask & 4) {
8776		vtx.data_format = FMT_32_32_32;
8777		vtx.use_const_fields = 0;
8778	} else if (inst->Dst[0].Register.WriteMask & 2) {
8779		vtx.data_format = FMT_32_32;
8780		vtx.use_const_fields = 0;
8781	} else {
8782		vtx.data_format = FMT_32;
8783		vtx.use_const_fields = 0;
8784	}
8785
8786	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8787	if (r)
8788		return r;
8789	cf = ctx->bc->cf_last;
8790	cf->barrier = 1;
8791	return 0;
8792}
8793
/* Translate a TGSI LOAD from the IMAGE file into a RAT read.
 *
 * Sequence: MEM_RAT NOP_RTN (a read whose result lands in the immediate
 * return buffer), WAIT_ACK to make sure the data arrived, then a VFETCH
 * from the return buffer indexed by this thread's id, converting from the
 * image's memory format into the destination register. */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* Build the image coordinate (with 1D-array fixups) into idx_gpr. */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Issue the RAT read; the result goes to the return buffer slot
	 * addressed by thread_id_gpr. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* Wait for the read's acknowledgement before fetching the result. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	/* Fetch the returned texel, converting from the image format. */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
8866
8867static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8868{
8869	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8870	struct r600_bytecode_alu alu;
8871	int r;
8872	int temp_reg = r600_get_temp(ctx);
8873
8874	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8875	alu.op = ALU_OP1_MOV;
8876	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8877	alu.dst.sel = temp_reg;
8878	alu.dst.write = 1;
8879	alu.last = 1;
8880	r = r600_bytecode_add_alu(ctx->bc, &alu);
8881	if (r)
8882		return r;
8883
8884	r = do_lds_fetch_values(ctx, temp_reg,
8885				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8886	if (r)
8887		return r;
8888	return 0;
8889}
8890
8891static int tgsi_load(struct r600_shader_ctx *ctx)
8892{
8893	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8894	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8895		return tgsi_load_rat(ctx);
8896	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8897		return tgsi_load_gds(ctx);
8898	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8899		return tgsi_load_buffer(ctx);
8900	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8901		return tgsi_load_lds(ctx);
8902	return 0;
8903}
8904
/* Translate a TGSI STORE to the BUFFER file as a series of one-dword
 * typed RAT stores.
 *
 * The base dword index comes from source 0 via load_buffer_coord(); for
 * each component enabled in the write mask, the element address (base + i)
 * is placed in temp_reg.x and the value in ctx->temp_reg.x, then a
 * MEM_RAT STORE_TYPED with comp_mask 1 writes that single dword. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* treg2.x = base dword index */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Clear all four channels of the index register. */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* temp_reg.x = base index + component offset */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* ctx->temp_reg.x = value to store for this component */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* Emit the single-dword typed store. */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}
8978
/* Translate a TGSI STORE to the IMAGE file as a typed RAT store.
 *
 * The coordinate is built into idx_gpr by load_index_src(); the value is
 * taken directly from the source GPR when it is a temporary, otherwise it
 * is first copied into ctx->temp_reg.  A single MEM_RAT STORE_TYPED then
 * writes all four components. */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* Only temporaries can be used as the store payload directly. */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		/* Copy all four value components into ctx->temp_reg. */
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Emit the typed store of all four components. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}
9035
/* Translate a TGSI STORE to the MEMORY (LDS) file.
 *
 * Source 0 is the byte address, source 1 the value.  The base address is
 * copied into temp_reg.x, and temp_reg.i = base + 4*i is precomputed for
 * every other enabled component.  Adjacent component pairs (x+y or z+w
 * both enabled) are written with a single LDS_WRITE_REL (two dwords),
 * remaining components with individual LDS_WRITEs. */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* Precompute per-component byte addresses: temp_reg.i = base + 4*i. */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			/* Both components of the pair are written: emit one
			 * two-dword LDS_WRITE_REL and skip the next channel. */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* Single-dword write for this component. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9104
9105static int tgsi_store(struct r600_shader_ctx *ctx)
9106{
9107	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9108	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9109		return tgsi_store_buffer_rat(ctx);
9110	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9111		return tgsi_store_lds(ctx);
9112	else
9113		return tgsi_store_rat(ctx);
9114}
9115
/* Emit a TGSI atomic that targets a RAT (image or SSBO).
 *
 * Sequence: stage the atomic operand(s) into the thread-id GPR, issue a
 * MEM_RAT CF instruction carrying the atomic op, WAIT_ACK, then read the
 * returned pre-op value back out of the RAT immediate return buffer with
 * a vertex fetch into the TGSI destination register.
 */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

        if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* SSBOs are allocated after all images in the RAT and
		 * immediate-resource namespaces, so skip past the images. */
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG takes two operands: the compare value goes in
		 * channel 0 of the export GPR ... */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* ... and the swap value in channel 3 (channel 2 on Cayman). */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* All other atomics take a single operand in channel 0. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	/* The atomic must complete before we fetch the returned value. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		/* Image atomics return data in the image's declared format. */
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		/* SSBO atomics always return a raw 32-bit integer. */
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	/* Fetch the pre-op value from the immediate return buffer, indexed
	 * by the thread id (src channel 1), into the destination GPR's x. */
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
9249
9250static int get_gds_op(int opcode)
9251{
9252	switch (opcode) {
9253	case TGSI_OPCODE_ATOMUADD:
9254		return FETCH_OP_GDS_ADD_RET;
9255	case TGSI_OPCODE_ATOMAND:
9256		return FETCH_OP_GDS_AND_RET;
9257	case TGSI_OPCODE_ATOMOR:
9258		return FETCH_OP_GDS_OR_RET;
9259	case TGSI_OPCODE_ATOMXOR:
9260		return FETCH_OP_GDS_XOR_RET;
9261	case TGSI_OPCODE_ATOMUMIN:
9262		return FETCH_OP_GDS_MIN_UINT_RET;
9263	case TGSI_OPCODE_ATOMUMAX:
9264		return FETCH_OP_GDS_MAX_UINT_RET;
9265	case TGSI_OPCODE_ATOMXCHG:
9266		return FETCH_OP_GDS_XCHG_RET;
9267	case TGSI_OPCODE_ATOMCAS:
9268		return FETCH_OP_GDS_CMP_XCHG_RET;
9269	default:
9270		return -1;
9271	}
9272}
9273
/* Emit a TGSI atomic on a HW atomic counter backed by GDS.
 *
 * Operands are staged into ctx->temp_reg and then a GDS fetch op is
 * emitted.  The source channel layout differs between Evergreen
 * (uses alloc/consume, uav id in sel 4) and Cayman (operands start at
 * channel 1).
 */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		/* CMP_XCHG needs a second operand (the swap value); stage it
		 * into the channel the GDS op reads it from below. */
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	/* Stage the primary operand. */
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* GDS has no signed add with a negative literal here; turn
		 * "add negative" into a SUB of the absolute value. */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	/* EG: sel 4 selects the uav id; CM: operands start at channel 0/1. */
	gds.src_sel_x = is_cm ? 0 : 4;
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	/* Only the x component of the returned value is written back. */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
9376
9377static int get_lds_op(int opcode)
9378{
9379	switch (opcode) {
9380	case TGSI_OPCODE_ATOMUADD:
9381		return LDS_OP2_LDS_ADD_RET;
9382	case TGSI_OPCODE_ATOMAND:
9383		return LDS_OP2_LDS_AND_RET;
9384	case TGSI_OPCODE_ATOMOR:
9385		return LDS_OP2_LDS_OR_RET;
9386	case TGSI_OPCODE_ATOMXOR:
9387		return LDS_OP2_LDS_XOR_RET;
9388	case TGSI_OPCODE_ATOMUMIN:
9389		return LDS_OP2_LDS_MIN_UINT_RET;
9390	case TGSI_OPCODE_ATOMUMAX:
9391		return LDS_OP2_LDS_MAX_UINT_RET;
9392	case TGSI_OPCODE_ATOMIMIN:
9393		return LDS_OP2_LDS_MIN_INT_RET;
9394	case TGSI_OPCODE_ATOMIMAX:
9395		return LDS_OP2_LDS_MAX_INT_RET;
9396	case TGSI_OPCODE_ATOMXCHG:
9397		return LDS_OP2_LDS_XCHG_RET;
9398	case TGSI_OPCODE_ATOMCAS:
9399		return LDS_OP3_LDS_CMP_XCHG_RET;
9400	default:
9401		return -1;
9402	}
9403}
9404
/* Emit a TGSI atomic on shared memory (LDS): issue the *_RET LDS op,
 * which pushes the pre-op value onto the LDS output queue, then pop that
 * value into the TGSI destination with a MOV from LDS_OQ_A_POP.
 */
static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int lds_op = get_lds_op(inst->Instruction.Opcode);
	int r;

	/* src[0] = LDS address, src[1] = operand,
	 * src[2] = swap value for CMP_XCHG only. */
	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = lds_op;
	alu.is_lds_idx_op = true;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
	else
		alu.src[2].sel = V_SQ_ALU_SRC_0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* then read from LDS_OQ_A_POP */
	memset(&alu, 0, sizeof(alu));

	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
9441
9442static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9443{
9444	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9445	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9446		return tgsi_atomic_op_rat(ctx);
9447	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9448		return tgsi_atomic_op_gds(ctx);
9449	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9450		return tgsi_atomic_op_rat(ctx);
9451	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9452		return tgsi_atomic_op_lds(ctx);
9453	return 0;
9454}
9455
/* Emit a TGSI RESQ (resource query): buffers get a constant-buffer based
 * size query; textures get a TXQ-style tex instruction.  Cube arrays need
 * their layer count (z component) patched in from the driver's buffer-info
 * constants, since TXQ does not return it directly.
 */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->chip_class < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		/* SSBOs are allocated after all images in the resource space. */
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	/* src sel 4 = 0.0f; TXQ ignores the coordinate. */
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	/* dst sel 7 masks a channel; honor the (possibly patched) writemask. */
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
9528
/* Emit TGSI LRP: dst = src0 * src1 + (1 - src0) * src2, per enabled
 * channel.  Computed in three passes through temp_reg, with a fast path
 * when src0 is the 0.5 inline constant (then dst = (src1 + src2) * 0.5
 * via the output modifier).
 */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[2][4];
	unsigned i;
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			/* omod 3 divides the result by 2. */
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		/* 1 + (-src0) */
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */

	/* MULADD is op3, which has stricter source restrictions; rewrite
	 * src0/src1 into op3-compatible form first. */
	for (i = 0; i < 2; i++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[i], &ctx->src[i]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[1][i];
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9636
/* Emit TGSI CMP: dst = (src0 < 0) ? src1 : src2, per enabled channel,
 * using CNDGE with src1/src2 swapped (CNDGE selects src1 when src0 >= 0).
 */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[3][4];

	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		/* Operand is -|x|, which is >= 0 only when x == 0, so
		 * CNDGE degenerates to CNDE and the modifiers can go. */
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* CND* are op3 ops; rewrite sources into op3-compatible form. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9684
9685static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9686{
9687	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9688	struct r600_bytecode_alu alu;
9689	int i, r;
9690	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9691
9692	for (i = 0; i < lasti + 1; i++) {
9693		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9694			continue;
9695
9696		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9697		alu.op = ALU_OP3_CNDE_INT;
9698		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9699		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9700		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9701		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9702		alu.dst.chan = i;
9703		alu.dst.write = 1;
9704		alu.is_op3 = 1;
9705		if (i == lasti)
9706			alu.last = 1;
9707		r = r600_bytecode_add_alu(ctx->bc, &alu);
9708		if (r)
9709			return r;
9710	}
9711	return 0;
9712}
9713
/* Emit TGSI EXP into temp_reg then copy to the destination:
 *   result.x = 2^floor(src.x)
 *   result.y = src.x - floor(src.x)
 *   result.z = 2^src.x (rough approximation)
 *   result.w = 1.0
 * On Cayman the transcendental EXP_IEEE must be replicated across three
 * vector slots (see the CAYMAN notes at the top of this file).
 */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): alu is reused from the FLOOR emit above
		 * without a fresh memset; stale src[0] modifiers (neg/abs)
		 * from the original source could persist — confirm. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* Copy temp_reg channels out to the real destination. */
	return tgsi_helper_copy(ctx, inst);
}
9842
/* Emit TGSI LOG into temp_reg then copy to the destination:
 *   result.x = floor(log2(|src.x|))
 *   result.y = |src.x| / 2^floor(log2(|src.x|))
 *   result.z = log2(|src.x|)
 *   result.w = 1.0
 * On Cayman the transcendental LOG/EXP/RECIP_IEEE ops must be replicated
 * across three vector slots (see the CAYMAN notes at the top of this file).
 */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): alu is reused from the LOG emit above without
		 * a fresh memset; stale src[0] abs flag carries over — since
		 * the source is now temp_reg this looks benign, but confirm. */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		/* tmp.y = log2(|src.x|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* tmp.y = floor(tmp.y) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* tmp.y = 2 ^ tmp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* tmp.y = 1 / tmp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* tmp.y = |src.x| * tmp.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Copy temp_reg channels out to the real destination. */
	return tgsi_helper_copy(ctx, inst);
}
10102
/* Emit ARL/ARR/UARL on Evergreen+: convert (or move) the float/uint source
 * into the dedicated address register GPR, then mark the corresponding
 * AR/index register as needing a reload before its next use.
 */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL rounds toward negative infinity. */
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds to nearest. */
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		/* Source is already an integer. */
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
	        alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Invalidate the cached AR/index register so it is reloaded from
	 * the GPR before its next indirect use. */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* Emit ARL/ARR/UARL on r600/r700: stage the converted value in the ar_reg
 * GPR (FLT_TO_INT is a trans-unit-only op on these chips, so each such
 * instruction must close its ALU group), then invalidate the cached AR. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* No FLT_TO_INT_FLOOR on r600; do FLOOR then FLT_TO_INT. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Force the AR register to be reloaded before the next indirect use. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
10225
10226static int tgsi_opdst(struct r600_shader_ctx *ctx)
10227{
10228	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10229	struct r600_bytecode_alu alu;
10230	int i, r = 0;
10231
10232	for (i = 0; i < 4; i++) {
10233		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10234
10235		alu.op = ALU_OP2_MUL;
10236		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10237
10238		if (i == 0 || i == 3) {
10239			alu.src[0].sel = V_SQ_ALU_SRC_1;
10240		} else {
10241			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10242		}
10243
10244		if (i == 0 || i == 2) {
10245			alu.src[1].sel = V_SQ_ALU_SRC_1;
10246		} else {
10247			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10248		}
10249		if (i == 3)
10250			alu.last = 1;
10251		r = r600_bytecode_add_alu(ctx->bc, &alu);
10252		if (r)
10253			return r;
10254	}
10255	return 0;
10256}
10257
10258static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10259			   struct r600_bytecode_alu_src *src)
10260{
10261	struct r600_bytecode_alu alu;
10262	int r;
10263
10264	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10265	alu.op = opcode;
10266	alu.execute_mask = 1;
10267	alu.update_pred = 1;
10268
10269	alu.dst.sel = ctx->temp_reg;
10270	alu.dst.write = 1;
10271	alu.dst.chan = 0;
10272
10273	alu.src[0] = *src;
10274	alu.src[1].sel = V_SQ_ALU_SRC_0;
10275	alu.src[1].chan = 0;
10276
10277	alu.last = 1;
10278
10279	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10280	if (r)
10281		return r;
10282	return 0;
10283}
10284
/* Emit `pops` stack pops.  Where possible the pops are folded into the
 * previous ALU clause by converting it to ALU_POP_AFTER / ALU_POP2_AFTER
 * (which pop one or two levels after the clause); otherwise an explicit
 * POP CF instruction is emitted.
 */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop encodes how many pops the last CF already performs:
		 * 0 for a plain ALU clause, 1 for ALU_POP_AFTER; 3 is a
		 * sentinel meaning "cannot fold". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* previous CF can't absorb the pops: fall through to
			 * an explicit POP instruction */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* continue execution at the CF right after the POP */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
10317
/* Recompute the worst-case HW call-stack depth required by the shader after
 * a push of kind `reason`, updating stack->max_entries.  Returns the current
 * element count, which emit_if() uses for the 8xx stack workaround.
 */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM pushes consume a full entry; non-WQM pushes consume
	 * single elements */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole hardware entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}
10383
10384static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10385{
10386	switch(reason) {
10387	case FC_PUSH_VPM:
10388		--ctx->bc->stack.push;
10389		assert(ctx->bc->stack.push >= 0);
10390		break;
10391	case FC_PUSH_WQM:
10392		--ctx->bc->stack.push_wqm;
10393		assert(ctx->bc->stack.push_wqm >= 0);
10394		break;
10395	case FC_LOOP:
10396		--ctx->bc->stack.loop;
10397		assert(ctx->bc->stack.loop >= 0);
10398		break;
10399	default:
10400		assert(0);
10401		break;
10402	}
10403}
10404
10405static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10406{
10407	switch (reason) {
10408	case FC_PUSH_VPM:
10409		++ctx->bc->stack.push;
10410		break;
10411	case FC_PUSH_WQM:
10412		++ctx->bc->stack.push_wqm;
10413		break;
10414	case FC_LOOP:
10415		++ctx->bc->stack.loop;
10416		break;
10417	default:
10418		assert(0);
10419	}
10420
10421	return callstack_update_max_depth(ctx, reason);
10422}
10423
10424static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10425{
10426	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10427
10428	sp->mid = realloc((void *)sp->mid,
10429						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10430	sp->mid[sp->num_mid] = ctx->bc->cf_last;
10431	sp->num_mid++;
10432}
10433
10434static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10435{
10436	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10437	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10438	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10439	ctx->bc->fc_sp++;
10440}
10441
10442static void fc_poplevel(struct r600_shader_ctx *ctx)
10443{
10444	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10445	free(sp->mid);
10446	sp->mid = NULL;
10447	sp->num_mid = 0;
10448	sp->start = NULL;
10449	sp->type = 0;
10450	ctx->bc->fc_sp--;
10451}
10452
10453#if 0
10454static int emit_return(struct r600_shader_ctx *ctx)
10455{
10456	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
10457	return 0;
10458}
10459
10460static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
10461{
10462
10463	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
10464	ctx->bc->cf_last->pop_count = pops;
10465	/* XXX work out offset */
10466	return 0;
10467}
10468
10469static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
10470{
10471	return 0;
10472}
10473
10474static void emit_testflag(struct r600_shader_ctx *ctx)
10475{
10476
10477}
10478
10479static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
10480{
10481	emit_testflag(ctx);
10482	emit_jump_to_offset(ctx, 1, 4);
10483	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
10484	pops(ctx, ifidx + 1);
10485	emit_return(ctx);
10486}
10487
10488static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
10489{
10490	emit_testflag(ctx);
10491
10492	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10493	ctx->bc->cf_last->pop_count = 1;
10494
10495	fc_set_mid(ctx, fc_sp);
10496
10497	pops(ctx, 1);
10498}
10499#endif
10500
/* Open an IF: push a stack level, emit the predicate-setting ALU op and a
 * placeholder JUMP whose target address is patched later by tgsi_else() /
 * tgsi_endif().  On parts with branch-stack issues the usual
 * ALU_PUSH_BEFORE clause is replaced by an explicit PUSH followed by a
 * plain ALU clause.
 */
static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	/* On Evergreen the workaround is also needed when the push would land
	 * on (or right after) a stack-entry boundary. */
	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type, src);

	/* JUMP over the IF body; target patched at ELSE/ENDIF time */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}
10537
10538static int tgsi_if(struct r600_shader_ctx *ctx)
10539{
10540	struct r600_bytecode_alu_src alu_src;
10541	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10542
10543	return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10544}
10545
10546static int tgsi_uif(struct r600_shader_ctx *ctx)
10547{
10548	struct r600_bytecode_alu_src alu_src;
10549	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10550	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10551}
10552
10553static int tgsi_else(struct r600_shader_ctx *ctx)
10554{
10555	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10556	ctx->bc->cf_last->pop_count = 1;
10557
10558	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10559	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10560	return 0;
10561}
10562
/* TGSI ENDIF: pop the stack level pushed by emit_if() and patch the pending
 * jump target (the IF's JUMP, or its ELSE if one was emitted) to the CF
 * address just past the end of the construct.
 */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	int offset = 2;
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
	if (ctx->bc->cf_last->eg_alu_extended)
			offset += 2;

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		/* no ELSE: the IF's JUMP skips the whole body */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		/* with ELSE: the ELSE jumps past the else-body instead */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
10587
10588static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10589{
10590	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10591	 * limited to 4096 iterations, like the other LOOP_* instructions. */
10592	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10593
10594	fc_pushlevel(ctx, FC_LOOP);
10595
10596	/* check stack depth */
10597	callstack_push(ctx, FC_LOOP);
10598	return 0;
10599}
10600
/* TGSI ENDLOOP: emit LOOP_END, patch all loop-related CF addresses and
 * close the loop frame.
 */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	/* point every BREAK/CONTINUE recorded for this loop at LOOP_END */
	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
10629
10630static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10631{
10632	unsigned int fscp;
10633
10634	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10635	{
10636		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10637			break;
10638	}
10639
10640	if (fscp == 0) {
10641		R600_ERR("Break not inside loop/endloop pair\n");
10642		return -EINVAL;
10643	}
10644
10645	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10646
10647	fc_set_mid(ctx, fscp - 1);
10648
10649	return 0;
10650}
10651
/* Geometry shader EMIT/ENDPRIM handling.  The target stream index comes
 * from the instruction's immediate operand.  For EMIT_VERTEX the pending
 * ring writes are flushed first and the ring offset is bumped afterwards.
 */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
10669
/* TGSI UMAD: dst = src0 * src1 + src2 (unsigned), emitted in two passes:
 * MULLO_UINT into the temp register, then ADD_INT with src2 into dst.
 */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = 1;

		alu.op = ALU_OP2_MULLO_UINT;
		for (j = 0; j < 2; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		alu.last = 1;
		/* integer multiply needs chip-specific emission (t-slot /
		 * Cayman loops), hence the helper */
		r = emit_mul_int_op(ctx->bc, &alu);
		if (r)
			return r;
	}


	/* temp + src2 -> dst */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
10722
/* TGSI PK2H: pack two floats into one 32-bit word of two half-floats.
 * temp.x/y get the converted halves, then each written dst channel is
 * temp.y * 0x10000 + temp.x (MULADD_UINT24 suffices since halves are 16-bit).
 */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* deliberately reuse alu (no memset): only channel and 'last' change
	 * for the .y half */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10770
/* TGSI UP2H: unpack a 32-bit word of two half-floats into two floats.
 * temp.x gets the low half, temp.y the high half (via shift), then each
 * written dst channel is f16_to_f32 of temp.x (even channels) or temp.y
 * (odd channels).
 */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10821
/* TGSI BFE (bitfield extract): emit the hardware op3 BFE, then patch up
 * channels whose width operand (src2) is >= 32 by replacing the result with
 * the raw src0 value (selected via SETGE_INT + CNDE_INT).  When dst aliases
 * src0 or src2, the op3 result goes through a temp so the fixup still sees
 * the original sources.
 */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	/* redirect the BFE result to a temp if dst overlaps src0/src2 */
	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp.chan = (width >= 32) per written channel */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = flag ? src0 : bfe_result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10880
10881static int tgsi_clock(struct r600_shader_ctx *ctx)
10882{
10883	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10884	struct r600_bytecode_alu alu;
10885	int r;
10886
10887	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10888	alu.op = ALU_OP1_MOV;
10889	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10890	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10891	r = r600_bytecode_add_alu(ctx->bc, &alu);
10892	if (r)
10893		return r;
10894	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10895	alu.op = ALU_OP1_MOV;
10896	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10897	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10898	alu.last = 1;
10899	r = r600_bytecode_add_alu(ctx->bc, &alu);
10900	if (r)
10901		return r;
10902	return 0;
10903}
10904
10905static int emit_u64add(struct r600_shader_ctx *ctx, int op,
10906		       int treg,
10907		       int src0_sel, int src0_chan,
10908		       int src1_sel, int src1_chan)
10909{
10910	struct r600_bytecode_alu alu;
10911	int r;
10912	int opc;
10913
10914	if (op == ALU_OP2_ADD_INT)
10915		opc = ALU_OP2_ADDC_UINT;
10916	else
10917		opc = ALU_OP2_SUBB_UINT;
10918
10919	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10920	alu.op = op;            ;
10921	alu.dst.sel = treg;
10922	alu.dst.chan = 0;
10923	alu.dst.write = 1;
10924	alu.src[0].sel = src0_sel;
10925	alu.src[0].chan = src0_chan + 0;
10926	alu.src[1].sel = src1_sel;
10927	alu.src[1].chan = src1_chan + 0;
10928	alu.src[1].neg = 0;
10929	r = r600_bytecode_add_alu(ctx->bc, &alu);
10930	if (r)
10931		return r;
10932
10933	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10934	alu.op = op;
10935	alu.dst.sel = treg;
10936	alu.dst.chan = 1;
10937	alu.dst.write = 1;
10938	alu.src[0].sel = src0_sel;
10939	alu.src[0].chan = src0_chan + 1;
10940	alu.src[1].sel = src1_sel;
10941	alu.src[1].chan = src1_chan + 1;
10942	alu.src[1].neg = 0;
10943	r = r600_bytecode_add_alu(ctx->bc, &alu);
10944	if (r)
10945		return r;
10946
10947	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10948	alu.op = opc;
10949	alu.dst.sel = treg;
10950	alu.dst.chan = 2;
10951	alu.dst.write = 1;
10952	alu.last = 1;
10953	alu.src[0].sel = src0_sel;
10954	alu.src[0].chan = src0_chan + 0;
10955	alu.src[1].sel = src1_sel;
10956	alu.src[1].chan = src1_chan + 0;
10957	alu.src[1].neg = 0;
10958	r = r600_bytecode_add_alu(ctx->bc, &alu);
10959	if (r)
10960		return r;
10961
10962	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10963	alu.op = op;
10964	alu.dst.sel = treg;
10965	alu.dst.chan = 1;
10966	alu.dst.write = 1;
10967	alu.src[0].sel = treg;
10968	alu.src[0].chan = 1;
10969	alu.src[1].sel = treg;
10970	alu.src[1].chan = 2;
10971	alu.last = 1;
10972	r = r600_bytecode_add_alu(ctx->bc, &alu);
10973	if (r)
10974		return r;
10975	return 0;
10976}
10977
10978static int egcm_u64add(struct r600_shader_ctx *ctx)
10979{
10980	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10981	struct r600_bytecode_alu alu;
10982	int r;
10983	int treg = ctx->temp_reg;
10984	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
10985
10986	if (ctx->src[1].neg) {
10987		op = ALU_OP2_SUB_INT;
10988		opc = ALU_OP2_SUBB_UINT;
10989	}
10990	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10991	alu.op = op;            ;
10992	alu.dst.sel = treg;
10993	alu.dst.chan = 0;
10994	alu.dst.write = 1;
10995	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10996	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10997	alu.src[1].neg = 0;
10998	r = r600_bytecode_add_alu(ctx->bc, &alu);
10999	if (r)
11000		return r;
11001
11002	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11003	alu.op = op;
11004	alu.dst.sel = treg;
11005	alu.dst.chan = 1;
11006	alu.dst.write = 1;
11007	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11008	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11009	alu.src[1].neg = 0;
11010	r = r600_bytecode_add_alu(ctx->bc, &alu);
11011	if (r)
11012		return r;
11013
11014	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11015	alu.op = opc              ;
11016	alu.dst.sel = treg;
11017	alu.dst.chan = 2;
11018	alu.dst.write = 1;
11019	alu.last = 1;
11020	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11021	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11022	alu.src[1].neg = 0;
11023	r = r600_bytecode_add_alu(ctx->bc, &alu);
11024	if (r)
11025		return r;
11026
11027	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11028	alu.op = op;
11029	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11030	alu.src[0].sel = treg;
11031	alu.src[0].chan = 1;
11032	alu.src[1].sel = treg;
11033	alu.src[1].chan = 2;
11034	alu.last = 1;
11035	r = r600_bytecode_add_alu(ctx->bc, &alu);
11036	if (r)
11037		return r;
11038	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11039	alu.op = ALU_OP1_MOV;
11040	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11041	alu.src[0].sel = treg;
11042	alu.src[0].chan = 0;
11043	alu.last = 1;
11044	r = r600_bytecode_add_alu(ctx->bc, &alu);
11045	if (r)
11046		return r;
11047	return 0;
11048}
11049
11050/* result.y = mul_high a, b
11051   result.x = mul a,b
11052   result.y += a.x * b.y + a.y * b.x;
11053*/
/* TGSI U64MUL on Evergreen/Cayman: low 64 bits of a 64x64 multiply, built
 * from 32-bit MULLO/MULHI partial products (see the formula in the comment
 * above).  Partial products are accumulated in temp.xyzw, result goes to
 * dst.xy.
 */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
11162
/* Emit a 64-bit unsigned "set on greater-equal".  The boolean result lands
 * in treg.x; treg.y and treg.z are scratch.
 */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && (src0.x >= src1.x)) */
	/* treg.y = src0.hi > src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = src0.lo >= src1.lo */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	/* treg.z = src0.hi == src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (hi equal) && (lo >= ) */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (hi >) */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}
11207
11208/* this isn't a complete div it's just enough for qbo shader to work */
11209static int egcm_u64div(struct r600_shader_ctx *ctx)
11210{
11211	struct r600_bytecode_alu alu;
11212	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
11213	int r, i;
11214	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11215
11216	/* make sure we are dividing my a const with 0 in the high bits */
11217	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
11218		return -1;
11219	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
11220		return -1;
11221	/* make sure we are doing one division */
11222	if (inst->Dst[0].Register.WriteMask != 0x3)
11223		return -1;
11224
11225	/* emit_if uses ctx->temp_reg so we can't */
11226	int treg = r600_get_temp(ctx);
11227	int tmp_num = r600_get_temp(ctx);
11228	int sub_tmp = r600_get_temp(ctx);
11229
11230	/* tmp quot are tmp_num.zw */
11231	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
11232	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
11233	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
11234	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);
11235
11236	/* MOV tmp_num.xy, numerator */
11237	r = single_alu_op2(ctx, ALU_OP1_MOV,
11238			   tmp_num, 0,
11239			   alu_num_lo.sel, alu_num_lo.chan,
11240			   0, 0);
11241	if (r)
11242		return r;
11243	r = single_alu_op2(ctx, ALU_OP1_MOV,
11244			   tmp_num, 1,
11245			   alu_num_hi.sel, alu_num_hi.chan,
11246			   0, 0);
11247	if (r)
11248		return r;
11249
11250	r = single_alu_op2(ctx, ALU_OP1_MOV,
11251			   tmp_num, 2,
11252			   V_SQ_ALU_SRC_LITERAL, 0,
11253			   0, 0);
11254	if (r)
11255		return r;
11256
11257	r = single_alu_op2(ctx, ALU_OP1_MOV,
11258			   tmp_num, 3,
11259			   V_SQ_ALU_SRC_LITERAL, 0,
11260			   0, 0);
11261	if (r)
11262		return r;
11263
11264	/* treg 0 is log2_denom */
11265	/* normally this gets the MSB for the denom high value
11266	   - however we know this will always be 0 here. */
11267	r = single_alu_op2(ctx,
11268			   ALU_OP1_MOV,
11269			   treg, 0,
11270			   V_SQ_ALU_SRC_LITERAL, 32,
11271			   0, 0);
11272	if (r)
11273		return r;
11274
11275	/* normally check demon hi for 0, but we know it is already */
11276	/* t0.z = num_hi >= denom_lo */
11277	r = single_alu_op2(ctx,
11278			   ALU_OP2_SETGE_UINT,
11279			   treg, 1,
11280			   alu_num_hi.sel, alu_num_hi.chan,
11281			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11282	if (r)
11283		return r;
11284
11285	memset(&alu_src, 0, sizeof(alu_src));
11286	alu_src.sel = treg;
11287	alu_src.chan = 1;
11288	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11289	if (r)
11290		return r;
11291
11292	/* for loops in here */
11293	/* get msb t0.x = msb(src[1].x) first */
11294	int msb_lo = util_last_bit(alu_denom_lo.value);
11295	r = single_alu_op2(ctx, ALU_OP1_MOV,
11296			   treg, 0,
11297			   V_SQ_ALU_SRC_LITERAL, msb_lo,
11298			   0, 0);
11299	if (r)
11300		return r;
11301
11302	/* unroll the asm here */
11303	for (i = 0; i < 31; i++) {
11304		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11305				   treg, 2,
11306				   V_SQ_ALU_SRC_LITERAL, i,
11307				   treg, 0);
11308		if (r)
11309			return r;
11310
11311		/* we can do this on the CPU */
11312		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
11313		/* t0.z = tmp_num.y >= t0.z */
11314		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11315				   treg, 1,
11316				   tmp_num, 1,
11317				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11318		if (r)
11319			return r;
11320
11321		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11322				   treg, 1,
11323				   treg, 1,
11324				   treg, 2);
11325		if (r)
11326			return r;
11327
11328		memset(&alu_src, 0, sizeof(alu_src));
11329		alu_src.sel = treg;
11330		alu_src.chan = 1;
11331		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11332		if (r)
11333			return r;
11334
11335		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11336				   tmp_num, 1,
11337				   tmp_num, 1,
11338				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11339		if (r)
11340			return r;
11341
11342		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11343				   tmp_num, 3,
11344				   tmp_num, 3,
11345				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11346		if (r)
11347			return r;
11348
11349		r = tgsi_endif(ctx);
11350		if (r)
11351			return r;
11352	}
11353
11354	/* log2_denom is always <= 31, so manually peel the last loop
11355	 * iteration.
11356	 */
11357	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11358			   treg, 1,
11359			   tmp_num, 1,
11360			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11361	if (r)
11362		return r;
11363
11364	memset(&alu_src, 0, sizeof(alu_src));
11365	alu_src.sel = treg;
11366	alu_src.chan = 1;
11367	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11368	if (r)
11369		return r;
11370
11371	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11372			   tmp_num, 1,
11373			   tmp_num, 1,
11374			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11375	if (r)
11376		return r;
11377
11378	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11379			   tmp_num, 3,
11380			   tmp_num, 3,
11381			   V_SQ_ALU_SRC_LITERAL, 1U);
11382	if (r)
11383		return r;
11384	r = tgsi_endif(ctx);
11385	if (r)
11386		return r;
11387
11388	r = tgsi_endif(ctx);
11389	if (r)
11390		return r;
11391
11392	/* onto the second loop to unroll */
11393	for (i = 0; i < 31; i++) {
11394		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11395				   treg, 1,
11396				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
11397				   treg, 0);
11398		if (r)
11399			return r;
11400
11401		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
11402		r = single_alu_op2(ctx, ALU_OP1_MOV,
11403				   treg, 2,
11404				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11405				   0, 0);
11406		if (r)
11407			return r;
11408
11409		r = single_alu_op2(ctx, ALU_OP1_MOV,
11410				   treg, 3,
11411				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11412				   0, 0);
11413		if (r)
11414			return r;
11415
11416		r = emit_u64sge(ctx, sub_tmp,
11417				tmp_num, 0,
11418				treg, 2);
11419		if (r)
11420			return r;
11421
11422		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11423				   treg, 1,
11424				   treg, 1,
11425				   sub_tmp, 0);
11426		if (r)
11427			return r;
11428
11429		memset(&alu_src, 0, sizeof(alu_src));
11430		alu_src.sel = treg;
11431		alu_src.chan = 1;
11432		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11433		if (r)
11434			return r;
11435
11436
11437		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11438				sub_tmp,
11439				tmp_num, 0,
11440				treg, 2);
11441		if (r)
11442			return r;
11443
11444		r = single_alu_op2(ctx, ALU_OP1_MOV,
11445				   tmp_num, 0,
11446				   sub_tmp, 0,
11447				   0, 0);
11448		if (r)
11449			return r;
11450
11451		r = single_alu_op2(ctx, ALU_OP1_MOV,
11452				   tmp_num, 1,
11453				   sub_tmp, 1,
11454				   0, 0);
11455		if (r)
11456			return r;
11457
11458		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11459				   tmp_num, 2,
11460				   tmp_num, 2,
11461				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11462		if (r)
11463			return r;
11464
11465		r = tgsi_endif(ctx);
11466		if (r)
11467			return r;
11468	}
11469
11470	/* log2_denom is always <= 63, so manually peel the last loop
11471	 * iteration.
11472	 */
11473	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
11474	r = single_alu_op2(ctx, ALU_OP1_MOV,
11475			   treg, 2,
11476			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11477			   0, 0);
11478	if (r)
11479		return r;
11480
11481	r = single_alu_op2(ctx, ALU_OP1_MOV,
11482			   treg, 3,
11483			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11484			   0, 0);
11485	if (r)
11486		return r;
11487
11488	r = emit_u64sge(ctx, sub_tmp,
11489			tmp_num, 0,
11490			treg, 2);
11491	if (r)
11492		return r;
11493
11494	memset(&alu_src, 0, sizeof(alu_src));
11495	alu_src.sel = sub_tmp;
11496	alu_src.chan = 0;
11497	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11498	if (r)
11499		return r;
11500
11501	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11502			sub_tmp,
11503			tmp_num, 0,
11504			treg, 2);
11505	if (r)
11506		return r;
11507
11508	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11509			   tmp_num, 2,
11510			   tmp_num, 2,
11511			   V_SQ_ALU_SRC_LITERAL, 1U);
11512	if (r)
11513		return r;
11514	r = tgsi_endif(ctx);
11515	if (r)
11516		return r;
11517
11518	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11519	alu.op = ALU_OP1_MOV;
11520	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11521	alu.src[0].sel = tmp_num;
11522	alu.src[0].chan = 2;
11523	r = r600_bytecode_add_alu(ctx->bc, &alu);
11524	if (r)
11525		return r;
11526
11527	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11528	alu.op = ALU_OP1_MOV;
11529	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11530	alu.src[0].sel = tmp_num;
11531	alu.src[0].chan = 3;
11532	alu.last = 1;
11533	r = r600_bytecode_add_alu(ctx->bc, &alu);
11534	if (r)
11535		return r;
11536	return 0;
11537}
11538
11539static int egcm_u64sne(struct r600_shader_ctx *ctx)
11540{
11541	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11542	struct r600_bytecode_alu alu;
11543	int r;
11544	int treg = ctx->temp_reg;
11545
11546	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11547	alu.op = ALU_OP2_SETNE_INT;
11548	alu.dst.sel = treg;
11549	alu.dst.chan = 0;
11550	alu.dst.write = 1;
11551	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11552	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11553	r = r600_bytecode_add_alu(ctx->bc, &alu);
11554	if (r)
11555		return r;
11556
11557	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11558	alu.op = ALU_OP2_SETNE_INT;
11559	alu.dst.sel = treg;
11560	alu.dst.chan = 1;
11561	alu.dst.write = 1;
11562	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11563	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11564	alu.last = 1;
11565	r = r600_bytecode_add_alu(ctx->bc, &alu);
11566	if (r)
11567		return r;
11568
11569	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11570	alu.op = ALU_OP2_OR_INT;
11571	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11572	alu.src[0].sel = treg;
11573	alu.src[0].chan = 0;
11574	alu.src[1].sel = treg;
11575	alu.src[1].chan = 1;
11576	alu.last = 1;
11577	r = r600_bytecode_add_alu(ctx->bc, &alu);
11578	if (r)
11579		return r;
11580	return 0;
11581}
11582
/* TGSI -> hardware dispatch table for R600/R700 (pre-Evergreen) GPUs.
 *
 * Indexed by TGSI_OPCODE_*; each entry pairs a hardware opcode (ALU, fetch,
 * or CF -- ALU_OP0_NOP / 0 when the emit callback picks its own ops) with
 * the function that lowers that TGSI instruction to r600 bytecode.
 * Bare numeric indices are gaps left by TGSI opcodes that have been
 * removed upstream; they map to tgsi_unsupported.
 */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
11783
/* TGSI -> hardware dispatch table for Evergreen GPUs.
 *
 * Same layout as r600_shader_tgsi_instruction: indexed by TGSI_OPCODE_*,
 * each entry pairs a hardware opcode with the emit callback.  Evergreen
 * adds support for doubles, 64-bit integer ops, images/atomics, barriers,
 * bitfield ops and half-float pack/unpack relative to the R600 table.
 * Bare numeric indices are gaps left by removed TGSI opcodes.
 */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
12009
/*
 * Cayman (CM) dispatch table: indexed by TGSI opcode, each entry pairs a
 * hardware opcode (ALU/FETCH/CF instruction, or 0/NOP when the emit
 * callback chooses the instruction itself) with the callback that emits it.
 * Per the CAYMAN notes at the top of this file, ops that are t-slot-only on
 * earlier parts are implemented in all vector slots on Cayman, hence the
 * cayman_* emitters here where the evergreen table above uses trans-slot
 * variants (e.g. IMUL_HI: cayman_mul_int_instr vs tgsi_op2_trans).
 */
12010static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12011	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
12012	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
12013	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
12014	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12015	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12016	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
12017	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
12018	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
12019	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
12020	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12021	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12022	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
12023	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
12024	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	/* SLT/SLE have no direct hw op; implemented as swapped-operand SETGT/SETGE. */
12025	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
12026	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
12027	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
12028	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
12029	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
12030	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	/* Bare numeric indices are TGSI opcode numbers with no symbolic name in
	 * this tree (retired/reserved slots); all map to tgsi_unsupported except
	 * [103] further down. */
12031	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
12032	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
12033	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
12034	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
12035	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
12036	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
12037	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
12038	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12039	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12040	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
12041	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
12042	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
12043	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
12044	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
12045	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
12046	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
12047	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12048	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12049	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
12050	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
12051	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12052	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12053	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12054	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
12055	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
12056	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
12057	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
12058	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
12059	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
12060	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
12061	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
12062	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
12063	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
12064	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
12065	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
12066	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12067	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12068	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12069	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
12070	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
12071	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
12072	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
12073	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
12074	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
12075	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
12076	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
12077	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
12078	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12079	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
12080	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
12081	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12082	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12083	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12084	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
12085	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
12086	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
12087	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
12088	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* NOTE(review): _FINE derivatives share the plain DDX/DDY fetch ops here;
	 * any fine/coarse distinction would have to happen in tgsi_tex — confirm. */
12089	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12090	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12091	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
12092	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
12093	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
12094	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
12095	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
12096	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
12097	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
12098	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
12099	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
12100	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
12101	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
12102	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
12103	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
12104	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12105	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12106	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12107	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12108	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
12109	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12110	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
12111	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* [103] is the one numeric slot that is NOT unsupported — NOTE(review):
	 * presumably the retired TXQ_LZ opcode (resinfo via tgsi_tex); confirm
	 * against the TGSI opcode history. */
12112	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12113	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12114	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12115	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
12116	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
12117	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
12118	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
12119	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12120	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12121	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12122	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
12123	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
12124	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
12125	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
12126	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
12127	/* Refer below for TGSI_OPCODE_DFMA */
12128	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
12129	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
12130	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
12131	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
12132	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
12133	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
12134	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
12135	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12136	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12137	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12138	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
12139	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
12140	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
12141	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
12142	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
12143	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
12144	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12145	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
12146	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
12147	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
12148	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12149	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
12150	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12151	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
12152	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
12153	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* D3D-style SAMPLE_* opcodes are not implemented by this backend. */
12154	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
12155	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
12156	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
12157	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
12158	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
12159	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
12160	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
12161	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
12162	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
12163	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
12164	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
12165	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
12166	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12167	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
12168	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
12169	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
12170	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
12171	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
12172	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
12173	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
12174	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
12175	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	/* Atomics map to RAT (Random Access Target) *_RTN instructions, all via
	 * the common tgsi_atomic_op emitter. */
12176	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12177	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12178	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12179	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12180	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12181	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12182	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12183	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12184	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12185	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12186	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
12187	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12188	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* Cayman: MULHI runs in the vector slots (cayman_mul_int_instr) instead
	 * of the trans-slot path used by the evergreen table above. */
12189	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
12190	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
12191	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
12192	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
12193	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
12194	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
12195	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
12196	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
12197	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
12198	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
12199	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
12200	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
12201	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12202	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12203	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* Double-precision (64-bit) ops follow. */
12204	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12205	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	/* DABS is a plain MOV: the emitter applies the abs source modifier. */
12206	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
12207	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
12208	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
12209	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
12210	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
12211	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
12212	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
12213	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12214	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12215	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12216	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12217	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12218	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12219	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12220	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12221	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
12222	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
12223	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
12224	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12225	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12226	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12227	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12228	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/* 64-bit unsigned integer ops, lowered entirely inside the emitters. */
12229	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
12230	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
12231	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
12232	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
	/* Sentinel: keeps the array sized to cover every TGSI opcode. */
12233	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
12234};
12235