1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_formats.h"
25#include "r600_opcodes.h"
26#include "r600_shader.h"
27#include "r600d.h"
28
29#include "sb/sb_public.h"
30
31#include "pipe/p_shader_tokens.h"
32#include "tgsi/tgsi_info.h"
33#include "tgsi/tgsi_parse.h"
34#include "tgsi/tgsi_scan.h"
35#include "tgsi/tgsi_dump.h"
36#include "util/u_bitcast.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63/* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66      .y = RelVertexID (??)
67      .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70      r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73       .y = RelPatchID (??)
74       .z = InvocationID
75       .w = tess factor base.
76
77 TES - .x = TessCoord.x
78     - .y = TessCoord.y
79     - .z = RelPatchID (??)
80     - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83      face_gpr.w = SampleID
84*/
85#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86static int r600_shader_from_tgsi(struct r600_context *rctx,
87				 struct r600_pipe_shader *pipeshader,
88				 union r600_shader_key key);
89
90static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91                           int size, unsigned comp_mask) {
92
93	if (!size)
94		return;
95
96	if (ps->num_arrays == ps->max_arrays) {
97		ps->max_arrays += 64;
98		ps->arrays = realloc(ps->arrays, ps->max_arrays *
99		                     sizeof(struct r600_shader_array));
100	}
101
102	int n = ps->num_arrays;
103	++ps->num_arrays;
104
105	ps->arrays[n].comp_mask = comp_mask;
106	ps->arrays[n].gpr_start = start_gpr;
107	ps->arrays[n].gpr_count = size;
108}
109
110static void r600_dump_streamout(struct pipe_stream_output_info *so)
111{
112	unsigned i;
113
114	fprintf(stderr, "STREAMOUT\n");
115	for (i = 0; i < so->num_outputs; i++) {
116		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117				so->output[i].start_component;
118		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119			i,
120			so->output[i].stream,
121			so->output[i].output_buffer,
122			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123			so->output[i].register_index,
124			mask & 1 ? "x" : "",
125		        mask & 2 ? "y" : "",
126		        mask & 4 ? "z" : "",
127		        mask & 8 ? "w" : "",
128			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129	}
130}
131
132static int store_shader(struct pipe_context *ctx,
133			struct r600_pipe_shader *shader)
134{
135	struct r600_context *rctx = (struct r600_context *)ctx;
136	uint32_t *ptr, i;
137
138	if (shader->bo == NULL) {
139		shader->bo = (struct r600_resource*)
140			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141		if (shader->bo == NULL) {
142			return -ENOMEM;
143		}
144		ptr = r600_buffer_map_sync_with_rings(
145			&rctx->b, shader->bo,
146			PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
147		if (R600_BIG_ENDIAN) {
148			for (i = 0; i < shader->shader.bc.ndw; ++i) {
149				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
150			}
151		} else {
152			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
153		}
154		rctx->b.ws->buffer_unmap(shader->bo->buf);
155	}
156
157	return 0;
158}
159
/* Compile a shader: translate TGSI to r600 bytecode, optionally run the
 * sb optimizer, upload the bytecode, and build the per-stage hw state.
 *
 * On any failure the partially built shader is torn down via
 * r600_pipe_shader_destroy and a negative errno-style code is returned;
 * returns 0 on success.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	/* sb = the optimizing shader backend; can be globally disabled. */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	/* Step 1: TGSI -> r600 bytecode translation. */
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	/* Step 2: decide whether the sb optimizer may run.  Several shader
	 * variants/features are not supported by sb and must bypass it. */
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Step 3: disassemble and/or run the sb pass.  sb also provides its
	 * own disassembler, so plain disasm is only used when sb is off. */
	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* Step 4: the GS copy shader (if any) is processed and uploaded
	 * separately from the main shader. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Step 5: build the stage-specific hardware state; evergreen+ and
	 * pre-evergreen chips use different state builders. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
300
/* Release everything a compiled shader owns: the uploaded bytecode
 * buffer, the CPU-side bytecode, and the command buffer.  Safe to call
 * on a partially constructed shader (used on the create error path). */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
307
308/*
309 * tgsi -> r600 shader
310 */
311struct r600_shader_tgsi_instruction;
312
/* Decoded form of a TGSI source operand as used by the translator. */
struct r600_shader_src {
	unsigned				sel;        /* register select */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indirect) addressing */
	unsigned				kc_bank;    /* constant-cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when the operand is immediate */
};
323
/* Per-interpolator bookkeeping for evergreen FS inputs: whether the
 * interpolator is used and the i/j barycentric pair index assigned to it. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
328
/* All state carried through one TGSI -> r600 translation run. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;          /* scan results for the TGSI program */
	struct tgsi_array_info			*array_infos;  /* declared temp-array ranges */
	/* flag for each tgsi temp array if its been spilled or not */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;          /* PIPE_SHADER_* stage being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI register file */
	unsigned				temp_reg;      /* first driver-reserved temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info; /* handler entry for the current opcode */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];        /* decoded sources of the current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* high-water mark for r600_get_temp() */
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;      /* output index of the clip-vertex output */
	unsigned		edgeflag_output;        /* output index of the edge-flag output */
	int					helper_invoc_reg;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;   /* input index of TGSI_SEMANTIC_POSITION in FS */
	int					next_ring_offset;  /* running byte offset for GS ring inputs */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess input offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
};
373
/* Table entry mapping one TGSI opcode to its hw opcode and emit callback
 * (see the r600/eg/cm instruction tables declared below). */
struct r600_shader_tgsi_instruction {
	unsigned	op;	/* hw ALU/instruction opcode used by the handler */
	int (*process)(struct r600_shader_ctx *ctx);	/* emits bytecode; returns 0 or -errno */
};
378
379static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
380static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
381static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
382static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
383static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
384static int tgsi_else(struct r600_shader_ctx *ctx);
385static int tgsi_endif(struct r600_shader_ctx *ctx);
386static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
387static int tgsi_endloop(struct r600_shader_ctx *ctx);
388static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
389static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
390                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
391                                unsigned int dst_reg);
392static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
393			const struct r600_shader_src *shader_src,
394			unsigned chan);
395static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
396			       unsigned dst_reg, unsigned mask);
397
398static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
399{
400	if (ctx->bc->family == CHIP_HEMLOCK ||
401	    ctx->bc->family == CHIP_CYPRESS ||
402	    ctx->bc->family == CHIP_JUNIPER)
403		return false;
404	return true;
405}
406
/* Return the index of the highest channel set in a 4-bit writemask.
 * An empty mask yields 0 (same as a mask writing only .x). */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
418
/* Validate the current TGSI instruction against what this backend can
 * translate: at most one destination (except DFRACEXP) and only certain
 * 2D (dimensioned) register accesses.  Returns 0 if supported, -EINVAL
 * otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
		   case TGSI_FILE_HW_ATOMIC:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == PIPE_SHADER_GEOMETRY ||
			       ctx->type == PIPE_SHADER_TESS_CTRL ||
			       ctx->type == PIPE_SHADER_TESS_EVAL)
				   break;
			   /* fallthrough: 2D inputs are an error in other stages */
		   case TGSI_FILE_OUTPUT:
			   if (ctx->type == PIPE_SHADER_TESS_CTRL)
				   break;
			   /* fallthrough: 2D outputs only allowed in TCS */
		   default:
			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
				    i->Src[j].Register.File,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* only TCS may write dimensioned (per-vertex) outputs */
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
466
467int eg_get_interpolator_index(unsigned interpolate, unsigned location)
468{
469	if (interpolate == TGSI_INTERPOLATE_COLOR ||
470		interpolate == TGSI_INTERPOLATE_LINEAR ||
471		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
472	{
473		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
474		int loc;
475
476		switch(location) {
477		case TGSI_INTERPOLATE_LOC_CENTER:
478			loc = 1;
479			break;
480		case TGSI_INTERPOLATE_LOC_CENTROID:
481			loc = 2;
482			break;
483		case TGSI_INTERPOLATE_LOC_SAMPLE:
484		default:
485			loc = 0; break;
486		}
487
488		return is_linear * 3 + loc;
489	}
490
491	return -1;
492}
493
/* Copy the ij (barycentric pair) index reserved for this input's
 * interpolation mode/location from ctx->eg_interpolators onto the
 * shader input, for later use by evergreen_interp_alu(). */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	/* the mode was validated at declaration time, so a slot must exist */
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
503
/* Emit the interpolation ALU sequence for one interpolated FS input.
 *
 * i/j barycentric pairs are packed two per GPR, so the pair's GPR and
 * starting channel are derived from ij_index below.  Eight ops are
 * emitted: four INTERP_ZW then four INTERP_XY; only the middle four
 * (i = 2..5) enable the destination write, together covering all four
 * channels of the input's GPR.  NOTE(review): the forced SQ_ALU_VEC_210
 * bank swizzle and the z,w-before-x,y ordering presumably match the hw
 * interpolation requirements — confirm against the evergreen ISA docs.
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only the ops that actually produce a channel write back */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the j and i values of the pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		/* parameter (attribute) data comes from the LDS slot */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		/* close each group of four co-issued slots */
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
544
545static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
546{
547	int i, r;
548	struct r600_bytecode_alu alu;
549
550	for (i = 0; i < 4; i++) {
551		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
552
553		alu.op = ALU_OP1_INTERP_LOAD_P0;
554
555		alu.dst.sel = ctx->shader->input[input].gpr;
556		alu.dst.write = 1;
557
558		alu.dst.chan = i;
559
560		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
561		alu.src[0].chan = i;
562
563		if (i == 3)
564			alu.last = 1;
565		r = r600_bytecode_add_alu(ctx->bc, &alu);
566		if (r)
567			return r;
568	}
569	return 0;
570}
571
572/*
573 * Special export handling in shaders
574 *
575 * shader export ARRAY_BASE for EXPORT_POS:
576 * 60 is position
577 * 61 is misc vector
578 * 62, 63 are clip distance vectors
579 *
580 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
581 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
582 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
583 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
584 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
585 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
586 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
587 * exclusive from render target index)
588 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
589 *
590 *
591 * shader export ARRAY_BASE for EXPORT_PIXEL:
592 * 0-7 CB targets
593 * 61 computed Z vector
594 *
595 * The use of the values exported in the computed Z vector are controlled
596 * by DB_SHADER_CONTROL:
597 * Z_EXPORT_ENABLE - Z as a float in RED
598 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
599 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
600 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
601 * DB_SOURCE_FORMAT - export control restrictions
602 *
603 */
604
605
606/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
607static int r600_spi_sid(struct r600_shader_io * io)
608{
609	int index, name = io->name;
610
611	/* These params are handled differently, they don't need
612	 * semantic indices, so we'll use 0 for them.
613	 */
614	if (name == TGSI_SEMANTIC_POSITION ||
615	    name == TGSI_SEMANTIC_PSIZE ||
616	    name == TGSI_SEMANTIC_EDGEFLAG ||
617	    name == TGSI_SEMANTIC_FACE ||
618	    name == TGSI_SEMANTIC_SAMPLEMASK)
619		index = 0;
620	else {
621		if (name == TGSI_SEMANTIC_GENERIC) {
622			/* For generic params simply use sid from tgsi */
623			index = io->sid;
624		} else {
625			/* For non-generic params - pack name and sid into 8 bits */
626			index = 0x80 | (name<<3) | (io->sid);
627		}
628
629		/* Make sure that all really used indices have nonzero value, so
630		 * we can just compare it to 0 later instead of comparing the name
631		 * with different values to detect special cases. */
632		index++;
633	}
634
635	return index;
636};
637
638/* we need this to get a common lds index for vs/tcs/tes input/outputs */
639int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
640{
641	switch (semantic_name) {
642	case TGSI_SEMANTIC_POSITION:
643		return 0;
644	case TGSI_SEMANTIC_PSIZE:
645		return 1;
646	case TGSI_SEMANTIC_CLIPDIST:
647		assert(index <= 1);
648		return 2 + index;
649	case TGSI_SEMANTIC_GENERIC:
650		if (index <= 63-4)
651			return 4 + index - 9;
652		else
653			/* same explanation as in the default statement,
654			 * the only user hitting this is st/nine.
655			 */
656			return 0;
657
658	/* patch indices are completely separate and thus start from 0 */
659	case TGSI_SEMANTIC_TESSOUTER:
660		return 0;
661	case TGSI_SEMANTIC_TESSINNER:
662		return 1;
663	case TGSI_SEMANTIC_PATCH:
664		return 2 + index;
665
666	default:
667		/* Don't fail here. The result of this function is only used
668		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
669		 * occur, but this function is called for all vertex shaders
670		 * before it's known whether LS will be compiled or not.
671		 */
672		return 0;
673	}
674}
675
676/* turn input into interpolate on EG */
677static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
678{
679	int r = 0;
680
681	if (ctx->shader->input[index].spi_sid) {
682		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
683		if (ctx->shader->input[index].interpolate > 0) {
684			evergreen_interp_assign_ij_index(ctx, index);
685			r = evergreen_interp_alu(ctx, index);
686		} else {
687			r = evergreen_interp_flat(ctx, index);
688		}
689	}
690	return r;
691}
692
693static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
694{
695	struct r600_bytecode_alu alu;
696	int i, r;
697	int gpr_front = ctx->shader->input[front].gpr;
698	int gpr_back = ctx->shader->input[back].gpr;
699
700	for (i = 0; i < 4; i++) {
701		memset(&alu, 0, sizeof(alu));
702		alu.op = ALU_OP3_CNDGT;
703		alu.is_op3 = 1;
704		alu.dst.write = 1;
705		alu.dst.sel = gpr_front;
706		alu.src[0].sel = ctx->face_gpr;
707		alu.src[1].sel = gpr_front;
708		alu.src[2].sel = gpr_back;
709
710		alu.dst.chan = i;
711		alu.src[1].chan = i;
712		alu.src[2].chan = i;
713		alu.last = (i==3);
714
715		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
716			return r;
717	}
718
719	return 0;
720}
721
/* Execute a single-slot op2 ALU calculation writing one channel of dst.
 *
 * Each source is a (sel, chan_or_value) pair: when sel is
 * V_SQ_ALU_SRC_LITERAL the second value is the 32-bit literal itself,
 * otherwise it is the source channel index.
 */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	/* On CAYMAN, MULLO_INT is a t-slot-only op implemented across the
	 * vector slots (see the CAYMAN notes at the top of this file): emit
	 * it in all four slots and only enable the write on dst_chan. */
	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* common case: a single op in one slot */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
777
778/* execute a single slot ALU calculation */
779static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
780			  int dst_sel, int dst_chan,
781			  int src0_sel, unsigned src0_chan_val,
782			  int src1_sel, unsigned src1_chan_val,
783			  int src2_sel, unsigned src2_chan_val)
784{
785	struct r600_bytecode_alu alu;
786	int r;
787
788	/* validate this for other ops */
789	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
790	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
791	alu.op = op;
792	alu.src[0].sel = src0_sel;
793	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
794		alu.src[0].value = src0_chan_val;
795	else
796		alu.src[0].chan = src0_chan_val;
797	alu.src[1].sel = src1_sel;
798	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
799		alu.src[1].value = src1_chan_val;
800	else
801		alu.src[1].chan = src1_chan_val;
802	alu.src[2].sel = src2_sel;
803	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
804		alu.src[2].value = src2_chan_val;
805	else
806		alu.src[2].chan = src2_chan_val;
807	alu.dst.sel = dst_sel;
808	alu.dst.chan = dst_chan;
809	alu.is_op3 = 1;
810	alu.last = 1;
811	r = r600_bytecode_add_alu(ctx->bc, &alu);
812	if (r)
813		return r;
814	return 0;
815}
816
817/* put it in temp_reg.x */
818static int get_lds_offset0(struct r600_shader_ctx *ctx,
819			   int rel_patch_chan,
820			   int temp_reg, bool is_patch_var)
821{
822	int r;
823
824	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
825	/* ADD
826	   Dimension - patch0_offset (input_vals.z),
827	   Non-dim - patch0_data_offset (input_vals.w)
828	*/
829	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
830			   temp_reg, 0,
831			   ctx->tess_output_info, 0,
832			   0, rel_patch_chan,
833			   ctx->tess_output_info, is_patch_var ? 3 : 2);
834	if (r)
835		return r;
836	return 0;
837}
838
839static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
840{
841	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
842}
843
/* Allocate the next free driver-reserved temp GPR (above temp_reg);
 * max_driver_temp_used tracks the high-water mark per instruction. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}
848
849static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
850{
851	int i;
852	i = ctx->shader->noutput++;
853	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
854	ctx->shader->output[i].sid = 0;
855	ctx->shader->output[i].gpr = 0;
856	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
857	ctx->shader->output[i].write_mask = 0x4;
858	ctx->shader->output[i].spi_sid = prim_id_sid;
859
860	return 0;
861}
862
863static int tgsi_barrier(struct r600_shader_ctx *ctx)
864{
865	struct r600_bytecode_alu alu;
866	int r;
867
868	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
869	alu.op = ctx->inst_info->op;
870	alu.last = 1;
871
872	r = r600_bytecode_add_alu(ctx->bc, &alu);
873	if (r)
874		return r;
875	return 0;
876}
877
/* Greedy spill selection: repeatedly spill the largest unspilled temp
 * array until the GPR count (*regno) fits the budget or no arrays
 * remain.  *scratch_space_needed accumulates the spilled vec4 count.
 * When every array has been spilled, indirect temp addressing is cleared
 * since no GPR arrays are left to index. */
static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; // assumed calloc:ed

	*scratch_space_needed = 0;
	/* 124 is the GPR budget here — presumably the hw limit minus
	 * reserved registers; TODO confirm */
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		/* find the largest array not yet spilled */
		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		narrays_left --;
	}

	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}
910
/* Take spilled temp arrays into account when translating tgsi register
 * indexes into r600 gprs if spilled is false, or scratch array offset if
 * spilled is true */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	unsigned spilled_size = 0;

	/* NOTE(review): the early break below assumes array_infos is sorted
	 * by range.First — confirm against how the scan pass fills it. */
	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		/* arrays past this point all start above the index */
		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		/* spilled arrays before the index leave holes in GPR space */
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}
945
/* look up spill area base offset and array size for a spilled temp array.
 * NOTE(review): if tgsi_reg_index falls in no spilled array the outputs
 * are left unmodified — callers presumably only pass indices known to be
 * spilled (via map_tgsi_reg_index_to_r600_gpr); confirm. */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
	unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0;	/* running base: spilled arrays are packed in order */

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
}
968
/* Translate one TGSI declaration token into r600 shader state.
 *
 * Fills the shader's input[]/output[] tables, assigns ESGS ring offsets
 * for GS inputs, records GPR arrays for indirectly addressed temporaries
 * (unless spilled), tracks hw atomic ranges, and for the tessellation
 * system values emits the LDS fetch code that materializes them in fixed
 * registers.  Returns 0 on success or a negative error code. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			/* GPRs for inputs are pre-reserved starting at file_offset[INPUT]. */
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* Evergreen+ needs explicit interpolation instructions per input. */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				/* each GS input occupies one 16-byte slot in the ESGS ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* Track which components of the "misc" VS output vector
				 * (point size / edge flag / viewport / layer) are written. */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				/* Indirectly addressed temp arrays need a GPR array
				 * reservation — unless they were spilled to scratch,
				 * in which case they don't live in GPRs at all. */
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		/* nothing to record at declaration time for these files */
		break;

	case TGSI_FILE_HW_ATOMIC:
		/* NOTE(review): no bounds check on atomics[] here — presumably
		 * the range count is limited upstream; verify against the
		 * declaration of ctx->shader->atomics. */
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* Tess factors live in LDS: compute the dword address
			 * (LDS base + param * 16 bytes) into a temp and fetch
			 * the value into a fixed register (r3 for TESSINNER,
			 * r2 for TESSOUTER). */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* Reconstruct the third barycentric coordinate in r1.z
			 * from the two passed in r0.xy: z = 1 - x - y. */
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1190
/* Pre-scan the TGSI tokens for system values and interpolateAt* usage and
 * reserve input GPRs for the sample mask / sample id / sample pos system
 * values that are needed.
 *
 * On Evergreen the interpolator ij pairs are assigned first and their GPR
 * block is reserved ahead of the system-value GPRs.  gpr_offset is the
 * first GPR available for these reservations; the function returns the
 * next free GPR after all allocations, and records the allocated GPRs
 * through the inputs[].reg pointers (ctx->face_gpr /
 * ctx->fixed_pt_position_gpr). */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;             /* where to store the allocated GPR */
		unsigned name, alternate_name;  /* TGSI semantics that trigger this input */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				/* Map each interpolateAt* opcode to the interpolation
				 * location whose ij pair it needs. */
				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* Reading gl_SampleMaskIn with per-sample interpolation also needs the
	 * fixed point position input. */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* two ij pairs fit in one GPR */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1290
1291/*
1292 * for evergreen we need to scan the shader to find the number of GPRs we need to
1293 * reserve for interpolation and system values
1294 *
1295 * we need to know if we are going to emit any sample or centroid inputs
1296 * if perspective and linear are required
1297*/
1298static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1299{
1300	unsigned i;
1301
1302	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1303
1304	/*
1305	 * Could get this information from the shader info. But right now
1306	 * we interpolate all declared inputs, whereas the shader info will
1307	 * only contain the bits if the inputs are actually used, so it might
1308	 * not be safe...
1309	 */
1310	for (i = 0; i < ctx->info.num_inputs; i++) {
1311		int k;
1312		/* skip position/face/mask/sampleid */
1313		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1314		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1315		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1316		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1317			continue;
1318
1319		k = eg_get_interpolator_index(
1320			ctx->info.input_interpolate[i],
1321			ctx->info.input_interpolate_loc[i]);
1322		if (k >= 0)
1323			ctx->eg_interpolators[k].enabled = TRUE;
1324	}
1325
1326	/* XXX PULL MODEL and LINE STIPPLE */
1327
1328	return allocate_system_value_inputs(ctx, 0);
1329}
1330
1331/* sample_id_sel == NULL means fetch for current sample */
1332static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1333{
1334	struct r600_bytecode_vtx vtx;
1335	int r, t1;
1336
1337	t1 = r600_get_temp(ctx);
1338
1339	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1340	vtx.op = FETCH_OP_VFETCH;
1341	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1342	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1343	if (sample_id == NULL) {
1344		assert(ctx->fixed_pt_position_gpr != -1);
1345
1346		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1347		vtx.src_sel_x = 3;
1348	}
1349	else {
1350		struct r600_bytecode_alu alu;
1351
1352		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1353		alu.op = ALU_OP1_MOV;
1354		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1355		alu.dst.sel = t1;
1356		alu.dst.write = 1;
1357		alu.last = 1;
1358		r = r600_bytecode_add_alu(ctx->bc, &alu);
1359		if (r)
1360			return r;
1361
1362		vtx.src_gpr = t1;
1363		vtx.src_sel_x = 0;
1364	}
1365	vtx.mega_fetch_count = 16;
1366	vtx.dst_gpr = t1;
1367	vtx.dst_sel_x = 0;
1368	vtx.dst_sel_y = 1;
1369	vtx.dst_sel_z = 2;
1370	vtx.dst_sel_w = 3;
1371	vtx.data_format = FMT_32_32_32_32_FLOAT;
1372	vtx.num_format_all = 2;
1373	vtx.format_comp_all = 1;
1374	vtx.use_const_fields = 0;
1375	vtx.offset = 0;
1376	vtx.endian = r600_endian_swap(32);
1377	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1378
1379	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1380	if (r)
1381		return r;
1382
1383	return t1;
1384}
1385
1386static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
1387{
1388	int r;
1389	struct r600_bytecode_alu alu;
1390
1391	/* do a vtx fetch with wqm set on the vtx fetch */
1392	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1393	alu.op = ALU_OP1_MOV;
1394	alu.dst.sel = ctx->helper_invoc_reg;
1395	alu.dst.chan = 0;
1396	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1397	alu.src[0].value = 0xffffffff;
1398	alu.dst.write = 1;
1399	alu.last = 1;
1400	r = r600_bytecode_add_alu(ctx->bc, &alu);
1401	if (r)
1402		return r;
1403
1404	/* do a vtx fetch in VPM mode */
1405	struct r600_bytecode_vtx vtx;
1406	memset(&vtx, 0, sizeof(vtx));
1407	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
1408	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1409	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1410	vtx.src_gpr = 0;
1411	vtx.mega_fetch_count = 16; /* no idea here really... */
1412	vtx.dst_gpr = ctx->helper_invoc_reg;
1413	vtx.dst_sel_x = 4;
1414	vtx.dst_sel_y = 7;		/* SEL_Y */
1415	vtx.dst_sel_z = 7;		/* SEL_Z */
1416	vtx.dst_sel_w = 7;		/* SEL_W */
1417	vtx.data_format = FMT_32;
1418	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
1419		return r;
1420	ctx->bc->cf_last->vpm = 1;
1421	return 0;
1422}
1423
1424static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1425{
1426	int r;
1427	struct r600_bytecode_alu alu;
1428
1429	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1430	alu.op = ALU_OP1_MOV;
1431	alu.dst.sel = ctx->helper_invoc_reg;
1432	alu.dst.chan = 0;
1433	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1434	alu.src[0].value = 0xffffffff;
1435	alu.dst.write = 1;
1436	alu.last = 1;
1437	r = r600_bytecode_add_alu(ctx->bc, &alu);
1438	if (r)
1439		return r;
1440
1441	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1442	alu.op = ALU_OP1_MOV;
1443	alu.dst.sel = ctx->helper_invoc_reg;
1444	alu.dst.chan = 0;
1445	alu.src[0].sel = V_SQ_ALU_SRC_0;
1446	alu.dst.write = 1;
1447	alu.last = 1;
1448	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1449	if (r)
1450		return r;
1451
1452	return ctx->helper_invoc_reg;
1453}
1454
1455static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1456{
1457	struct r600_bytecode_vtx vtx;
1458	int r, t1;
1459
1460	if (ctx->cs_block_size_loaded)
1461		return ctx->cs_block_size_reg;
1462	if (ctx->cs_grid_size_loaded)
1463		return ctx->cs_grid_size_reg;
1464
1465	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1466	struct r600_bytecode_alu alu;
1467	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1468	alu.op = ALU_OP1_MOV;
1469	alu.src[0].sel = V_SQ_ALU_SRC_0;
1470	alu.dst.sel = t1;
1471	alu.dst.write = 1;
1472	alu.last = 1;
1473	r = r600_bytecode_add_alu(ctx->bc, &alu);
1474	if (r)
1475		return r;
1476
1477	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1478	vtx.op = FETCH_OP_VFETCH;
1479	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1480	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1481	vtx.src_gpr = t1;
1482	vtx.src_sel_x = 0;
1483
1484	vtx.mega_fetch_count = 16;
1485	vtx.dst_gpr = t1;
1486	vtx.dst_sel_x = 0;
1487	vtx.dst_sel_y = 1;
1488	vtx.dst_sel_z = 2;
1489	vtx.dst_sel_w = 7;
1490	vtx.data_format = FMT_32_32_32_32;
1491	vtx.num_format_all = 1;
1492	vtx.format_comp_all = 0;
1493	vtx.use_const_fields = 0;
1494	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1495	vtx.endian = r600_endian_swap(32);
1496	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1497
1498	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1499	if (r)
1500		return r;
1501
1502	if (load_block)
1503		ctx->cs_block_size_loaded = true;
1504	else
1505		ctx->cs_grid_size_loaded = true;
1506	return t1;
1507}
1508
/* Translate a TGSI source operand into an r600_shader_src.
 *
 * Handles: temporaries (including reloading spilled temp arrays from
 * scratch memory into a fresh temp GPR), immediates (folding splatted
 * immediates into inline hw constants when possible), system values
 * (mapped to their fixed GPRs/channels or loaded on demand), and
 * constant-buffer bank/relative addressing. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			/* Spilled temp: read it back from scratch memory into a
			 * fresh temp GPR and source from that instead. */
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				/* pre-R700: scratch reads go through a CF export-read */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					/* indexed read: AR register supplies the element index */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					/* direct read of a single element */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a dedicated vtx fetch op */
				struct r600_bytecode_vtx vtx;

				/* flush any pending spill write before reading it back */
				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			/* NOTE(review): this function returns void, so emission
			 * failures here are silently dropped. */
			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* A splatted immediate (same value in all four channels) may be
		 * replaceable by an inline hw constant (0, 1, 0.5, ...). */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise emit the four dwords as a literal operand */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* Each system value lives in a fixed GPR/channel (or is fetched
		 * on demand); override sel/swizzle accordingly. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TESS_CTRL case (the non-TESS_CTRL one matched above) */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* plain register file: offset the index into the file's GPR window */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants select a constant-buffer bank, possibly indirectly */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1721
1722static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1723                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1724                                unsigned int dst_reg)
1725{
1726	struct r600_bytecode_vtx vtx;
1727	unsigned int ar_reg;
1728	int r;
1729
1730	if (offset) {
1731		struct r600_bytecode_alu alu;
1732
1733		memset(&alu, 0, sizeof(alu));
1734
1735		alu.op = ALU_OP2_ADD_INT;
1736		alu.src[0].sel = ctx->bc->ar_reg;
1737		alu.src[0].chan = ar_chan;
1738
1739		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1740		alu.src[1].value = offset;
1741
1742		alu.dst.sel = dst_reg;
1743		alu.dst.chan = ar_chan;
1744		alu.dst.write = 1;
1745		alu.last = 1;
1746
1747		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1748			return r;
1749
1750		ar_reg = dst_reg;
1751	} else {
1752		ar_reg = ctx->bc->ar_reg;
1753	}
1754
1755	memset(&vtx, 0, sizeof(vtx));
1756	vtx.buffer_id = cb_idx;
1757	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1758	vtx.src_gpr = ar_reg;
1759	vtx.src_sel_x = ar_chan;
1760	vtx.mega_fetch_count = 16;
1761	vtx.dst_gpr = dst_reg;
1762	vtx.dst_sel_x = 0;		/* SEL_X */
1763	vtx.dst_sel_y = 1;		/* SEL_Y */
1764	vtx.dst_sel_z = 2;		/* SEL_Z */
1765	vtx.dst_sel_w = 3;		/* SEL_W */
1766	vtx.data_format = FMT_32_32_32_32_FLOAT;
1767	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1768	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1769	vtx.endian = r600_endian_swap(32);
1770	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1771
1772	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1773		return r;
1774
1775	return 0;
1776}
1777
/* Fetch one GS per-vertex input from the ESGS ring into dst_reg.
 *
 * The per-vertex ring offsets are passed to the GS in R0.x, R0.y, R0.w,
 * R1.x, R1.y, R1.z (R0.z carries PrimitiveID), so the vertex dimension
 * selects a channel of a "rotated input" register.  Both the vertex
 * dimension and the attribute index may be indirect, in which case the
 * effective ring offset is computed into a temp first.
 * Returns 0 on success or a negative error code. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* vertex 2's offset lives in .w, not .z (R0.z is PrimitiveID) */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		/* Indirect vertex index: spread the candidate offsets into an
		 * addressable GPR array and pick one with a relative read. */
		int treg[3];
		struct r600_bytecode_alu alu;
		/* NOTE(review): this r/i pair shadows the outer r declared above */
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		/* AR-relative reads index via ar_reg; copy the address value
		 * there when it lives in a different address slot */
		if (src->DimIndirect.Index > 0) {
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		/* copy the three per-vertex offsets into treg[0..2].x */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* relative read treg[AR].x -> t2.x selects the wanted offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		/* Indirect attribute index: ring offset += (addr + first) * 16
		 * bytes, i.e. MULADD(addr + first, 4 dwords, base offset). */
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		/* the remaining constant part of the index */
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1894
1895static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1896{
1897	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1898	unsigned i;
1899
1900	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1901		struct tgsi_full_src_register *src = &inst->Src[i];
1902
1903		if (src->Register.File == TGSI_FILE_INPUT) {
1904			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1905				/* primitive id is in R0.z */
1906				ctx->src[i].sel = 0;
1907				ctx->src[i].swizzle[0] = 2;
1908			}
1909		}
1910		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1911			int treg = r600_get_temp(ctx);
1912
1913			fetch_gs_input(ctx, src, treg);
1914			ctx->src[i].sel = treg;
1915			ctx->src[i].rel = 0;
1916		}
1917	}
1918	return 0;
1919}
1920
1921
1922/* Tessellation shaders pass outputs to the next shader using LDS.
1923 *
1924 * LS outputs = TCS(HS) inputs
1925 * TCS(HS) outputs = TES(DS) inputs
1926 *
1927 * The LDS layout is:
1928 * - TCS inputs for patch 0
1929 * - TCS inputs for patch 1
1930 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1931 * - ...
1932 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1933 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1934 * - TCS outputs for patch 1
1935 * - Per-patch TCS outputs for patch 1
1936 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1937 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1938 * - ...
1939 *
1940 * All three shaders VS(LS), TCS, TES share the same LDS space.
1941 */
/* Compute the LDS byte address of the register described by dst/src
 * (exactly one is non-NULL), accumulating into temp_reg.x.  On entry
 * temp_reg.x must already hold the base address of the relevant LDS
 * area; on return it holds base + vertex offset + parameter offset.
 *
 * stride_bytes_reg/stride_bytes_chan select the GPR component holding
 * the per-vertex stride in bytes, used when the register has a second
 * (vertex) dimension. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* Pick the semantic-name/index/array tables matching the register
	 * file so the LDS slot for the parameter can be looked up. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg: temp.x += rel_index * 16
		 * (each parameter slot is one 16-byte vec4) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
2044
/* Read the masked components of one vec4 from LDS into dst_reg.
 *
 * On entry temp_reg.x holds the byte address of component 0; component
 * i is read from that address + 4*i.  The reads are issued first via
 * LDS_READ_RET and the results are then popped from the LDS output
 * queue (LDS_OQ_A_POP) in the same order. */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* Force a fresh CF clause when the current ALU clause is close to
	 * the size limit, so the reads and pops below stay together
	 * (NOTE(review): assumed intent of the 0x60 threshold — confirm). */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	/* Compute per-component addresses: temp.i = temp.x + 4*i. */
	lasti = tgsi_last_instruction(mask);
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	/* Issue one LDS read per enabled component; results are queued. */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* Pop the queued results into dst_reg, one MOV per component. */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2104
2105static int fetch_mask(struct tgsi_src_register *reg)
2106{
2107	int mask = 0;
2108	mask |= 1 << reg->SwizzleX;
2109	mask |= 1 << reg->SwizzleY;
2110	mask |= 1 << reg->SwizzleZ;
2111	mask |= 1 << reg->SwizzleW;
2112	return mask;
2113}
2114
2115static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2116{
2117	int r;
2118	unsigned temp_reg = r600_get_temp(ctx);
2119
2120	r = get_lds_offset0(ctx, 2, temp_reg,
2121			    src->Register.Dimension ? false : true);
2122	if (r)
2123		return r;
2124
2125	/* the base address is now in temp.x */
2126	r = r600_get_byte_address(ctx, temp_reg,
2127				  NULL, src, ctx->tess_output_info, 1);
2128	if (r)
2129		return r;
2130
2131	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2132	if (r)
2133		return r;
2134	return 0;
2135}
2136
2137static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2138{
2139	int r;
2140	unsigned temp_reg = r600_get_temp(ctx);
2141
2142	/* t.x = ips * r0.y */
2143	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2144			   temp_reg, 0,
2145			   ctx->tess_input_info, 0,
2146			   0, 1);
2147
2148	if (r)
2149		return r;
2150
2151	/* the base address is now in temp.x */
2152	r = r600_get_byte_address(ctx, temp_reg,
2153				  NULL, src, ctx->tess_input_info, 1);
2154	if (r)
2155		return r;
2156
2157	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2158	if (r)
2159		return r;
2160	return 0;
2161}
2162
2163static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2164{
2165	int r;
2166	unsigned temp_reg = r600_get_temp(ctx);
2167
2168	r = get_lds_offset0(ctx, 1, temp_reg,
2169			    src->Register.Dimension ? false : true);
2170	if (r)
2171		return r;
2172	/* the base address is now in temp.x */
2173	r = r600_get_byte_address(ctx, temp_reg,
2174				  NULL, src,
2175				  ctx->tess_output_info, 1);
2176	if (r)
2177		return r;
2178
2179	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2180	if (r)
2181		return r;
2182	return 0;
2183}
2184
2185static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2186{
2187	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2188	unsigned i;
2189
2190	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2191		struct tgsi_full_src_register *src = &inst->Src[i];
2192
2193		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2194			int treg = r600_get_temp(ctx);
2195			fetch_tes_input(ctx, src, treg);
2196			ctx->src[i].sel = treg;
2197			ctx->src[i].rel = 0;
2198		}
2199		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2200			int treg = r600_get_temp(ctx);
2201			fetch_tcs_input(ctx, src, treg);
2202			ctx->src[i].sel = treg;
2203			ctx->src[i].rel = 0;
2204		}
2205		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2206			int treg = r600_get_temp(ctx);
2207			fetch_tcs_output(ctx, src, treg);
2208			ctx->src[i].sel = treg;
2209			ctx->src[i].rel = 0;
2210		}
2211	}
2212	return 0;
2213}
2214
/* Split constant-file sources so the instruction can be encoded.
 *
 * Initializes ctx->src[] from the TGSI sources, then copies all but one
 * constant into temp GPRs (j starts at nconst-1, so the final constant
 * may remain a direct kcache read).  Relative-addressed constants are
 * always fetched into a temp via tgsi_fetch_rel_const first. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* Count constant sources and translate all sources. */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* Relative addressing: fetch the addressed constant
			 * into a temp (sel - 512 converts back to the kcache
			 * constant index). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* Copy the whole vec4 into a temp with four MOVs. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
2269
2270/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2271static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2272{
2273	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2274	struct r600_bytecode_alu alu;
2275	int i, j, k, nliteral, r;
2276
2277	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2278		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2279			nliteral++;
2280		}
2281	}
2282	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2283		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2284			int treg = r600_get_temp(ctx);
2285			for (k = 0; k < 4; k++) {
2286				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2287				alu.op = ALU_OP1_MOV;
2288				alu.src[0].sel = ctx->src[i].sel;
2289				alu.src[0].chan = k;
2290				alu.src[0].value = ctx->src[i].value[k];
2291				alu.dst.sel = treg;
2292				alu.dst.chan = k;
2293				alu.dst.write = 1;
2294				if (k == 3)
2295					alu.last = 1;
2296				r = r600_bytecode_add_alu(ctx->bc, &alu);
2297				if (r)
2298					return r;
2299			}
2300			ctx->src[i].sel = treg;
2301			j--;
2302		}
2303	}
2304	return 0;
2305}
2306
2307static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2308{
2309	int i, r, count = ctx->shader->ninput;
2310
2311	for (i = 0; i < count; i++) {
2312		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2313			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2314			if (r)
2315				return r;
2316		}
2317	}
2318	return 0;
2319}
2320
/* Emit the MEM_STREAM CF instructions that write shader outputs to the
 * bound stream-output (transform feedback) buffers.
 *
 * stream selects which vertex stream to emit; -1 emits all outputs
 * regardless of stream (used when only stream 0 is active). */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	/* Only four stream-output buffers are addressable by the CF ops
	 * selected below. */
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* Shift the used components down so they start at X. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		/* Skip outputs belonging to other streams (stream == -1
		 * means emit everything). */
		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			/* Evergreen+: one CF op per (stream, buffer) pair;
			 * ops for stream N are the BUF op plus N*4. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* Pre-Evergreen: only stream 0; one CF op per buffer. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
					break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2442
2443static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2444{
2445	struct r600_bytecode_alu alu;
2446	unsigned reg;
2447
2448	if (!ctx->shader->vs_out_edgeflag)
2449		return;
2450
2451	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2452
2453	/* clamp(x, 0, 1) */
2454	memset(&alu, 0, sizeof(alu));
2455	alu.op = ALU_OP1_MOV;
2456	alu.src[0].sel = reg;
2457	alu.dst.sel = reg;
2458	alu.dst.write = 1;
2459	alu.dst.clamp = 1;
2460	alu.last = 1;
2461	r600_bytecode_add_alu(ctx->bc, &alu);
2462
2463	memset(&alu, 0, sizeof(alu));
2464	alu.op = ALU_OP1_FLT_TO_INT;
2465	alu.src[0].sel = reg;
2466	alu.dst.sel = reg;
2467	alu.dst.write = 1;
2468	alu.last = 1;
2469	r600_bytecode_add_alu(ctx->bc, &alu);
2470}
2471
/* Build the "GS copy shader": a small vertex shader that runs after a
 * geometry shader, reads the GS results back from the GSVS ring buffer
 * and turns them into normal position/parameter exports (plus stream
 * output if enabled).  The result is stored in gs->gs_copy_shader. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	/* NOTE(review): allocation failure returns 0 (success) with
	 * gs->gs_copy_shader left unset — presumably callers check that
	 * pointer; confirm before changing. */
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader exports the same outputs the GS produced. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff (strip the stream bits from the index) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 (the stream id lives in the top two bits) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring: one vec4 per output, landing
	 * in GPRs 1..ocnt */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* first free GPR after the fetched outputs */
	ctx.temp_reg = i + 1;
	/* Emit a predicated streamout block per active ring/stream,
	 * walking rings 3..0 so ring 0 (always emitted) comes last. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring — only run the block when the
		 * incoming vertex belongs to this stream */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only stream-0 outputs become rasterizer exports */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* position-class semantics go to POS exports 60..63
		 * (swizzle 7 = unused channel); everything else becomes a
		 * sequential PARAM export */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The hardware requires at least one POS export; emit a dummy one
	 * (all channels unused) if nothing produced a position. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* Likewise emit a dummy PARAM export if there were none. */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* Mark the final export of each class as EXPORT_DONE. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* Close the last predicated block (cf_jump was set on the final
	 * ring-loop iteration, which always runs for ring 0). */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2782
2783static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2784{
2785	if (ind) {
2786		struct r600_bytecode_alu alu;
2787		int r;
2788
2789		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2790		alu.op = ALU_OP2_ADD_INT;
2791		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2792		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2793		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2794		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2795		alu.dst.write = 1;
2796		alu.last = 1;
2797		r = r600_bytecode_add_alu(ctx->bc, &alu);
2798		if (r)
2799			return r;
2800	}
2801	return 0;
2802}
2803
/* Write the current vertex's outputs to the GS output ring (MEM_RING),
 * one vec4 per output.  For an ES (VS feeding a GS) the ring slots are
 * matched to the GS inputs by semantic name/sid; for a GS the outputs
 * are packed sequentially.  ind selects indirect addressing through the
 * per-stream export offset register. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS - skip it */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		/* POSITION only goes to stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* one MEM_RING CF op per stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			/* indirect: base in dwords, index from the per-stream
			 * export offset register */
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2874
2875
2876static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2877{
2878	int r;
2879	struct r600_bytecode_vtx vtx;
2880	int temp_val = ctx->temp_reg;
2881	/* need to store the TCS output somewhere */
2882	r = single_alu_op2(ctx, ALU_OP1_MOV,
2883			   temp_val, 0,
2884			   V_SQ_ALU_SRC_LITERAL, 0,
2885			   0, 0);
2886	if (r)
2887		return r;
2888
2889	/* used by VS/TCS */
2890	if (ctx->tess_input_info) {
2891		/* fetch tcs input values into resv space */
2892		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2893		vtx.op = FETCH_OP_VFETCH;
2894		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2895		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2896		vtx.mega_fetch_count = 16;
2897		vtx.data_format = FMT_32_32_32_32;
2898		vtx.num_format_all = 2;
2899		vtx.format_comp_all = 1;
2900		vtx.use_const_fields = 0;
2901		vtx.endian = r600_endian_swap(32);
2902		vtx.srf_mode_all = 1;
2903		vtx.offset = 0;
2904		vtx.dst_gpr = ctx->tess_input_info;
2905		vtx.dst_sel_x = 0;
2906		vtx.dst_sel_y = 1;
2907		vtx.dst_sel_z = 2;
2908		vtx.dst_sel_w = 3;
2909		vtx.src_gpr = temp_val;
2910		vtx.src_sel_x = 0;
2911
2912		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2913		if (r)
2914			return r;
2915	}
2916
2917	/* used by TCS/TES */
2918	if (ctx->tess_output_info) {
2919		/* fetch tcs output values into resv space */
2920		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2921		vtx.op = FETCH_OP_VFETCH;
2922		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2923		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2924		vtx.mega_fetch_count = 16;
2925		vtx.data_format = FMT_32_32_32_32;
2926		vtx.num_format_all = 2;
2927		vtx.format_comp_all = 1;
2928		vtx.use_const_fields = 0;
2929		vtx.endian = r600_endian_swap(32);
2930		vtx.srf_mode_all = 1;
2931		vtx.offset = 16;
2932		vtx.dst_gpr = ctx->tess_output_info;
2933		vtx.dst_sel_x = 0;
2934		vtx.dst_sel_y = 1;
2935		vtx.dst_sel_z = 2;
2936		vtx.dst_sel_w = 3;
2937		vtx.src_gpr = temp_val;
2938		vtx.src_sel_x = 0;
2939
2940		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2941		if (r)
2942			return r;
2943	}
2944	return 0;
2945}
2946
/* Emit the LDS writes that store all VS(LS) outputs so the TCS can read
 * them as its inputs (see the LDS layout comment above).  Each output
 * vec4 is stored as two LDS_WRITE_REL ops, each writing a pair of
 * dwords. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = vertex base + param slot offset (16 bytes/slot);
		 * skipped when the slot offset is 0 (temp.x is used as-is) */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the vec4's upper half (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* Two writes: j==0 stores channels x,y at the base address,
		 * j==1 stores channels z,w at base+8. */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3012
/* After a TCS instruction is emitted, redirect its TGSI_FILE_OUTPUT
 * destination into LDS: TCS outputs live in LDS (where the tess-factor
 * emission and the TES read them back), not in exported GPRs.
 * Returns 0 (also when the dst is not an output), negative errno on
 * emission failure. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	/* nothing to do unless the instruction writes an output */
	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* temp.x = base LDS offset; the flag is true for non-dimensioned
	 * dsts — presumably the per-patch path, TODO confirm against
	 * get_lds_offset0() */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	/* precompute per-channel byte addresses: temp.i = temp.x + 4*i.
	 * Channel 0 reuses temp.x itself, so the loop starts at i = 1. */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* when both channels of a pair (xy or zw) are written, emit a
		 * single LDS_WRITE_REL carrying both dwords (src[1], src[2])
		 * and skip the partner channel via the i += 1 below */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* single-channel case: plain LDS_WRITE of one dword */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3096
3097static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3098				 int output_idx, int nc)
3099{
3100	int param;
3101	unsigned temp_reg = r600_get_temp(ctx);
3102	unsigned name = ctx->shader->output[output_idx].name;
3103	int dreg = ctx->shader->output[output_idx].gpr;
3104	int r;
3105
3106	param = r600_get_lds_unique_index(name, 0);
3107	r = get_lds_offset0(ctx, 1, temp_reg, true);
3108	if (r)
3109		return r;
3110
3111	if (param) {
3112		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3113				   temp_reg, 0,
3114				   temp_reg, 0,
3115				   V_SQ_ALU_SRC_LITERAL, param * 16);
3116		if (r)
3117			return r;
3118	}
3119
3120	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3121	return 0;
3122}
3123
/* Emit the end-of-TCS tessellation-factor writes: read TESSOUTER (and
 * TESSINNER when the prim mode has inner factors) back from LDS, then
 * write (index, value) pairs to the TF buffer via GDS TF_WRITE ops.
 * The whole block is predicated so only one invocation executes it,
 * using a PUSH/JUMP ... POP region whose jump target is patched at the
 * end. Returns 0 on success, negative on error. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	/* each treg holds two (address, value) pairs in (x,y)/(z,w);
	 * more are allocated below as the factor count grows */
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.x, 0 */
	/* NOTE(review): the comment above says R0.x but src[0].chan is 2
	 * (R0.z); per the register layout comment below R0 is
	 * {InvocationID, RelPatchID, PatchID, tf_base} — confirm which
	 * channel carries the invocation ID. */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	/* NOTE(review): return values of add_alu_type/add_cfinst are
	 * ignored here, unlike other emission sites in this file */
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last; /* target patched after the POP below */

	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	/* locate the TESSINNER/TESSOUTER outputs among the shader outputs */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	/* outer factors are mandatory; inner only for tri/quad modes */
	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (address, value) pairs: outer comps first, then inner */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isoline mode stores the two outer factors swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		/* pair address: TF buffer base + 4 bytes per factor */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		/* pair value: copy the factor component next to its address */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* emit one GDS TF_WRITE per factor; dst_sel 7 masks all returns */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	/* both the skipping JUMP and the POP land just past the POP */
	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
3267
3268/*
3269 * We have to work out the thread ID for load and atomic
3270 * operations, which store the returned value to an index
3271 * in an intermediate buffer.
3272 * The index is calculated by taking the thread id,
3273 * calculated from the MBCNT instructions.
3274 * Then the shader engine ID is multiplied by 256,
3275 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
3277 * added.
3278 */
/* Compute the per-thread buffer index described in the comment block
 * above and leave it in thread_id_gpr.y:
 *   (SE_ID * 256 + HW_WAVE_ID) * 64 + lane_index
 * where lane_index comes from the MBCNT pair below.
 * Returns 0 on success, negative errno on emission failure. */
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* temp.x = MBCNT_32LO_ACCUM_PREV(~0): count of active lanes below
	 * this one in the low 32 lanes */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = MBCNT_32HI(~0): same for the high 32 lanes. Must
	 * immediately follow the LO op — ACCUM_PREV pairs with it. */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32HI_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = SE_ID * 256 + HW_WAVE_ID: globally unique wave id */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD_UINT24;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 2;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 256;
	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
	alu.dst.write = 1;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* thread_id_gpr.y = temp.z * 64 + temp.x (wave id scaled by the
	 * 64-lane wave size, plus the lane index) */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   ctx->thread_id_gpr, 1,
			   ctx->temp_reg, 2,
			   V_SQ_ALU_SRC_LITERAL, 0x40,
			   ctx->temp_reg, 0);
	if (r)
		return r;
	return 0;
}
3330
3331static int r600_shader_from_tgsi(struct r600_context *rctx,
3332				 struct r600_pipe_shader *pipeshader,
3333				 union r600_shader_key key)
3334{
3335	struct r600_screen *rscreen = rctx->screen;
3336	struct r600_shader *shader = &pipeshader->shader;
3337	struct tgsi_token *tokens = pipeshader->selector->tokens;
3338	struct pipe_stream_output_info so = pipeshader->selector->so;
3339	struct tgsi_full_immediate *immediate;
3340	struct r600_shader_ctx ctx;
3341	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3342	unsigned output_done, noutput;
3343	unsigned opcode;
3344	int j, k, r = 0;
3345	unsigned i;
3346	int next_param_base = 0, next_clip_base;
3347	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3348	bool indirect_gprs;
3349	bool ring_outputs = false;
3350	bool lds_outputs = false;
3351	bool lds_inputs = false;
3352	bool pos_emitted = false;
3353
3354	ctx.bc = &shader->bc;
3355	ctx.shader = shader;
3356
3357	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3358			   rscreen->has_compressed_msaa_texturing);
3359	ctx.tokens = tokens;
3360	tgsi_scan_shader(tokens, &ctx.info);
3361	shader->indirect_files = ctx.info.indirect_files;
3362
3363	int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3364	ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3365	ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3366	tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3367
3368	shader->uses_helper_invocation = false;
3369	shader->uses_doubles = ctx.info.uses_doubles;
3370	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3371	shader->nsys_inputs = 0;
3372
3373	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3374		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3375	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3376	tgsi_parse_init(&ctx.parse, tokens);
3377	ctx.type = ctx.info.processor;
3378	shader->processor_type = ctx.type;
3379	ctx.bc->type = shader->processor_type;
3380
3381	switch (ctx.type) {
3382	case PIPE_SHADER_VERTEX:
3383		shader->vs_as_gs_a = key.vs.as_gs_a;
3384		shader->vs_as_es = key.vs.as_es;
3385		shader->vs_as_ls = key.vs.as_ls;
3386		shader->atomic_base = key.vs.first_atomic_counter;
3387		if (shader->vs_as_es)
3388			ring_outputs = true;
3389		if (shader->vs_as_ls)
3390			lds_outputs = true;
3391		break;
3392	case PIPE_SHADER_GEOMETRY:
3393		ring_outputs = true;
3394		shader->atomic_base = key.gs.first_atomic_counter;
3395		shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3396		break;
3397	case PIPE_SHADER_TESS_CTRL:
3398		shader->tcs_prim_mode = key.tcs.prim_mode;
3399		shader->atomic_base = key.tcs.first_atomic_counter;
3400		lds_outputs = true;
3401		lds_inputs = true;
3402		break;
3403	case PIPE_SHADER_TESS_EVAL:
3404		shader->tes_as_es = key.tes.as_es;
3405		shader->atomic_base = key.tes.first_atomic_counter;
3406		lds_inputs = true;
3407		if (shader->tes_as_es)
3408			ring_outputs = true;
3409		break;
3410	case PIPE_SHADER_FRAGMENT:
3411		shader->two_side = key.ps.color_two_side;
3412		shader->atomic_base = key.ps.first_atomic_counter;
3413		shader->rat_base = key.ps.nr_cbufs;
3414		shader->image_size_const_offset = key.ps.image_size_const_offset;
3415		break;
3416	case PIPE_SHADER_COMPUTE:
3417		shader->rat_base = 0;
3418		shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3419		break;
3420	default:
3421		break;
3422	}
3423
3424	if (shader->vs_as_es || shader->tes_as_es) {
3425		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3426	} else {
3427		ctx.gs_for_vs = NULL;
3428	}
3429
3430	ctx.next_ring_offset = 0;
3431	ctx.gs_out_ring_offset = 0;
3432	ctx.gs_next_vertex = 0;
3433	ctx.gs_stream_output_info = &so;
3434
3435	ctx.thread_id_gpr = -1;
3436	ctx.face_gpr = -1;
3437	ctx.fixed_pt_position_gpr = -1;
3438	ctx.fragcoord_input = -1;
3439	ctx.colors_used = 0;
3440	ctx.clip_vertex_write = 0;
3441
3442	ctx.helper_invoc_reg = -1;
3443	ctx.cs_block_size_reg = -1;
3444	ctx.cs_grid_size_reg = -1;
3445	ctx.cs_block_size_loaded = false;
3446	ctx.cs_grid_size_loaded = false;
3447
3448	shader->nr_ps_color_exports = 0;
3449	shader->nr_ps_max_color_exports = 0;
3450
3451
3452	/* register allocations */
3453	/* Values [0,127] correspond to GPR[0..127].
3454	 * Values [128,159] correspond to constant buffer bank 0
3455	 * Values [160,191] correspond to constant buffer bank 1
3456	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3457	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3458	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3459	 * Other special values are shown in the list below.
3460	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3461	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3462	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3463	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3464	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3465	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3466	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3467	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3468	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3469	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3470	 * 254	SQ_ALU_SRC_PV: previous vector result.
3471	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3472	 */
3473	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3474		ctx.file_offset[i] = 0;
3475	}
3476
3477	if (ctx.type == PIPE_SHADER_VERTEX)  {
3478
3479		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3480		if (ctx.info.num_inputs)
3481			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3482	}
3483	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3484		if (ctx.bc->chip_class >= EVERGREEN)
3485			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3486		else
3487			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3488
3489		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3490			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3491				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3492				shader->uses_helper_invocation = true;
3493			}
3494		}
3495	}
3496	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3497		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3498		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3499	}
3500	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3501		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3502	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3503		bool add_tesscoord = false, add_tess_inout = false;
3504		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3505		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3506			/* if we have tesscoord save one reg */
3507			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3508				add_tesscoord = true;
3509			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3510			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3511				add_tess_inout = true;
3512		}
3513		if (add_tesscoord || add_tess_inout)
3514			ctx.file_offset[TGSI_FILE_INPUT]++;
3515		if (add_tess_inout)
3516			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3517	}
3518	if (ctx.type == PIPE_SHADER_COMPUTE) {
3519		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3520		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3521			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3522				ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3523			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3524				ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3525		}
3526	}
3527
3528	ctx.file_offset[TGSI_FILE_OUTPUT] =
3529			ctx.file_offset[TGSI_FILE_INPUT] +
3530			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3531	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3532						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3533
3534	/* Outside the GPR range. This will be translated to one of the
3535	 * kcache banks later. */
3536	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3537	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3538
3539	pipeshader->scratch_space_needed = 0;
3540	int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3541			ctx.info.file_max[TGSI_FILE_TEMPORARY];
3542	if (regno > 124) {
3543		choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3544		shader->indirect_files = ctx.info.indirect_files;
3545	}
3546	shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3547
3548	ctx.bc->ar_reg = ++regno;
3549	ctx.bc->index_reg[0] = ++regno;
3550	ctx.bc->index_reg[1] = ++regno;
3551
3552	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3553		ctx.tess_input_info = ++regno;
3554		ctx.tess_output_info = ++regno;
3555	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3556		ctx.tess_input_info = ++regno;
3557		ctx.tess_output_info = ++regno;
3558	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3559		ctx.gs_export_gpr_tregs[0] = ++regno;
3560		ctx.gs_export_gpr_tregs[1] = ++regno;
3561		ctx.gs_export_gpr_tregs[2] = ++regno;
3562		ctx.gs_export_gpr_tregs[3] = ++regno;
3563		if (ctx.shader->gs_tri_strip_adj_fix) {
3564			ctx.gs_rotated_input[0] = ++regno;
3565			ctx.gs_rotated_input[1] = ++regno;
3566		} else {
3567			ctx.gs_rotated_input[0] = 0;
3568			ctx.gs_rotated_input[1] = 1;
3569		}
3570	}
3571
3572	if (shader->uses_images) {
3573		ctx.thread_id_gpr = ++regno;
3574	}
3575	ctx.temp_reg = ++regno;
3576
3577	shader->max_arrays = 0;
3578	shader->num_arrays = 0;
3579	if (indirect_gprs) {
3580
3581		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3582			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3583			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3584			                   ctx.file_offset[TGSI_FILE_INPUT],
3585			                   0x0F);
3586		}
3587		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3588			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3589			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3590			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3591			                   0x0F);
3592		}
3593	}
3594
3595	ctx.nliterals = 0;
3596	ctx.literals = NULL;
3597	ctx.max_driver_temp_used = 0;
3598
3599	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3600			       ctx.info.colors_written == 1;
3601	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3602	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3603
3604	if (ctx.type == PIPE_SHADER_VERTEX ||
3605	    ctx.type == PIPE_SHADER_GEOMETRY ||
3606	    ctx.type == PIPE_SHADER_TESS_EVAL) {
3607		shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3608					      ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3609		shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3610		shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3611	}
3612
3613	if (shader->vs_as_gs_a)
3614		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3615
3616	if (ctx.thread_id_gpr != -1) {
3617		r = load_thread_id_gpr(&ctx);
3618		if (r)
3619			return r;
3620	}
3621
3622	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3623		r600_fetch_tess_io_info(&ctx);
3624
3625	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3626		tgsi_parse_token(&ctx.parse);
3627		switch (ctx.parse.FullToken.Token.Type) {
3628		case TGSI_TOKEN_TYPE_IMMEDIATE:
3629			immediate = &ctx.parse.FullToken.FullImmediate;
3630			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3631			if(ctx.literals == NULL) {
3632				r = -ENOMEM;
3633				goto out_err;
3634			}
3635			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3636			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3637			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3638			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3639			ctx.nliterals++;
3640			break;
3641		case TGSI_TOKEN_TYPE_DECLARATION:
3642			r = tgsi_declaration(&ctx);
3643			if (r)
3644				goto out_err;
3645			break;
3646		case TGSI_TOKEN_TYPE_INSTRUCTION:
3647		case TGSI_TOKEN_TYPE_PROPERTY:
3648			break;
3649		default:
3650			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3651			r = -EINVAL;
3652			goto out_err;
3653		}
3654	}
3655
3656	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3657	shader->ring_item_sizes[1] = 0;
3658	shader->ring_item_sizes[2] = 0;
3659	shader->ring_item_sizes[3] = 0;
3660
3661	/* Process two side if needed */
3662	if (shader->two_side && ctx.colors_used) {
3663		int i, count = ctx.shader->ninput;
3664		unsigned next_lds_loc = ctx.shader->nlds;
3665
3666		/* additional inputs will be allocated right after the existing inputs,
3667		 * we won't need them after the color selection, so we don't need to
3668		 * reserve these gprs for the rest of the shader code and to adjust
3669		 * output offsets etc. */
3670		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3671				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3672
3673		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3674		if (ctx.face_gpr == -1) {
3675			i = ctx.shader->ninput++;
3676			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3677			ctx.shader->input[i].spi_sid = 0;
3678			ctx.shader->input[i].gpr = gpr++;
3679			ctx.face_gpr = ctx.shader->input[i].gpr;
3680		}
3681
3682		for (i = 0; i < count; i++) {
3683			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3684				int ni = ctx.shader->ninput++;
3685				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3686				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3687				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3688				ctx.shader->input[ni].gpr = gpr++;
3689				// TGSI to LLVM needs to know the lds position of inputs.
3690				// Non LLVM path computes it later (in process_twoside_color)
3691				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3692				ctx.shader->input[i].back_color_input = ni;
3693				if (ctx.bc->chip_class >= EVERGREEN) {
3694					if ((r = evergreen_interp_input(&ctx, ni)))
3695						return r;
3696				}
3697			}
3698		}
3699	}
3700
3701	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3702		shader->nr_ps_max_color_exports = 8;
3703
3704	if (ctx.shader->uses_helper_invocation) {
3705		if (ctx.bc->chip_class == CAYMAN)
3706			r = cm_load_helper_invocation(&ctx);
3707		else
3708			r = eg_load_helper_invocation(&ctx);
3709		if (r)
3710			return r;
3711	}
3712
3713	/*
3714	 * XXX this relies on fixed_pt_position_gpr only being present when
3715	 * this shader should be executed per sample. Should be the case for now...
3716	 */
3717	if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3718		/*
3719		 * Fix up sample mask. The hw always gives us coverage mask for
3720		 * the pixel. However, for per-sample shading, we need the
3721		 * coverage for the shader invocation only.
3722		 * Also, with disabled msaa, only the first bit should be set
3723		 * (luckily the same fixup works for both problems).
3724		 * For now, we can only do it if we know this shader is always
3725		 * executed per sample (due to usage of bits in the shader
3726		 * forcing per-sample execution).
3727		 * If the fb is not multisampled, we'd do unnecessary work but
3728		 * it should still be correct.
3729		 * It will however do nothing for sample shading according
3730		 * to MinSampleShading.
3731		 */
3732		struct r600_bytecode_alu alu;
3733		int tmp = r600_get_temp(&ctx);
3734		assert(ctx.face_gpr != -1);
3735		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3736
3737		alu.op = ALU_OP2_LSHL_INT;
3738		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3739		alu.src[0].value = 0x1;
3740		alu.src[1].sel = ctx.fixed_pt_position_gpr;
3741		alu.src[1].chan = 3;
3742		alu.dst.sel = tmp;
3743		alu.dst.chan = 0;
3744		alu.dst.write = 1;
3745		alu.last = 1;
3746		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3747			return r;
3748
3749		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3750		alu.op = ALU_OP2_AND_INT;
3751		alu.src[0].sel = tmp;
3752		alu.src[1].sel = ctx.face_gpr;
3753		alu.src[1].chan = 2;
3754		alu.dst.sel = ctx.face_gpr;
3755		alu.dst.chan = 2;
3756		alu.dst.write = 1;
3757		alu.last = 1;
3758		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3759			return r;
3760	}
3761
3762	if (ctx.fragcoord_input >= 0) {
3763		if (ctx.bc->chip_class == CAYMAN) {
3764			for (j = 0 ; j < 4; j++) {
3765				struct r600_bytecode_alu alu;
3766				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3767				alu.op = ALU_OP1_RECIP_IEEE;
3768				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3769				alu.src[0].chan = 3;
3770
3771				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3772				alu.dst.chan = j;
3773				alu.dst.write = (j == 3);
3774				alu.last = (j == 3);
3775				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3776					return r;
3777			}
3778		} else {
3779			struct r600_bytecode_alu alu;
3780			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3781			alu.op = ALU_OP1_RECIP_IEEE;
3782			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3783			alu.src[0].chan = 3;
3784
3785			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3786			alu.dst.chan = 3;
3787			alu.dst.write = 1;
3788			alu.last = 1;
3789			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3790				return r;
3791		}
3792	}
3793
3794	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3795		struct r600_bytecode_alu alu;
3796		int r;
3797
3798		/* GS thread with no output workaround - emit a cut at start of GS */
3799		if (ctx.bc->chip_class == R600)
3800			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3801
3802		for (j = 0; j < 4; j++) {
3803			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3804			alu.op = ALU_OP1_MOV;
3805			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3806			alu.src[0].value = 0;
3807			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3808			alu.dst.write = 1;
3809			alu.last = 1;
3810			r = r600_bytecode_add_alu(ctx.bc, &alu);
3811			if (r)
3812				return r;
3813		}
3814
3815		if (ctx.shader->gs_tri_strip_adj_fix) {
3816			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3817					   ctx.gs_rotated_input[0], 2,
3818					   0, 2,
3819					   V_SQ_ALU_SRC_LITERAL, 1);
3820			if (r)
3821				return r;
3822
3823			for (i = 0; i < 6; i++) {
3824				int rotated = (i + 4) % 6;
3825				int offset_reg = i / 3;
3826				int offset_chan = i % 3;
3827				int rotated_offset_reg = rotated / 3;
3828				int rotated_offset_chan = rotated % 3;
3829
3830				if (offset_reg == 0 && offset_chan == 2)
3831					offset_chan = 3;
3832				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3833					rotated_offset_chan = 3;
3834
3835				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3836						   ctx.gs_rotated_input[offset_reg], offset_chan,
3837						   ctx.gs_rotated_input[0], 2,
3838						   offset_reg, offset_chan,
3839						   rotated_offset_reg, rotated_offset_chan);
3840				if (r)
3841					return r;
3842			}
3843		}
3844	}
3845
3846	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3847		r600_fetch_tess_io_info(&ctx);
3848
3849	if (shader->two_side && ctx.colors_used) {
3850		if ((r = process_twoside_color_inputs(&ctx)))
3851			return r;
3852	}
3853
3854	tgsi_parse_init(&ctx.parse, tokens);
3855	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3856		tgsi_parse_token(&ctx.parse);
3857		switch (ctx.parse.FullToken.Token.Type) {
3858		case TGSI_TOKEN_TYPE_INSTRUCTION:
3859			r = tgsi_is_supported(&ctx);
3860			if (r)
3861				goto out_err;
3862			ctx.max_driver_temp_used = 0;
3863			/* reserve first tmp for everyone */
3864			r600_get_temp(&ctx);
3865
3866			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3867			if ((r = tgsi_split_constant(&ctx)))
3868				goto out_err;
3869			if ((r = tgsi_split_literal_constant(&ctx)))
3870				goto out_err;
3871			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3872				if ((r = tgsi_split_gs_inputs(&ctx)))
3873					goto out_err;
3874			} else if (lds_inputs) {
3875				if ((r = tgsi_split_lds_inputs(&ctx)))
3876					goto out_err;
3877			}
3878			if (ctx.bc->chip_class == CAYMAN)
3879				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3880			else if (ctx.bc->chip_class >= EVERGREEN)
3881				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3882			else
3883				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3884
3885			ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3886
3887			r = ctx.inst_info->process(&ctx);
3888			if (r)
3889				goto out_err;
3890
3891			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3892				r = r600_store_tcs_output(&ctx);
3893				if (r)
3894					goto out_err;
3895			}
3896			break;
3897		default:
3898			break;
3899		}
3900	}
3901
3902	/* Reset the temporary register counter. */
3903	ctx.max_driver_temp_used = 0;
3904
3905	noutput = shader->noutput;
3906
3907	if (!ring_outputs && ctx.clip_vertex_write) {
3908		unsigned clipdist_temp[2];
3909
3910		clipdist_temp[0] = r600_get_temp(&ctx);
3911		clipdist_temp[1] = r600_get_temp(&ctx);
3912
3913		/* need to convert a clipvertex write into clipdistance writes and not export
3914		   the clip vertex anymore */
3915
3916		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3917		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3918		shader->output[noutput].gpr = clipdist_temp[0];
3919		noutput++;
3920		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3921		shader->output[noutput].gpr = clipdist_temp[1];
3922		noutput++;
3923
3924		/* reset spi_sid for clipvertex output to avoid confusing spi */
3925		shader->output[ctx.cv_output].spi_sid = 0;
3926
3927		shader->clip_dist_write = 0xFF;
3928		shader->cc_dist_mask = 0xFF;
3929
3930		for (i = 0; i < 8; i++) {
3931			int oreg = i >> 2;
3932			int ochan = i & 3;
3933
3934			for (j = 0; j < 4; j++) {
3935				struct r600_bytecode_alu alu;
3936				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3937				alu.op = ALU_OP2_DOT4;
3938				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3939				alu.src[0].chan = j;
3940
3941				alu.src[1].sel = 512 + i;
3942				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3943				alu.src[1].chan = j;
3944
3945				alu.dst.sel = clipdist_temp[oreg];
3946				alu.dst.chan = j;
3947				alu.dst.write = (j == ochan);
3948				if (j == 3)
3949					alu.last = 1;
3950				r = r600_bytecode_add_alu(ctx.bc, &alu);
3951				if (r)
3952					return r;
3953			}
3954		}
3955	}
3956
3957	/* Add stream outputs. */
3958	if (so.num_outputs) {
3959		bool emit = false;
3960		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3961			emit = true;
3962		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3963			emit = true;
3964		if (emit)
3965			emit_streamout(&ctx, &so, -1, NULL);
3966	}
3967	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3968	convert_edgeflag_to_int(&ctx);
3969
3970	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3971		r600_emit_tess_factor(&ctx);
3972
3973	if (lds_outputs) {
3974		if (ctx.type == PIPE_SHADER_VERTEX) {
3975			if (ctx.shader->noutput)
3976				emit_lds_vs_writes(&ctx);
3977		}
3978	} else if (ring_outputs) {
3979		if (shader->vs_as_es || shader->tes_as_es) {
3980			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3981			ctx.gs_export_gpr_tregs[1] = -1;
3982			ctx.gs_export_gpr_tregs[2] = -1;
3983			ctx.gs_export_gpr_tregs[3] = -1;
3984
3985			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3986		}
3987	} else {
3988		/* Export output */
3989		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3990
3991		for (i = 0, j = 0; i < noutput; i++, j++) {
3992			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3993			output[j].gpr = shader->output[i].gpr;
3994			output[j].elem_size = 3;
3995			output[j].swizzle_x = 0;
3996			output[j].swizzle_y = 1;
3997			output[j].swizzle_z = 2;
3998			output[j].swizzle_w = 3;
3999			output[j].burst_count = 1;
4000			output[j].type = 0xffffffff;
4001			output[j].op = CF_OP_EXPORT;
4002			switch (ctx.type) {
4003			case PIPE_SHADER_VERTEX:
4004			case PIPE_SHADER_TESS_EVAL:
4005				switch (shader->output[i].name) {
4006				case TGSI_SEMANTIC_POSITION:
4007					output[j].array_base = 60;
4008					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4009					pos_emitted = true;
4010					break;
4011
4012				case TGSI_SEMANTIC_PSIZE:
4013					output[j].array_base = 61;
4014					output[j].swizzle_y = 7;
4015					output[j].swizzle_z = 7;
4016					output[j].swizzle_w = 7;
4017					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4018					pos_emitted = true;
4019					break;
4020				case TGSI_SEMANTIC_EDGEFLAG:
4021					output[j].array_base = 61;
4022					output[j].swizzle_x = 7;
4023					output[j].swizzle_y = 0;
4024					output[j].swizzle_z = 7;
4025					output[j].swizzle_w = 7;
4026					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4027					pos_emitted = true;
4028					break;
4029				case TGSI_SEMANTIC_LAYER:
4030					/* spi_sid is 0 for outputs that are
4031					 * not consumed by PS */
4032					if (shader->output[i].spi_sid) {
4033						output[j].array_base = next_param_base++;
4034						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4035						j++;
4036						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4037					}
4038					output[j].array_base = 61;
4039					output[j].swizzle_x = 7;
4040					output[j].swizzle_y = 7;
4041					output[j].swizzle_z = 0;
4042					output[j].swizzle_w = 7;
4043					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4044					pos_emitted = true;
4045					break;
4046				case TGSI_SEMANTIC_VIEWPORT_INDEX:
4047					/* spi_sid is 0 for outputs that are
4048					 * not consumed by PS */
4049					if (shader->output[i].spi_sid) {
4050						output[j].array_base = next_param_base++;
4051						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4052						j++;
4053						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4054					}
4055					output[j].array_base = 61;
4056					output[j].swizzle_x = 7;
4057					output[j].swizzle_y = 7;
4058					output[j].swizzle_z = 7;
4059					output[j].swizzle_w = 0;
4060					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4061					pos_emitted = true;
4062					break;
4063				case TGSI_SEMANTIC_CLIPVERTEX:
4064					j--;
4065					break;
4066				case TGSI_SEMANTIC_CLIPDIST:
4067					output[j].array_base = next_clip_base++;
4068					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4069					pos_emitted = true;
4070					/* spi_sid is 0 for clipdistance outputs that were generated
4071					 * for clipvertex - we don't need to pass them to PS */
4072					if (shader->output[i].spi_sid) {
4073						j++;
4074						/* duplicate it as PARAM to pass to the pixel shader */
4075						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4076						output[j].array_base = next_param_base++;
4077						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4078					}
4079					break;
4080				case TGSI_SEMANTIC_FOG:
4081					output[j].swizzle_y = 4; /* 0 */
4082					output[j].swizzle_z = 4; /* 0 */
4083					output[j].swizzle_w = 5; /* 1 */
4084					break;
4085				case TGSI_SEMANTIC_PRIMID:
4086					output[j].swizzle_x = 2;
4087					output[j].swizzle_y = 4; /* 0 */
4088					output[j].swizzle_z = 4; /* 0 */
4089					output[j].swizzle_w = 4; /* 0 */
4090					break;
4091				}
4092
4093				break;
4094			case PIPE_SHADER_FRAGMENT:
4095				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4096					/* never export more colors than the number of CBs */
4097					if (shader->output[i].sid >= max_color_exports) {
4098						/* skip export */
4099						j--;
4100						continue;
4101					}
4102					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4103					output[j].array_base = shader->output[i].sid;
4104					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4105					shader->nr_ps_color_exports++;
4106					shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4107
4108					/* If the i-th target format is set, all previous target formats must
4109					 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4110					 */
4111					if (shader->output[i].sid > 0)
4112						for (unsigned x = 0; x < shader->output[i].sid; x++)
4113							shader->ps_color_export_mask |= (1 << (x*4));
4114
4115					if (shader->output[i].sid > shader->ps_export_highest)
4116						shader->ps_export_highest = shader->output[i].sid;
4117					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4118						for (k = 1; k < max_color_exports; k++) {
4119							j++;
4120							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4121							output[j].gpr = shader->output[i].gpr;
4122							output[j].elem_size = 3;
4123							output[j].swizzle_x = 0;
4124							output[j].swizzle_y = 1;
4125							output[j].swizzle_z = 2;
4126							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4127							output[j].burst_count = 1;
4128							output[j].array_base = k;
4129							output[j].op = CF_OP_EXPORT;
4130							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4131							shader->nr_ps_color_exports++;
4132							if (k > shader->ps_export_highest)
4133								shader->ps_export_highest = k;
4134							shader->ps_color_export_mask |= (0xf << (j * 4));
4135						}
4136					}
4137				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4138					output[j].array_base = 61;
4139					output[j].swizzle_x = 2;
4140					output[j].swizzle_y = 7;
4141					output[j].swizzle_z = output[j].swizzle_w = 7;
4142					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4143				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4144					output[j].array_base = 61;
4145					output[j].swizzle_x = 7;
4146					output[j].swizzle_y = 1;
4147					output[j].swizzle_z = output[j].swizzle_w = 7;
4148					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4149				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4150					output[j].array_base = 61;
4151					output[j].swizzle_x = 7;
4152					output[j].swizzle_y = 7;
4153					output[j].swizzle_z = 0;
4154					output[j].swizzle_w = 7;
4155					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4156				} else {
4157					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4158					r = -EINVAL;
4159					goto out_err;
4160				}
4161				break;
4162			case PIPE_SHADER_TESS_CTRL:
4163				break;
4164			default:
4165				R600_ERR("unsupported processor type %d\n", ctx.type);
4166				r = -EINVAL;
4167				goto out_err;
4168			}
4169
4170			if (output[j].type == 0xffffffff) {
4171				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4172				output[j].array_base = next_param_base++;
4173			}
4174		}
4175
4176		/* add fake position export */
4177		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4178			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4179			output[j].gpr = 0;
4180			output[j].elem_size = 3;
4181			output[j].swizzle_x = 7;
4182			output[j].swizzle_y = 7;
4183			output[j].swizzle_z = 7;
4184			output[j].swizzle_w = 7;
4185			output[j].burst_count = 1;
4186			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4187			output[j].array_base = 60;
4188			output[j].op = CF_OP_EXPORT;
4189			j++;
4190		}
4191
4192		/* add fake param output for vertex shader if no param is exported */
4193		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4194			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4195			output[j].gpr = 0;
4196			output[j].elem_size = 3;
4197			output[j].swizzle_x = 7;
4198			output[j].swizzle_y = 7;
4199			output[j].swizzle_z = 7;
4200			output[j].swizzle_w = 7;
4201			output[j].burst_count = 1;
4202			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4203			output[j].array_base = 0;
4204			output[j].op = CF_OP_EXPORT;
4205			j++;
4206		}
4207
4208		/* add fake pixel export */
4209		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4210			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4211			output[j].gpr = 0;
4212			output[j].elem_size = 3;
4213			output[j].swizzle_x = 7;
4214			output[j].swizzle_y = 7;
4215			output[j].swizzle_z = 7;
4216			output[j].swizzle_w = 7;
4217			output[j].burst_count = 1;
4218			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4219			output[j].array_base = 0;
4220			output[j].op = CF_OP_EXPORT;
4221			j++;
4222			shader->nr_ps_color_exports++;
4223			shader->ps_color_export_mask = 0xf;
4224		}
4225
4226		noutput = j;
4227
4228		/* set export done on last export of each type */
4229		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4230			if (!(output_done & (1 << output[k].type))) {
4231				output_done |= (1 << output[k].type);
4232				output[k].op = CF_OP_EXPORT_DONE;
4233			}
4234		}
4235		/* add output to bytecode */
4236		for (i = 0; i < noutput; i++) {
4237			r = r600_bytecode_add_output(ctx.bc, &output[i]);
4238			if (r)
4239				goto out_err;
4240		}
4241	}
4242
4243	/* add program end */
4244	if (ctx.bc->chip_class == CAYMAN)
4245		cm_bytecode_add_cf_end(ctx.bc);
4246	else {
4247		const struct cf_op_info *last = NULL;
4248
4249		if (ctx.bc->cf_last)
4250			last = r600_isa_cf(ctx.bc->cf_last->op);
4251
4252		/* alu clause instructions don't have EOP bit, so add NOP */
4253		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4254			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4255
4256		ctx.bc->cf_last->end_of_program = 1;
4257	}
4258
4259	/* check GPR limit - we have 124 = 128 - 4
4260	 * (4 are reserved as alu clause temporary registers) */
4261	if (ctx.bc->ngpr > 124) {
4262		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4263		r = -ENOMEM;
4264		goto out_err;
4265	}
4266
4267	if (ctx.type == PIPE_SHADER_GEOMETRY) {
4268		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4269			return r;
4270	}
4271
4272	free(ctx.spilled_arrays);
4273	free(ctx.array_infos);
4274	free(ctx.literals);
4275	tgsi_parse_free(&ctx.parse);
4276	return 0;
4277out_err:
4278	free(ctx.spilled_arrays);
4279	free(ctx.array_infos);
4280	free(ctx.literals);
4281	tgsi_parse_free(&ctx.parse);
4282	return r;
4283}
4284
4285static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4286{
4287	const unsigned tgsi_opcode =
4288		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4289	R600_ERR("%s tgsi opcode unsupported\n",
4290		 tgsi_get_opcode_name(tgsi_opcode));
4291	return -EINVAL;
4292}
4293
/* Handler for the TGSI END token: nothing to emit here; exports and the
 * end-of-program marker are added by the caller after parsing finishes. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
4298
4299static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4300			const struct r600_shader_src *shader_src,
4301			unsigned chan)
4302{
4303	bc_src->sel = shader_src->sel;
4304	bc_src->chan = shader_src->swizzle[chan];
4305	bc_src->neg = shader_src->neg;
4306	bc_src->abs = shader_src->abs;
4307	bc_src->rel = shader_src->rel;
4308	bc_src->value = shader_src->value[bc_src->chan];
4309	bc_src->kc_bank = shader_src->kc_bank;
4310	bc_src->kc_rel = shader_src->kc_rel;
4311}
4312
/* Force the source to be read as an absolute value; any pending negate is
 * cleared since abs and neg would otherwise combine. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
4318
/* Flip the negate modifier on the source (negating an already-negated
 * source cancels out). */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}
4323
/* Set up an ALU destination from a TGSI destination register.
 *
 * TGSI temporaries are remapped through the shader's register map.  If the
 * temporary belongs to an array that was spilled, the write is redirected
 * to a scratch GPR and a pending CF_OP_MEM_SCRATCH output is queued so the
 * value is stored to scratch memory after the instruction group.
 * NOTE(review): errors from the spill path cannot be propagated (void
 * return) - r600_bytecode_add_pending_output failures are silently dropped.
 */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);

		if (spilled) {
			struct r600_bytecode_output cf;
			int reg = 0;
			int r;
			bool add_pending_output = true;

			memset(&cf, 0, sizeof(struct r600_bytecode_output));
			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
				&cf.array_base, &cf.array_size);

			/* If no component has spilled, reserve a register and add the spill code
			 *  ctx->bc->n_pending_outputs is cleared after each instruction group */
			if (ctx->bc->n_pending_outputs == 0) {
				reg = r600_get_temp(ctx);
			} else {
				/* If we are already spilling and the output address is the same like
				* before then just reuse the same slot */
				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
				if ((cf.array_base + idx == tmpl->array_base) ||
				    (cf.array_base == tmpl->array_base &&
				     tmpl->index_gpr == ctx->bc->ar_reg &&
				     tgsi_dst->Register.Indirect)) {
					reg = ctx->bc->pending_outputs[0].gpr;
					add_pending_output = false;
				} else {
					reg = r600_get_temp(ctx);
				}
			}

			/* the instruction itself writes to the staging GPR */
			r600_dst->sel = reg;
			r600_dst->chan = swizzle;
			r600_dst->write = 1;
			if (inst->Instruction.Saturate) {
				r600_dst->clamp = 1;
			}

			/* Add new outputs as pending */
			if (add_pending_output) {
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
				cf.mark = 1;
				cf.comp_mask = inst->Dst[0].Register.WriteMask;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				/* indirect stores address scratch through the AR register */
				if (tgsi_dst->Register.Indirect) {
					if (ctx->bc->chip_class < R700)
						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
					else
						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
					cf.index_gpr = ctx->bc->ar_reg;
			}
			/* NOTE(review): indentation is misleading from here down -
			 * this else pairs with the Indirect check just above and the
			 * closing brace at the end of this run belongs to
			 * `if (add_pending_output)`; braces are balanced. */
			else {
				cf.array_base += idx;
				cf.array_size = 0;
			}

			r = r600_bytecode_add_pending_output(ctx->bc, &cf);
			if (r)
				return;

			/* R700+ uses the ACK store variant, so a wait is required */
			if (ctx->bc->chip_class >= R700)
				r600_bytecode_need_wait_ack(ctx->bc, true);
			}
			return;
		}
		else {
			r600_dst->sel = idx;
		}
	}
	else {
		/* non-temporary files: apply the per-file GPR offset */
		r600_dst->sel = tgsi_dst->Register.Index;
		r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	}
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			/* TCS outputs are stored separately (see r600_store_tcs_output);
			 * skip the relative-addressing flag for them */
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}
4430
/* Shared emitter for two-source ALU ops on 64-bit (double) values.
 *
 * Doubles occupy channel pairs (xy and/or zw).  When @singledest is set
 * the TGSI writemask names a single logical channel; it is remapped below
 * to the channel pair that actually holds the double, and @use_tmp (chan+1
 * of the interesting temp channel) stages the result in temp_reg when it
 * must be copied into a different destination channel afterwards.
 * @swap reverses the two source operands, @dest_temp redirects the final
 * result into that GPR instead of Dst[0], and a non-zero @op_override
 * replaces ctx->inst_info->op.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		/* map the single-channel mask to the pair holding the source
		 * double (selected by the X swizzle); use_tmp - 1 is the temp
		 * channel the copy-out loop below will read from */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* single result per pair: only the even channel is written */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS reads the source directly; abs is applied below on
			 * the high dword of each pair */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch: presumably swaps the low/high dword
				 * within each 64-bit pair - TODO confirm vs ISA doc */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* copy the staged result from temp_reg into the channels named
		 * by the original (unremapped) writemask */
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4557
4558static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4559{
4560	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4561	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4562	/* confirm writemasking */
4563	if ((write_mask & 0x3) != 0x3 &&
4564	    (write_mask & 0xc) != 0xc) {
4565		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4566		return -1;
4567	}
4568	return tgsi_op2_64_params(ctx, false, false, 0, 0);
4569}
4570
/* 64-bit op producing a single result per double pair, operands in
 * instruction order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4575
/* 64-bit op producing a single result per double pair, with the two
 * source operands swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4580
/* Emit a three-source 64-bit op.  All four slots are issued; sources read
 * swizzle channel 1 for slots 0-2 and channel 0 for slot 3
 * (NOTE(review): presumably the dword ordering the hardware expects for
 * 64-bit op3 - confirm against the ISA doc).  Channels outside the
 * writemask are redirected to a throwaway temp so the slot still issues.
 */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp; /* dst.write stays 0 for masked channels */

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4613
/* Generic two-source ALU op emitter.
 *
 * @swap reverses the operand order (for TGSI ops whose operand order is
 * opposite to the hardware op).  @trans_only marks ops restricted to the
 * trans slot: each such instruction closes its own group (alu.last set on
 * every one), and when more than one destination channel is written the
 * per-channel results are staged in temp_reg and copied to the real
 * destination in a second pass.
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* demote MUL_IEEE to MUL when the shader requests 0 * anything == 0
	 * semantics (TGSI_PROPERTY_MUL_ZERO_WINS) */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4677
/* Plain two-source op: operands in instruction order, any ALU slot. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4682
/* Two-source op with the operand order reversed. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4687
/* Two-source op restricted to the trans slot (one instruction group per
 * channel). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4692
4693static int tgsi_ineg(struct r600_shader_ctx *ctx)
4694{
4695	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4696	struct r600_bytecode_alu alu;
4697	int i, r;
4698	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4699
4700	for (i = 0; i < lasti + 1; i++) {
4701
4702		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4703			continue;
4704		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4705		alu.op = ctx->inst_info->op;
4706
4707		alu.src[0].sel = V_SQ_ALU_SRC_0;
4708
4709		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4710
4711		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4712
4713		if (i == lasti) {
4714			alu.last = 1;
4715		}
4716		r = r600_bytecode_add_alu(ctx->bc, &alu);
4717		if (r)
4718			return r;
4719	}
4720	return 0;
4721
4722}
4723
4724static int tgsi_dneg(struct r600_shader_ctx *ctx)
4725{
4726	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4727	struct r600_bytecode_alu alu;
4728	int i, r;
4729	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4730
4731	for (i = 0; i < lasti + 1; i++) {
4732
4733		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4734			continue;
4735		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4736		alu.op = ALU_OP1_MOV;
4737
4738		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4739
4740		if (i == 1 || i == 3)
4741			r600_bytecode_src_toggle_neg(&alu.src[0]);
4742		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4743
4744		if (i == lasti) {
4745			alu.last = 1;
4746		}
4747		r = r600_bytecode_add_alu(ctx->bc, &alu);
4748		if (r)
4749			return r;
4750	}
4751	return 0;
4752
4753}
4754
/* DFRACEXP: decompose a double into significand (Dst[0]) and exponent
 * (Dst[1]).
 *
 * Pass 1 issues the op across all four slots into temp_reg; pass 2
 * broadcasts the significand (temp channels 2/3, dword picked by
 * (i & 1) + 2) into every enabled Dst[0] channel; pass 3 moves the
 * exponent (temp channel 1) into the first enabled Dst[1] channel.
 */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* pass 1: run the op over all four slots into temp_reg */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first enabled dst1 channel receives the exponent */
			break;
		}
	}
	return 0;
}
4817
4818
/* I2D/U2D for Evergreen/Cayman: convert a 32-bit (un)signed integer to a
 * double.  Each source channel c expands to destination pair (2c, 2c+1).
 *
 * The integer is split into a high-24-bit part and a low-8-bit part
 * (NOTE(review): presumably because the int-to-float conversion is only
 * exact within float precision - confirm), each part is converted to
 * float, widened to double with FLT32_TO_FLT64, and the two doubles are
 * summed with ADD_64 to reconstruct the exact value.
 */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* split into 24-bit int and 8-bit int */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00; /* upper 24 bits */
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff; /* low 8 bits */
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* convert both parts to float in place; the low-8-bit part is always
	 * non-negative, so UINT_TO_FLT suffices for it */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->chip_class == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* widen each float to a double (zero low dword supplied via literal),
	 * then sum the two doubles into the destination pair */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return 0;
}
4930
/* D2I/D2U for Evergreen/Cayman: first narrow each double to a 32-bit
 * float in a temp (FLT64_TO_FLT32 through the 64-bit helper, singledest),
 * then convert each enabled channel with the instruction's
 * float-to-integer op. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
		inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4964
4965static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
4966					unsigned op,
4967					int dst_reg,
4968					struct r600_shader_src *src,
4969					bool abs)
4970{
4971	struct r600_bytecode_alu alu;
4972	const int last_slot = 3;
4973	int r;
4974
4975	/* these have to write the result to X/Y by the looks of it */
4976	for (int i = 0 ; i < last_slot; i++) {
4977		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4978		alu.op = op;
4979
4980		r600_bytecode_src(&alu.src[0], src, 1);
4981		r600_bytecode_src(&alu.src[1], src, 0);
4982
4983		if (abs)
4984			r600_bytecode_src_set_abs(&alu.src[1]);
4985
4986		alu.dst.sel = dst_reg;
4987		alu.dst.chan = i;
4988		alu.dst.write = (i == 0 || i == 1);
4989
4990		if (bc->chip_class != CAYMAN || i == last_slot - 1)
4991			alu.last = 1;
4992		r = r600_bytecode_add_alu(bc, &alu);
4993		if (r)
4994			return r;
4995	}
4996
4997	return 0;
4998}
4999
/* Emit a double-precision unary op on Cayman: compute the 64-bit result
 * into temp_reg.xy via cayman_emit_unary_double_raw(), then move it into
 * the destination channel pair selected by the write mask. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* DRSQ/DSQRT take |src|; the other opcodes pass the source as-is */
	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* copy t1.x into dst.x/z and t1.y into dst.y/w for the enabled pair */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5040
5041static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5042{
5043	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5044	int i, j, r;
5045	struct r600_bytecode_alu alu;
5046	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5047
5048	for (i = 0 ; i < last_slot; i++) {
5049		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5050		alu.op = ctx->inst_info->op;
5051		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5052			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5053
5054			/* RSQ should take the absolute value of src */
5055			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5056				r600_bytecode_src_set_abs(&alu.src[j]);
5057			}
5058		}
5059		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5060		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5061
5062		if (i == last_slot - 1)
5063			alu.last = 1;
5064		r = r600_bytecode_add_alu(ctx->bc, &alu);
5065		if (r)
5066			return r;
5067	}
5068	return 0;
5069}
5070
/* Integer multiply lowering for Cayman: the op must be issued in all four
 * vector slots per result channel, with only the matching slot writing.
 * Results are accumulated in a temp first and then moved to the real
 * destination (which may alias a source). */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* for each enabled channel k, replicate over slots 0..3; only the
	 * slot with i == k writes its lane of t1 */
	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* move the accumulated results from t1 to the destination register */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5118
5119
/* Double-precision multiply on Cayman: the 64-bit multiply is issued in
 * all four slots; the result pair lands in the temp register and is then
 * moved into the destination channel pair. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects the channel pair holding the double: 0 -> xy, 1 -> zw */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* slots 0-2 read source channel k*2+1, slot 3 reads channel k*2 */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the result from t1 into the enabled destination channels */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5169
5170/*
5171 * Emit RECIP_64 + MUL_64 to implement division.
5172 */
5173static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5174{
5175	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5176	int r;
5177	struct r600_bytecode_alu alu;
5178	int t1 = ctx->temp_reg;
5179	int k;
5180
5181	/* Only support one double at a time. This is the same constraint as
5182	 * in DMUL lowering. */
5183	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5184	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5185
5186	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5187
5188	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5189	if (r)
5190		return r;
5191
5192	for (int i = 0; i < 4; i++) {
5193		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5194		alu.op = ALU_OP2_MUL_64;
5195
5196		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5197
5198		alu.src[1].sel = t1;
5199		alu.src[1].chan = (i == 3) ? 0 : 1;
5200
5201		alu.dst.sel = t1;
5202		alu.dst.chan = i;
5203		alu.dst.write = 1;
5204		if (i == 3)
5205			alu.last = 1;
5206		r = r600_bytecode_add_alu(ctx->bc, &alu);
5207		if (r)
5208			return r;
5209	}
5210
5211	for (int i = 0; i < 2; i++) {
5212		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5213		alu.op = ALU_OP1_MOV;
5214		alu.src[0].sel = t1;
5215		alu.src[0].chan = i;
5216		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5217		alu.dst.write = 1;
5218		if (i == 1)
5219			alu.last = 1;
5220		r = r600_bytecode_add_alu(ctx->bc, &alu);
5221		if (r)
5222			return r;
5223	}
5224	return 0;
5225}
5226
5227/*
5228 * r600 - trunc to -PI..PI range
5229 * r700 - normalize by dividing by 2PI
5230 * see fdo bug 27901
5231 */
5232static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5233{
5234	int r;
5235	struct r600_bytecode_alu alu;
5236
5237	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5238	alu.op = ALU_OP3_MULADD;
5239	alu.is_op3 = 1;
5240
5241	alu.dst.chan = 0;
5242	alu.dst.sel = ctx->temp_reg;
5243	alu.dst.write = 1;
5244
5245	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5246
5247	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5248	alu.src[1].chan = 0;
5249	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5250	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5251	alu.src[2].chan = 0;
5252	alu.last = 1;
5253	r = r600_bytecode_add_alu(ctx->bc, &alu);
5254	if (r)
5255		return r;
5256
5257	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5258	alu.op = ALU_OP1_FRACT;
5259
5260	alu.dst.chan = 0;
5261	alu.dst.sel = ctx->temp_reg;
5262	alu.dst.write = 1;
5263
5264	alu.src[0].sel = ctx->temp_reg;
5265	alu.src[0].chan = 0;
5266	alu.last = 1;
5267	r = r600_bytecode_add_alu(ctx->bc, &alu);
5268	if (r)
5269		return r;
5270
5271	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5272	alu.op = ALU_OP3_MULADD;
5273	alu.is_op3 = 1;
5274
5275	alu.dst.chan = 0;
5276	alu.dst.sel = ctx->temp_reg;
5277	alu.dst.write = 1;
5278
5279	alu.src[0].sel = ctx->temp_reg;
5280	alu.src[0].chan = 0;
5281
5282	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5283	alu.src[1].chan = 0;
5284	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5285	alu.src[2].chan = 0;
5286
5287	if (ctx->bc->chip_class == R600) {
5288		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5289		alu.src[2].value = u_bitcast_f2u(-M_PI);
5290	} else {
5291		alu.src[1].sel = V_SQ_ALU_SRC_1;
5292		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5293		alu.src[2].neg = 1;
5294	}
5295
5296	alu.last = 1;
5297	r = r600_bytecode_add_alu(ctx->bc, &alu);
5298	if (r)
5299		return r;
5300	return 0;
5301}
5302
/* SIN/COS on Cayman: reduce the angle into temp_reg.x, then replicate the
 * trig op across the vector slots, masking writes with the destination
 * write mask. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		/* tgsi_dst() fills in the actual destination for this slot */
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		/* every slot reads the prepared angle from temp.x */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5333
/* SIN/COS on non-Cayman chips: reduce the angle, run the trig op once
 * into temp_reg.x, then replicate the scalar result into every written
 * destination channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = trig_op(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* src chan left at 0: the scalar result lives in temp.x */
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5376
5377static int tgsi_kill(struct r600_shader_ctx *ctx)
5378{
5379	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5380	struct r600_bytecode_alu alu;
5381	int i, r;
5382
5383	for (i = 0; i < 4; i++) {
5384		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5385		alu.op = ctx->inst_info->op;
5386
5387		alu.dst.chan = i;
5388
5389		alu.src[0].sel = V_SQ_ALU_SRC_0;
5390
5391		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5392			alu.src[1].sel = V_SQ_ALU_SRC_1;
5393			alu.src[1].neg = 1;
5394		} else {
5395			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5396		}
5397		if (i == 3) {
5398			alu.last = 1;
5399		}
5400		r = r600_bytecode_add_alu(ctx->bc, &alu);
5401		if (r)
5402			return r;
5403	}
5404
5405	/* kill must be last in ALU */
5406	ctx->bc->force_add_cf = 1;
5407	ctx->shader->uses_kill = TRUE;
5408	return 0;
5409}
5410
/* Emit TGSI LIT (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp2(MUL_LIT(log_clamped(max(src.y, 0)), src.w, src.x))
 *           (NOTE(review): relies on MUL_LIT hardware semantics for the
 *            src.x <= 0 special case — see the ISA docs)
 *   dst.w = 1.0
 * The LOG/MUL_LIT/EXP chain is only emitted when dst.z is written; on
 * Cayman, LOG and EXP are replicated over three slots since there is no
 * trans unit. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (differs per path) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
5560
5561static int tgsi_rsq(struct r600_shader_ctx *ctx)
5562{
5563	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5564	struct r600_bytecode_alu alu;
5565	int i, r;
5566
5567	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5568
5569	alu.op = ALU_OP1_RECIPSQRT_IEEE;
5570
5571	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5572		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5573		r600_bytecode_src_set_abs(&alu.src[i]);
5574	}
5575	alu.dst.sel = ctx->temp_reg;
5576	alu.dst.write = 1;
5577	alu.last = 1;
5578	r = r600_bytecode_add_alu(ctx->bc, &alu);
5579	if (r)
5580		return r;
5581	/* replicate result */
5582	return tgsi_helper_tempx_replicate(ctx);
5583}
5584
5585static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5586{
5587	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5588	struct r600_bytecode_alu alu;
5589	int i, r;
5590
5591	for (i = 0; i < 4; i++) {
5592		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5593		alu.src[0].sel = ctx->temp_reg;
5594		alu.op = ALU_OP1_MOV;
5595		alu.dst.chan = i;
5596		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5597		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5598		if (i == 3)
5599			alu.last = 1;
5600		r = r600_bytecode_add_alu(ctx->bc, &alu);
5601		if (r)
5602			return r;
5603	}
5604	return 0;
5605}
5606
5607static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5608{
5609	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5610	struct r600_bytecode_alu alu;
5611	int i, r;
5612
5613	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5614	alu.op = ctx->inst_info->op;
5615	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5616		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5617	}
5618	alu.dst.sel = ctx->temp_reg;
5619	alu.dst.write = 1;
5620	alu.last = 1;
5621	r = r600_bytecode_add_alu(ctx->bc, &alu);
5622	if (r)
5623		return r;
5624	/* replicate result */
5625	return tgsi_helper_tempx_replicate(ctx);
5626}
5627
/* POW(a,b) = EXP2(b * LOG2(a)) on Cayman; LOG and EXP are replicated
 * across the vector slots since there is no trans unit. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* tmp = LOG2(a.x), issued in slots 0-2 */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* EXP2 replicated across the slots, writes masked per channel */
	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5677
5678static int tgsi_pow(struct r600_shader_ctx *ctx)
5679{
5680	struct r600_bytecode_alu alu;
5681	int r;
5682
5683	/* LOG2(a) */
5684	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5685	alu.op = ALU_OP1_LOG_IEEE;
5686	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5687	alu.dst.sel = ctx->temp_reg;
5688	alu.dst.write = 1;
5689	alu.last = 1;
5690	r = r600_bytecode_add_alu(ctx->bc, &alu);
5691	if (r)
5692		return r;
5693	/* b * LOG2(a) */
5694	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5695	alu.op = ALU_OP2_MUL;
5696	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5697	alu.src[1].sel = ctx->temp_reg;
5698	alu.dst.sel = ctx->temp_reg;
5699	alu.dst.write = 1;
5700	alu.last = 1;
5701	r = r600_bytecode_add_alu(ctx->bc, &alu);
5702	if (r)
5703		return r;
5704	/* POW(a,b) = EXP2(b * LOG2(a))*/
5705	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5706	alu.op = ALU_OP1_EXP_IEEE;
5707	alu.src[0].sel = ctx->temp_reg;
5708	alu.dst.sel = ctx->temp_reg;
5709	alu.dst.write = 1;
5710	alu.last = 1;
5711	r = r600_bytecode_add_alu(ctx->bc, &alu);
5712	if (r)
5713		return r;
5714	return tgsi_helper_tempx_replicate(ctx);
5715}
5716
5717static int emit_mul_int_op(struct r600_bytecode *bc,
5718			   struct r600_bytecode_alu *alu_src)
5719{
5720	struct r600_bytecode_alu alu;
5721	int i, r;
5722	alu = *alu_src;
5723	if (bc->chip_class == CAYMAN) {
5724		for (i = 0; i < 4; i++) {
5725			alu.dst.chan = i;
5726			alu.dst.write = (i == alu_src->dst.chan);
5727			alu.last = (i == 3);
5728
5729			r = r600_bytecode_add_alu(bc, &alu);
5730			if (r)
5731				return r;
5732		}
5733	} else {
5734		alu.last = 1;
5735		r = r600_bytecode_add_alu(bc, &alu);
5736		if (r)
5737			return r;
5738	}
5739	return 0;
5740}
5741
5742static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5743{
5744	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5745	struct r600_bytecode_alu alu;
5746	int i, r, j;
5747	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5748	int lasti = tgsi_last_instruction(write_mask);
5749	int tmp0 = ctx->temp_reg;
5750	int tmp1 = r600_get_temp(ctx);
5751	int tmp2 = r600_get_temp(ctx);
5752	int tmp3 = r600_get_temp(ctx);
5753	int tmp4 = 0;
5754
5755	/* Use additional temp if dst register and src register are the same */
5756	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5757	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5758		tmp4 = r600_get_temp(ctx);
5759	}
5760
5761	/* Unsigned path:
5762	 *
5763	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5764	 *
5765	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5766	 * 2. tmp0.z = lo (tmp0.x * src2)
5767	 * 3. tmp0.w = -tmp0.z
5768	 * 4. tmp0.y = hi (tmp0.x * src2)
5769	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5770	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5771	 * 7. tmp1.x = tmp0.x - tmp0.w
5772	 * 8. tmp1.y = tmp0.x + tmp0.w
5773	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5774	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5775	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5776	 *
5777	 * 12. tmp0.w = src1 - tmp0.y       = r
5778	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5779	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5780	 *
5781	 * if DIV
5782	 *
5783	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5784	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5785	 *
5786	 * else MOD
5787	 *
5788	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5789	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5790	 *
5791	 * endif
5792	 *
5793	 * 17. tmp1.x = tmp1.x & tmp1.y
5794	 *
5795	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5796	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5797	 *
5798	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5799	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5800	 *
5801	 * Signed path:
5802	 *
5803	 * Same as unsigned, using abs values of the operands,
5804	 * and fixing the sign of the result in the end.
5805	 */
5806
5807	for (i = 0; i < 4; i++) {
5808		if (!(write_mask & (1<<i)))
5809			continue;
5810
5811		if (signed_op) {
5812
5813			/* tmp2.x = -src0 */
5814			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5815			alu.op = ALU_OP2_SUB_INT;
5816
5817			alu.dst.sel = tmp2;
5818			alu.dst.chan = 0;
5819			alu.dst.write = 1;
5820
5821			alu.src[0].sel = V_SQ_ALU_SRC_0;
5822
5823			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5824
5825			alu.last = 1;
5826			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5827				return r;
5828
5829			/* tmp2.y = -src1 */
5830			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5831			alu.op = ALU_OP2_SUB_INT;
5832
5833			alu.dst.sel = tmp2;
5834			alu.dst.chan = 1;
5835			alu.dst.write = 1;
5836
5837			alu.src[0].sel = V_SQ_ALU_SRC_0;
5838
5839			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5840
5841			alu.last = 1;
5842			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5843				return r;
5844
5845			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5846			/* it will be a sign of the quotient */
5847			if (!mod) {
5848
5849				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5850				alu.op = ALU_OP2_XOR_INT;
5851
5852				alu.dst.sel = tmp2;
5853				alu.dst.chan = 2;
5854				alu.dst.write = 1;
5855
5856				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5857				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5858
5859				alu.last = 1;
5860				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5861					return r;
5862			}
5863
5864			/* tmp2.x = |src0| */
5865			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5866			alu.op = ALU_OP3_CNDGE_INT;
5867			alu.is_op3 = 1;
5868
5869			alu.dst.sel = tmp2;
5870			alu.dst.chan = 0;
5871			alu.dst.write = 1;
5872
5873			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5874			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5875			alu.src[2].sel = tmp2;
5876			alu.src[2].chan = 0;
5877
5878			alu.last = 1;
5879			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5880				return r;
5881
5882			/* tmp2.y = |src1| */
5883			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5884			alu.op = ALU_OP3_CNDGE_INT;
5885			alu.is_op3 = 1;
5886
5887			alu.dst.sel = tmp2;
5888			alu.dst.chan = 1;
5889			alu.dst.write = 1;
5890
5891			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5892			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5893			alu.src[2].sel = tmp2;
5894			alu.src[2].chan = 1;
5895
5896			alu.last = 1;
5897			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5898				return r;
5899
5900		}
5901
5902		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
5903		if (ctx->bc->chip_class == CAYMAN) {
5904			/* tmp3.x = u2f(src2) */
5905			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5906			alu.op = ALU_OP1_UINT_TO_FLT;
5907
5908			alu.dst.sel = tmp3;
5909			alu.dst.chan = 0;
5910			alu.dst.write = 1;
5911
5912			if (signed_op) {
5913				alu.src[0].sel = tmp2;
5914				alu.src[0].chan = 1;
5915			} else {
5916				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5917			}
5918
5919			alu.last = 1;
5920			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5921				return r;
5922
5923			/* tmp0.x = recip(tmp3.x) */
5924			for (j = 0 ; j < 3; j++) {
5925				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5926				alu.op = ALU_OP1_RECIP_IEEE;
5927
5928				alu.dst.sel = tmp0;
5929				alu.dst.chan = j;
5930				alu.dst.write = (j == 0);
5931
5932				alu.src[0].sel = tmp3;
5933				alu.src[0].chan = 0;
5934
5935				if (j == 2)
5936					alu.last = 1;
5937				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5938					return r;
5939			}
5940
5941			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5942			alu.op = ALU_OP2_MUL;
5943
5944			alu.src[0].sel = tmp0;
5945			alu.src[0].chan = 0;
5946
5947			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5948			alu.src[1].value = 0x4f800000;
5949
5950			alu.dst.sel = tmp3;
5951			alu.dst.write = 1;
5952			alu.last = 1;
5953			r = r600_bytecode_add_alu(ctx->bc, &alu);
5954			if (r)
5955				return r;
5956
5957			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5958			alu.op = ALU_OP1_FLT_TO_UINT;
5959
5960			alu.dst.sel = tmp0;
5961			alu.dst.chan = 0;
5962			alu.dst.write = 1;
5963
5964			alu.src[0].sel = tmp3;
5965			alu.src[0].chan = 0;
5966
5967			alu.last = 1;
5968			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5969				return r;
5970
5971		} else {
5972			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5973			alu.op = ALU_OP1_RECIP_UINT;
5974
5975			alu.dst.sel = tmp0;
5976			alu.dst.chan = 0;
5977			alu.dst.write = 1;
5978
5979			if (signed_op) {
5980				alu.src[0].sel = tmp2;
5981				alu.src[0].chan = 1;
5982			} else {
5983				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5984			}
5985
5986			alu.last = 1;
5987			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5988				return r;
5989		}
5990
5991		/* 2. tmp0.z = lo (tmp0.x * src2) */
5992		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5993		alu.op = ALU_OP2_MULLO_UINT;
5994
5995		alu.dst.sel = tmp0;
5996		alu.dst.chan = 2;
5997		alu.dst.write = 1;
5998
5999		alu.src[0].sel = tmp0;
6000		alu.src[0].chan = 0;
6001		if (signed_op) {
6002			alu.src[1].sel = tmp2;
6003			alu.src[1].chan = 1;
6004		} else {
6005			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6006		}
6007
6008		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6009			return r;
6010
6011		/* 3. tmp0.w = -tmp0.z */
6012		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6013		alu.op = ALU_OP2_SUB_INT;
6014
6015		alu.dst.sel = tmp0;
6016		alu.dst.chan = 3;
6017		alu.dst.write = 1;
6018
6019		alu.src[0].sel = V_SQ_ALU_SRC_0;
6020		alu.src[1].sel = tmp0;
6021		alu.src[1].chan = 2;
6022
6023		alu.last = 1;
6024		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6025			return r;
6026
6027		/* 4. tmp0.y = hi (tmp0.x * src2) */
6028		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6029		alu.op = ALU_OP2_MULHI_UINT;
6030
6031		alu.dst.sel = tmp0;
6032		alu.dst.chan = 1;
6033		alu.dst.write = 1;
6034
6035		alu.src[0].sel = tmp0;
6036		alu.src[0].chan = 0;
6037
6038		if (signed_op) {
6039			alu.src[1].sel = tmp2;
6040			alu.src[1].chan = 1;
6041		} else {
6042			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6043		}
6044
6045		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6046			return r;
6047
6048		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
6049		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6050		alu.op = ALU_OP3_CNDE_INT;
6051		alu.is_op3 = 1;
6052
6053		alu.dst.sel = tmp0;
6054		alu.dst.chan = 2;
6055		alu.dst.write = 1;
6056
6057		alu.src[0].sel = tmp0;
6058		alu.src[0].chan = 1;
6059		alu.src[1].sel = tmp0;
6060		alu.src[1].chan = 3;
6061		alu.src[2].sel = tmp0;
6062		alu.src[2].chan = 2;
6063
6064		alu.last = 1;
6065		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6066			return r;
6067
6068		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
6069		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6070		alu.op = ALU_OP2_MULHI_UINT;
6071
6072		alu.dst.sel = tmp0;
6073		alu.dst.chan = 3;
6074		alu.dst.write = 1;
6075
6076		alu.src[0].sel = tmp0;
6077		alu.src[0].chan = 2;
6078
6079		alu.src[1].sel = tmp0;
6080		alu.src[1].chan = 0;
6081
6082		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6083				return r;
6084
6085		/* 7. tmp1.x = tmp0.x - tmp0.w */
6086		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6087		alu.op = ALU_OP2_SUB_INT;
6088
6089		alu.dst.sel = tmp1;
6090		alu.dst.chan = 0;
6091		alu.dst.write = 1;
6092
6093		alu.src[0].sel = tmp0;
6094		alu.src[0].chan = 0;
6095		alu.src[1].sel = tmp0;
6096		alu.src[1].chan = 3;
6097
6098		alu.last = 1;
6099		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6100			return r;
6101
6102		/* 8. tmp1.y = tmp0.x + tmp0.w */
6103		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6104		alu.op = ALU_OP2_ADD_INT;
6105
6106		alu.dst.sel = tmp1;
6107		alu.dst.chan = 1;
6108		alu.dst.write = 1;
6109
6110		alu.src[0].sel = tmp0;
6111		alu.src[0].chan = 0;
6112		alu.src[1].sel = tmp0;
6113		alu.src[1].chan = 3;
6114
6115		alu.last = 1;
6116		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6117			return r;
6118
6119		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6120		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6121		alu.op = ALU_OP3_CNDE_INT;
6122		alu.is_op3 = 1;
6123
6124		alu.dst.sel = tmp0;
6125		alu.dst.chan = 0;
6126		alu.dst.write = 1;
6127
6128		alu.src[0].sel = tmp0;
6129		alu.src[0].chan = 1;
6130		alu.src[1].sel = tmp1;
6131		alu.src[1].chan = 1;
6132		alu.src[2].sel = tmp1;
6133		alu.src[2].chan = 0;
6134
6135		alu.last = 1;
6136		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6137			return r;
6138
6139		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
6140		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6141		alu.op = ALU_OP2_MULHI_UINT;
6142
6143		alu.dst.sel = tmp0;
6144		alu.dst.chan = 2;
6145		alu.dst.write = 1;
6146
6147		alu.src[0].sel = tmp0;
6148		alu.src[0].chan = 0;
6149
6150		if (signed_op) {
6151			alu.src[1].sel = tmp2;
6152			alu.src[1].chan = 0;
6153		} else {
6154			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6155		}
6156
6157		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6158			return r;
6159
6160		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
6161		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6162		alu.op = ALU_OP2_MULLO_UINT;
6163
6164		alu.dst.sel = tmp0;
6165		alu.dst.chan = 1;
6166		alu.dst.write = 1;
6167
6168		if (signed_op) {
6169			alu.src[0].sel = tmp2;
6170			alu.src[0].chan = 1;
6171		} else {
6172			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6173		}
6174
6175		alu.src[1].sel = tmp0;
6176		alu.src[1].chan = 2;
6177
6178		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6179			return r;
6180
6181		/* 12. tmp0.w = src1 - tmp0.y       = r */
6182		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6183		alu.op = ALU_OP2_SUB_INT;
6184
6185		alu.dst.sel = tmp0;
6186		alu.dst.chan = 3;
6187		alu.dst.write = 1;
6188
6189		if (signed_op) {
6190			alu.src[0].sel = tmp2;
6191			alu.src[0].chan = 0;
6192		} else {
6193			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6194		}
6195
6196		alu.src[1].sel = tmp0;
6197		alu.src[1].chan = 1;
6198
6199		alu.last = 1;
6200		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6201			return r;
6202
6203		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
6204		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6205		alu.op = ALU_OP2_SETGE_UINT;
6206
6207		alu.dst.sel = tmp1;
6208		alu.dst.chan = 0;
6209		alu.dst.write = 1;
6210
6211		alu.src[0].sel = tmp0;
6212		alu.src[0].chan = 3;
6213		if (signed_op) {
6214			alu.src[1].sel = tmp2;
6215			alu.src[1].chan = 1;
6216		} else {
6217			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6218		}
6219
6220		alu.last = 1;
6221		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6222			return r;
6223
6224		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
6225		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6226		alu.op = ALU_OP2_SETGE_UINT;
6227
6228		alu.dst.sel = tmp1;
6229		alu.dst.chan = 1;
6230		alu.dst.write = 1;
6231
6232		if (signed_op) {
6233			alu.src[0].sel = tmp2;
6234			alu.src[0].chan = 0;
6235		} else {
6236			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6237		}
6238
6239		alu.src[1].sel = tmp0;
6240		alu.src[1].chan = 1;
6241
6242		alu.last = 1;
6243		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6244			return r;
6245
6246		if (mod) { /* UMOD */
6247
6248			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
6249			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6250			alu.op = ALU_OP2_SUB_INT;
6251
6252			alu.dst.sel = tmp1;
6253			alu.dst.chan = 2;
6254			alu.dst.write = 1;
6255
6256			alu.src[0].sel = tmp0;
6257			alu.src[0].chan = 3;
6258
6259			if (signed_op) {
6260				alu.src[1].sel = tmp2;
6261				alu.src[1].chan = 1;
6262			} else {
6263				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6264			}
6265
6266			alu.last = 1;
6267			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6268				return r;
6269
6270			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
6271			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6272			alu.op = ALU_OP2_ADD_INT;
6273
6274			alu.dst.sel = tmp1;
6275			alu.dst.chan = 3;
6276			alu.dst.write = 1;
6277
6278			alu.src[0].sel = tmp0;
6279			alu.src[0].chan = 3;
6280			if (signed_op) {
6281				alu.src[1].sel = tmp2;
6282				alu.src[1].chan = 1;
6283			} else {
6284				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6285			}
6286
6287			alu.last = 1;
6288			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6289				return r;
6290
6291		} else { /* UDIV */
6292
6293			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
6294			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6295			alu.op = ALU_OP2_ADD_INT;
6296
6297			alu.dst.sel = tmp1;
6298			alu.dst.chan = 2;
6299			alu.dst.write = 1;
6300
6301			alu.src[0].sel = tmp0;
6302			alu.src[0].chan = 2;
6303			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6304
6305			alu.last = 1;
6306			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6307				return r;
6308
6309			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
6310			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6311			alu.op = ALU_OP2_ADD_INT;
6312
6313			alu.dst.sel = tmp1;
6314			alu.dst.chan = 3;
6315			alu.dst.write = 1;
6316
6317			alu.src[0].sel = tmp0;
6318			alu.src[0].chan = 2;
6319			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6320
6321			alu.last = 1;
6322			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6323				return r;
6324
6325		}
6326
6327		/* 17. tmp1.x = tmp1.x & tmp1.y */
6328		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6329		alu.op = ALU_OP2_AND_INT;
6330
6331		alu.dst.sel = tmp1;
6332		alu.dst.chan = 0;
6333		alu.dst.write = 1;
6334
6335		alu.src[0].sel = tmp1;
6336		alu.src[0].chan = 0;
6337		alu.src[1].sel = tmp1;
6338		alu.src[1].chan = 1;
6339
6340		alu.last = 1;
6341		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6342			return r;
6343
6344		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
6345		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
6346		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6347		alu.op = ALU_OP3_CNDE_INT;
6348		alu.is_op3 = 1;
6349
6350		alu.dst.sel = tmp0;
6351		alu.dst.chan = 2;
6352		alu.dst.write = 1;
6353
6354		alu.src[0].sel = tmp1;
6355		alu.src[0].chan = 0;
6356		alu.src[1].sel = tmp0;
6357		alu.src[1].chan = mod ? 3 : 2;
6358		alu.src[2].sel = tmp1;
6359		alu.src[2].chan = 2;
6360
6361		alu.last = 1;
6362		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6363			return r;
6364
6365		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6366		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6367		alu.op = ALU_OP3_CNDE_INT;
6368		alu.is_op3 = 1;
6369
6370		if (signed_op) {
6371			alu.dst.sel = tmp0;
6372			alu.dst.chan = 2;
6373			alu.dst.write = 1;
6374		} else {
6375			if (tmp4 > 0) {
6376				alu.dst.sel = tmp4;
6377				alu.dst.chan = i;
6378				alu.dst.write = 1;
6379			} else {
6380				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6381			}
6382		}
6383
6384		alu.src[0].sel = tmp1;
6385		alu.src[0].chan = 1;
6386		alu.src[1].sel = tmp1;
6387		alu.src[1].chan = 3;
6388		alu.src[2].sel = tmp0;
6389		alu.src[2].chan = 2;
6390
6391		alu.last = 1;
6392		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6393			return r;
6394
6395		if (signed_op) {
6396
6397			/* fix the sign of the result */
6398
6399			if (mod) {
6400
6401				/* tmp0.x = -tmp0.z */
6402				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6403				alu.op = ALU_OP2_SUB_INT;
6404
6405				alu.dst.sel = tmp0;
6406				alu.dst.chan = 0;
6407				alu.dst.write = 1;
6408
6409				alu.src[0].sel = V_SQ_ALU_SRC_0;
6410				alu.src[1].sel = tmp0;
6411				alu.src[1].chan = 2;
6412
6413				alu.last = 1;
6414				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6415					return r;
6416
6417				/* sign of the remainder is the same as the sign of src0 */
6418				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6419				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6420				alu.op = ALU_OP3_CNDGE_INT;
6421				alu.is_op3 = 1;
6422
6423				if (tmp4 > 0) {
6424					alu.dst.sel = tmp4;
6425					alu.dst.chan = i;
6426					alu.dst.write = 1;
6427				} else {
6428					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6429				}
6430
6431				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6432				alu.src[1].sel = tmp0;
6433				alu.src[1].chan = 2;
6434				alu.src[2].sel = tmp0;
6435				alu.src[2].chan = 0;
6436
6437				alu.last = 1;
6438				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6439					return r;
6440
6441			} else {
6442
6443				/* tmp0.x = -tmp0.z */
6444				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6445				alu.op = ALU_OP2_SUB_INT;
6446
6447				alu.dst.sel = tmp0;
6448				alu.dst.chan = 0;
6449				alu.dst.write = 1;
6450
6451				alu.src[0].sel = V_SQ_ALU_SRC_0;
6452				alu.src[1].sel = tmp0;
6453				alu.src[1].chan = 2;
6454
6455				alu.last = 1;
6456				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6457					return r;
6458
6459				/* fix the quotient sign (same as the sign of src0*src1) */
6460				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6461				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6462				alu.op = ALU_OP3_CNDGE_INT;
6463				alu.is_op3 = 1;
6464
6465				if (tmp4 > 0) {
6466					alu.dst.sel = tmp4;
6467					alu.dst.chan = i;
6468					alu.dst.write = 1;
6469				} else {
6470					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6471				}
6472
6473				alu.src[0].sel = tmp2;
6474				alu.src[0].chan = 2;
6475				alu.src[1].sel = tmp0;
6476				alu.src[1].chan = 2;
6477				alu.src[2].sel = tmp0;
6478				alu.src[2].chan = 0;
6479
6480				alu.last = 1;
6481				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6482					return r;
6483			}
6484		}
6485	}
6486
6487	if (tmp4 > 0) {
6488		for (i = 0; i <= lasti; ++i) {
6489			if (!(write_mask & (1<<i)))
6490				continue;
6491
6492			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6493			alu.op = ALU_OP1_MOV;
6494			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6495			alu.src[0].sel = tmp4;
6496			alu.src[0].chan = i;
6497
6498			if (i == lasti)
6499				alu.last = 1;
6500			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6501				return r;
6502		}
6503	}
6504
6505	return 0;
6506}
6507
/* TGSI UDIV: unsigned 32-bit quotient, via the shared div/mod emitter. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0, operands_signed = 0;
	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6512
/* TGSI UMOD: unsigned 32-bit remainder, via the shared div/mod emitter. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1, operands_signed = 0;
	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6517
/* TGSI IDIV: signed 32-bit quotient, via the shared div/mod emitter. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0, operands_signed = 1;
	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6522
/* TGSI IMOD: signed 32-bit remainder, via the shared div/mod emitter. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1, operands_signed = 1;
	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6527
6528
6529static int tgsi_f2i(struct r600_shader_ctx *ctx)
6530{
6531	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6532	struct r600_bytecode_alu alu;
6533	int i, r;
6534	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6535	int last_inst = tgsi_last_instruction(write_mask);
6536
6537	for (i = 0; i < 4; i++) {
6538		if (!(write_mask & (1<<i)))
6539			continue;
6540
6541		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6542		alu.op = ALU_OP1_TRUNC;
6543
6544		alu.dst.sel = ctx->temp_reg;
6545		alu.dst.chan = i;
6546		alu.dst.write = 1;
6547
6548		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6549		if (i == last_inst)
6550			alu.last = 1;
6551		r = r600_bytecode_add_alu(ctx->bc, &alu);
6552		if (r)
6553			return r;
6554	}
6555
6556	for (i = 0; i < 4; i++) {
6557		if (!(write_mask & (1<<i)))
6558			continue;
6559
6560		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6561		alu.op = ctx->inst_info->op;
6562
6563		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6564
6565		alu.src[0].sel = ctx->temp_reg;
6566		alu.src[0].chan = i;
6567
6568		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6569			alu.last = 1;
6570		r = r600_bytecode_add_alu(ctx->bc, &alu);
6571		if (r)
6572			return r;
6573	}
6574
6575	return 0;
6576}
6577
6578static int tgsi_iabs(struct r600_shader_ctx *ctx)
6579{
6580	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6581	struct r600_bytecode_alu alu;
6582	int i, r;
6583	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6584	int last_inst = tgsi_last_instruction(write_mask);
6585
6586	/* tmp = -src */
6587	for (i = 0; i < 4; i++) {
6588		if (!(write_mask & (1<<i)))
6589			continue;
6590
6591		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6592		alu.op = ALU_OP2_SUB_INT;
6593
6594		alu.dst.sel = ctx->temp_reg;
6595		alu.dst.chan = i;
6596		alu.dst.write = 1;
6597
6598		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6599		alu.src[0].sel = V_SQ_ALU_SRC_0;
6600
6601		if (i == last_inst)
6602			alu.last = 1;
6603		r = r600_bytecode_add_alu(ctx->bc, &alu);
6604		if (r)
6605			return r;
6606	}
6607
6608	/* dst = (src >= 0 ? src : tmp) */
6609	for (i = 0; i < 4; i++) {
6610		if (!(write_mask & (1<<i)))
6611			continue;
6612
6613		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6614		alu.op = ALU_OP3_CNDGE_INT;
6615		alu.is_op3 = 1;
6616		alu.dst.write = 1;
6617
6618		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6619
6620		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6621		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6622		alu.src[2].sel = ctx->temp_reg;
6623		alu.src[2].chan = i;
6624
6625		if (i == last_inst)
6626			alu.last = 1;
6627		r = r600_bytecode_add_alu(ctx->bc, &alu);
6628		if (r)
6629			return r;
6630	}
6631	return 0;
6632}
6633
6634static int tgsi_issg(struct r600_shader_ctx *ctx)
6635{
6636	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6637	struct r600_bytecode_alu alu;
6638	int i, r;
6639	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6640	int last_inst = tgsi_last_instruction(write_mask);
6641
6642	/* tmp = (src >= 0 ? src : -1) */
6643	for (i = 0; i < 4; i++) {
6644		if (!(write_mask & (1<<i)))
6645			continue;
6646
6647		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6648		alu.op = ALU_OP3_CNDGE_INT;
6649		alu.is_op3 = 1;
6650
6651		alu.dst.sel = ctx->temp_reg;
6652		alu.dst.chan = i;
6653		alu.dst.write = 1;
6654
6655		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6656		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6657		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6658
6659		if (i == last_inst)
6660			alu.last = 1;
6661		r = r600_bytecode_add_alu(ctx->bc, &alu);
6662		if (r)
6663			return r;
6664	}
6665
6666	/* dst = (tmp > 0 ? 1 : tmp) */
6667	for (i = 0; i < 4; i++) {
6668		if (!(write_mask & (1<<i)))
6669			continue;
6670
6671		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6672		alu.op = ALU_OP3_CNDGT_INT;
6673		alu.is_op3 = 1;
6674		alu.dst.write = 1;
6675
6676		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6677
6678		alu.src[0].sel = ctx->temp_reg;
6679		alu.src[0].chan = i;
6680
6681		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6682
6683		alu.src[2].sel = ctx->temp_reg;
6684		alu.src[2].chan = i;
6685
6686		if (i == last_inst)
6687			alu.last = 1;
6688		r = r600_bytecode_add_alu(ctx->bc, &alu);
6689		if (r)
6690			return r;
6691	}
6692	return 0;
6693}
6694
6695
6696
6697static int tgsi_ssg(struct r600_shader_ctx *ctx)
6698{
6699	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6700	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6701	int last_inst = tgsi_last_instruction(write_mask);
6702	struct r600_bytecode_alu alu;
6703	int i, r;
6704
6705	/* tmp = (src > 0 ? 1 : src) */
6706	for (i = 0; i <= last_inst; i++) {
6707		if (!(write_mask & (1 << i)))
6708			continue;
6709		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6710		alu.op = ALU_OP3_CNDGT;
6711		alu.is_op3 = 1;
6712
6713		alu.dst.sel = ctx->temp_reg;
6714		alu.dst.chan = i;
6715
6716		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6717		alu.src[1].sel = V_SQ_ALU_SRC_1;
6718		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6719
6720		if (i == last_inst)
6721			alu.last = 1;
6722		r = r600_bytecode_add_alu(ctx->bc, &alu);
6723		if (r)
6724			return r;
6725	}
6726
6727	/* dst = (-tmp > 0 ? -1 : tmp) */
6728	for (i = 0; i <= last_inst; i++) {
6729		if (!(write_mask & (1 << i)))
6730			continue;
6731		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6732		alu.op = ALU_OP3_CNDGT;
6733		alu.is_op3 = 1;
6734		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6735
6736		alu.src[0].sel = ctx->temp_reg;
6737		alu.src[0].chan = i;
6738		alu.src[0].neg = 1;
6739
6740		alu.src[1].sel = V_SQ_ALU_SRC_1;
6741		alu.src[1].neg = 1;
6742
6743		alu.src[2].sel = ctx->temp_reg;
6744		alu.src[2].chan = i;
6745
6746		if (i == last_inst)
6747			alu.last = 1;
6748		r = r600_bytecode_add_alu(ctx->bc, &alu);
6749		if (r)
6750			return r;
6751	}
6752	return 0;
6753}
6754
/* Bitfield insert.  As consumed below: src[0] = base value, src[1] = value
 * to insert, src[2] = bit offset, src[3] = bit count.
 * NOTE(review): operand roles inferred from how the sources are used here;
 * confirm against the TGSI BFI opcode definition.
 * Emitted as: mask = BFM(count, offset); shifted = insert << offset;
 * dst = BFI(mask, shifted, base); then, if count >= 32, the masked insert
 * is bypassed and the raw insert value is used instead (presumably because
 * BFM cannot express a full 32-bit mask). */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (count >= 32): predicate consumed by the final CNDE. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1 = BFM(count, offset) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2 = insert << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst = BFI(t1 mask, t2 shifted insert, base) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (count < 32) ? dst (the BFI result, re-read from the
	 * destination register written above) : raw insert value. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6872
/* Most-significant-bit position, built on the hardware FFBH ops.
 * FFBH counts from the MSB while TGSI wants an LSB-relative index, so a
 * non-negative result is reflected as 31 - x; negative FFBH results
 * (presumably the "no bit found" encoding) are passed through unchanged. */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* Only the two FFBH variants route here (signed/unsigned MSB). */
	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 (reflect the MSB-relative count to LSB-relative) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1  (keep negative "not found" as-is) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6958
/* Emit the TGSI_OPCODE_INTERP_* family ("egcm" per the name — presumably
 * Evergreen/Cayman).  INTERP_OFFSET and INTERP_SAMPLE first adjust the
 * interpolant pair using the screen-space gradients, then all variants run
 * the two-group INTERP_ZW/INTERP_XY sequence and copy the (swizzled)
 * result to the TGSI destination. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* Each allocated interpolator pair shares a GPR: even ij_index in the
	 * xy channels, odd in zw (hence the /2 and 2*(…%2) below). */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		/* For INTERP_SAMPLE the offset comes from the sample position
		 * table rather than from a TGSI source. */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch horizontal and vertical gradients of the I/J pair into
		 * two temps (z/w destination channels are masked with 7). */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = gradH * offset.x + I/J  (offset.x is the sample
		 * position's chan 2 for INTERP_SAMPLE, else src[1].x). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy = gradV * offset.y + temp.xy  (accumulates the
		 * vertical component onto the adjusted I/J). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Two 4-slot ALU groups: INTERP_ZW (writes tmp.zw) then INTERP_XY
	 * (writes tmp.xy); only slots 2-5 of the 8 have dst.write set.
	 * Each slot's src[0] alternates J/I (1 - i%2) from the adjusted pair
	 * (or the raw interpolant GPR), and src[1] is the attribute's
	 * parameter.  The forced 210 bank swizzle is kept as-is —
	 * NOTE(review): presumably an operand-ordering requirement of the
	 * INTERP instructions; confirm against the ISA docs. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
7120
7121
7122static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7123{
7124	struct r600_bytecode_alu alu;
7125	int i, r;
7126
7127	for (i = 0; i < 4; i++) {
7128		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7129		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7130			alu.op = ALU_OP0_NOP;
7131			alu.dst.chan = i;
7132		} else {
7133			alu.op = ALU_OP1_MOV;
7134			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7135			alu.src[0].sel = ctx->temp_reg;
7136			alu.src[0].chan = i;
7137		}
7138		if (i == 3) {
7139			alu.last = 1;
7140		}
7141		r = r600_bytecode_add_alu(ctx->bc, &alu);
7142		if (r)
7143			return r;
7144	}
7145	return 0;
7146}
7147
7148static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7149                                 unsigned writemask,
7150                                 struct r600_bytecode_alu_src *bc_src,
7151                                 const struct r600_shader_src *shader_src)
7152{
7153	struct r600_bytecode_alu alu;
7154	int i, r;
7155	int lasti = tgsi_last_instruction(writemask);
7156	int temp_reg = 0;
7157
7158	r600_bytecode_src(&bc_src[0], shader_src, 0);
7159	r600_bytecode_src(&bc_src[1], shader_src, 1);
7160	r600_bytecode_src(&bc_src[2], shader_src, 2);
7161	r600_bytecode_src(&bc_src[3], shader_src, 3);
7162
7163	if (bc_src->abs) {
7164		temp_reg = r600_get_temp(ctx);
7165
7166		for (i = 0; i < lasti + 1; i++) {
7167			if (!(writemask & (1 << i)))
7168				continue;
7169			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7170			alu.op = ALU_OP1_MOV;
7171			alu.dst.sel = temp_reg;
7172			alu.dst.chan = i;
7173			alu.dst.write = 1;
7174			alu.src[0] = bc_src[i];
7175			if (i == lasti) {
7176				alu.last = 1;
7177			}
7178			r = r600_bytecode_add_alu(ctx->bc, &alu);
7179			if (r)
7180				return r;
7181			memset(&bc_src[i], 0, sizeof(*bc_src));
7182			bc_src[i].sel = temp_reg;
7183			bc_src[i].chan = i;
7184		}
7185	}
7186	return 0;
7187}
7188
7189static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7190{
7191	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7192	struct r600_bytecode_alu alu;
7193	struct r600_bytecode_alu_src srcs[4][4];
7194	int i, j, r;
7195	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7196	unsigned op = ctx->inst_info->op;
7197
7198	if (op == ALU_OP3_MULADD_IEEE &&
7199	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7200		op = ALU_OP3_MULADD;
7201
7202	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7203		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7204					  srcs[j], &ctx->src[j]);
7205		if (r)
7206			return r;
7207	}
7208
7209	for (i = 0; i < lasti + 1; i++) {
7210		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7211			continue;
7212
7213		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7214		alu.op = op;
7215		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7216			alu.src[j] = srcs[j][i];
7217		}
7218
7219		if (dst == -1) {
7220			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7221		} else {
7222			alu.dst.sel = dst;
7223		}
7224		alu.dst.chan = i;
7225		alu.dst.write = 1;
7226		alu.is_op3 = 1;
7227		if (i == lasti) {
7228			alu.last = 1;
7229		}
7230		r = r600_bytecode_add_alu(ctx->bc, &alu);
7231		if (r)
7232			return r;
7233	}
7234	return 0;
7235}
7236
/* Three-source ALU op writing the instruction's own TGSI destination. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	const int use_tgsi_dst = -1;
	return tgsi_op3_dst(ctx, use_tgsi_dst);
}
7241
7242static int tgsi_dp(struct r600_shader_ctx *ctx)
7243{
7244	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7245	struct r600_bytecode_alu alu;
7246	int i, j, r;
7247	unsigned op = ctx->inst_info->op;
7248	if (op == ALU_OP2_DOT4_IEEE &&
7249	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7250		op = ALU_OP2_DOT4;
7251
7252	for (i = 0; i < 4; i++) {
7253		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7254		alu.op = op;
7255		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7256			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7257		}
7258
7259		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7260		alu.dst.chan = i;
7261		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7262		/* handle some special cases */
7263		switch (inst->Instruction.Opcode) {
7264		case TGSI_OPCODE_DP2:
7265			if (i > 1) {
7266				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7267				alu.src[0].chan = alu.src[1].chan = 0;
7268			}
7269			break;
7270		case TGSI_OPCODE_DP3:
7271			if (i > 2) {
7272				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7273				alu.src[0].chan = alu.src[1].chan = 0;
7274			}
7275			break;
7276		default:
7277			break;
7278		}
7279		if (i == 3) {
7280			alu.last = 1;
7281		}
7282		r = r600_bytecode_add_alu(ctx->bc, &alu);
7283		if (r)
7284			return r;
7285	}
7286	return 0;
7287}
7288
7289static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7290						    unsigned index)
7291{
7292	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7293	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7294		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7295		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7296		ctx->src[index].neg || ctx->src[index].abs ||
7297		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7298}
7299
7300static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7301					unsigned index)
7302{
7303	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7304	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7305}
7306
7307static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7308{
7309	struct r600_bytecode_vtx vtx;
7310	struct r600_bytecode_alu alu;
7311	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7312	int src_gpr, r, i;
7313	int id = tgsi_tex_get_src_gpr(ctx, 1);
7314	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7315
7316	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7317	if (src_requires_loading) {
7318		for (i = 0; i < 4; i++) {
7319			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7320			alu.op = ALU_OP1_MOV;
7321			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7322			alu.dst.sel = ctx->temp_reg;
7323			alu.dst.chan = i;
7324			if (i == 3)
7325				alu.last = 1;
7326			alu.dst.write = 1;
7327			r = r600_bytecode_add_alu(ctx->bc, &alu);
7328			if (r)
7329				return r;
7330		}
7331		src_gpr = ctx->temp_reg;
7332	}
7333
7334	memset(&vtx, 0, sizeof(vtx));
7335	vtx.op = FETCH_OP_VFETCH;
7336	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7337	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7338	vtx.src_gpr = src_gpr;
7339	vtx.mega_fetch_count = 16;
7340	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7341	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
7342	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
7343	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
7344	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
7345	vtx.use_const_fields = 1;
7346	vtx.buffer_index_mode = sampler_index_mode;
7347
7348	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7349		return r;
7350
7351	if (ctx->bc->chip_class >= EVERGREEN)
7352		return 0;
7353
7354	for (i = 0; i < 4; i++) {
7355		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7356		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7357			continue;
7358
7359		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7360		alu.op = ALU_OP2_AND_INT;
7361
7362		alu.dst.chan = i;
7363		alu.dst.sel = vtx.dst_gpr;
7364		alu.dst.write = 1;
7365
7366		alu.src[0].sel = vtx.dst_gpr;
7367		alu.src[0].chan = i;
7368
7369		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7370		alu.src[1].sel += (id * 2);
7371		alu.src[1].chan = i % 4;
7372		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7373
7374		if (i == lasti)
7375			alu.last = 1;
7376		r = r600_bytecode_add_alu(ctx->bc, &alu);
7377		if (r)
7378			return r;
7379	}
7380
7381	if (inst->Dst[0].Register.WriteMask & 3) {
7382		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7383		alu.op = ALU_OP2_OR_INT;
7384
7385		alu.dst.chan = 3;
7386		alu.dst.sel = vtx.dst_gpr;
7387		alu.dst.write = 1;
7388
7389		alu.src[0].sel = vtx.dst_gpr;
7390		alu.src[0].chan = 3;
7391
7392		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7393		alu.src[1].chan = 0;
7394		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7395
7396		alu.last = 1;
7397		r = r600_bytecode_add_alu(ctx->bc, &alu);
7398		if (r)
7399			return r;
7400	}
7401	return 0;
7402}
7403
7404static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7405{
7406	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7407	int r;
7408	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7409	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7410
7411	if (ctx->bc->chip_class < EVERGREEN) {
7412		struct r600_bytecode_alu alu;
7413		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7414		alu.op = ALU_OP1_MOV;
7415		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7416		/* r600 we have them at channel 2 of the second dword */
7417		alu.src[0].sel += (id * 2) + 1;
7418		alu.src[0].chan = 1;
7419		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7420		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7421		alu.last = 1;
7422		r = r600_bytecode_add_alu(ctx->bc, &alu);
7423		if (r)
7424			return r;
7425		return 0;
7426	} else {
7427		struct r600_bytecode_vtx vtx;
7428		memset(&vtx, 0, sizeof(vtx));
7429		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7430		vtx.buffer_id = id + eg_buffer_base;
7431		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7432		vtx.src_gpr = 0;
7433		vtx.mega_fetch_count = 16; /* no idea here really... */
7434		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7435		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
7436		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
7437		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
7438		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
7439		vtx.data_format = FMT_32_32_32_32;
7440		vtx.buffer_index_mode = sampler_index_mode;
7441
7442		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7443			return r;
7444		return 0;
7445	}
7446}
7447
7448
7449static int tgsi_tex(struct r600_shader_ctx *ctx)
7450{
7451	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7452	struct r600_bytecode_tex tex;
7453	struct r600_bytecode_tex grad_offs[3];
7454	struct r600_bytecode_alu alu;
7455	unsigned src_gpr;
7456	int r, i, j, n_grad_offs = 0;
7457	int opcode;
7458	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7459				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7460				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7461				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7462
7463	bool txf_add_offsets = inst->Texture.NumOffsets &&
7464			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7465			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7466
7467	/* Texture fetch instructions can only use gprs as source.
7468	 * Also they cannot negate the source or take the absolute value */
7469	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7470                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
7471					     read_compressed_msaa || txf_add_offsets;
7472
7473	boolean src_loaded = FALSE;
7474	unsigned sampler_src_reg = 1;
7475	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7476	boolean has_txq_cube_array_z = false;
7477	unsigned sampler_index_mode;
7478	int array_index_offset_channel = -1;
7479
7480	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7481	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7482	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7483		if (inst->Dst[0].Register.WriteMask & 4) {
7484			ctx->shader->has_txq_cube_array_z_comp = true;
7485			has_txq_cube_array_z = true;
7486		}
7487
7488	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7489	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7490	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7491	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7492		sampler_src_reg = 2;
7493
7494	/* TGSI moves the sampler to src reg 3 for TXD */
7495	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7496		sampler_src_reg = 3;
7497
7498	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7499
7500	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7501
7502	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7503		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7504			if (ctx->bc->chip_class < EVERGREEN)
7505				ctx->shader->uses_tex_buffers = true;
7506			return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7507		}
7508		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7509			if (ctx->bc->chip_class < EVERGREEN)
7510				ctx->shader->uses_tex_buffers = true;
7511			return do_vtx_fetch_inst(ctx, src_requires_loading);
7512		}
7513	}
7514
7515	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7516		int out_chan;
7517		/* Add perspective divide */
7518		if (ctx->bc->chip_class == CAYMAN) {
7519			out_chan = 2;
7520			for (i = 0; i < 3; i++) {
7521				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7522				alu.op = ALU_OP1_RECIP_IEEE;
7523				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7524
7525				alu.dst.sel = ctx->temp_reg;
7526				alu.dst.chan = i;
7527				if (i == 2)
7528					alu.last = 1;
7529				if (out_chan == i)
7530					alu.dst.write = 1;
7531				r = r600_bytecode_add_alu(ctx->bc, &alu);
7532				if (r)
7533					return r;
7534			}
7535
7536		} else {
7537			out_chan = 3;
7538			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7539			alu.op = ALU_OP1_RECIP_IEEE;
7540			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7541
7542			alu.dst.sel = ctx->temp_reg;
7543			alu.dst.chan = out_chan;
7544			alu.last = 1;
7545			alu.dst.write = 1;
7546			r = r600_bytecode_add_alu(ctx->bc, &alu);
7547			if (r)
7548				return r;
7549		}
7550
7551		for (i = 0; i < 3; i++) {
7552			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7553			alu.op = ALU_OP2_MUL;
7554			alu.src[0].sel = ctx->temp_reg;
7555			alu.src[0].chan = out_chan;
7556			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7557			alu.dst.sel = ctx->temp_reg;
7558			alu.dst.chan = i;
7559			alu.dst.write = 1;
7560			r = r600_bytecode_add_alu(ctx->bc, &alu);
7561			if (r)
7562				return r;
7563		}
7564		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7565		alu.op = ALU_OP1_MOV;
7566		alu.src[0].sel = V_SQ_ALU_SRC_1;
7567		alu.src[0].chan = 0;
7568		alu.dst.sel = ctx->temp_reg;
7569		alu.dst.chan = 3;
7570		alu.last = 1;
7571		alu.dst.write = 1;
7572		r = r600_bytecode_add_alu(ctx->bc, &alu);
7573		if (r)
7574			return r;
7575		src_loaded = TRUE;
7576		src_gpr = ctx->temp_reg;
7577	}
7578
7579
7580	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7581	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7582	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7583	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7584	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7585
7586		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7587		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7588
7589		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7590		for (i = 0; i < 4; i++) {
7591			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7592			alu.op = ALU_OP2_CUBE;
7593			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7594			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7595			alu.dst.sel = ctx->temp_reg;
7596			alu.dst.chan = i;
7597			if (i == 3)
7598				alu.last = 1;
7599			alu.dst.write = 1;
7600			r = r600_bytecode_add_alu(ctx->bc, &alu);
7601			if (r)
7602				return r;
7603		}
7604
7605		/* tmp1.z = RCP_e(|tmp1.z|) */
7606		if (ctx->bc->chip_class == CAYMAN) {
7607			for (i = 0; i < 3; i++) {
7608				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7609				alu.op = ALU_OP1_RECIP_IEEE;
7610				alu.src[0].sel = ctx->temp_reg;
7611				alu.src[0].chan = 2;
7612				alu.src[0].abs = 1;
7613				alu.dst.sel = ctx->temp_reg;
7614				alu.dst.chan = i;
7615				if (i == 2)
7616					alu.dst.write = 1;
7617				if (i == 2)
7618					alu.last = 1;
7619				r = r600_bytecode_add_alu(ctx->bc, &alu);
7620				if (r)
7621					return r;
7622			}
7623		} else {
7624			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7625			alu.op = ALU_OP1_RECIP_IEEE;
7626			alu.src[0].sel = ctx->temp_reg;
7627			alu.src[0].chan = 2;
7628			alu.src[0].abs = 1;
7629			alu.dst.sel = ctx->temp_reg;
7630			alu.dst.chan = 2;
7631			alu.dst.write = 1;
7632			alu.last = 1;
7633			r = r600_bytecode_add_alu(ctx->bc, &alu);
7634			if (r)
7635				return r;
7636		}
7637
7638		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
7639		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
7640		 * muladd has no writemask, have to use another temp
7641		 */
7642		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7643		alu.op = ALU_OP3_MULADD;
7644		alu.is_op3 = 1;
7645
7646		alu.src[0].sel = ctx->temp_reg;
7647		alu.src[0].chan = 0;
7648		alu.src[1].sel = ctx->temp_reg;
7649		alu.src[1].chan = 2;
7650
7651		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7652		alu.src[2].chan = 0;
7653		alu.src[2].value = u_bitcast_f2u(1.5f);
7654
7655		alu.dst.sel = ctx->temp_reg;
7656		alu.dst.chan = 0;
7657		alu.dst.write = 1;
7658
7659		r = r600_bytecode_add_alu(ctx->bc, &alu);
7660		if (r)
7661			return r;
7662
7663		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7664		alu.op = ALU_OP3_MULADD;
7665		alu.is_op3 = 1;
7666
7667		alu.src[0].sel = ctx->temp_reg;
7668		alu.src[0].chan = 1;
7669		alu.src[1].sel = ctx->temp_reg;
7670		alu.src[1].chan = 2;
7671
7672		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7673		alu.src[2].chan = 0;
7674		alu.src[2].value = u_bitcast_f2u(1.5f);
7675
7676		alu.dst.sel = ctx->temp_reg;
7677		alu.dst.chan = 1;
7678		alu.dst.write = 1;
7679
7680		alu.last = 1;
7681		r = r600_bytecode_add_alu(ctx->bc, &alu);
7682		if (r)
7683			return r;
7684		/* write initial compare value into Z component
7685		  - W src 0 for shadow cube
7686		  - X src 1 for shadow cube array */
7687		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7688		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7689			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7690			alu.op = ALU_OP1_MOV;
7691			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7692				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7693			else
7694				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7695			alu.dst.sel = ctx->temp_reg;
7696			alu.dst.chan = 2;
7697			alu.dst.write = 1;
7698			alu.last = 1;
7699			r = r600_bytecode_add_alu(ctx->bc, &alu);
7700			if (r)
7701				return r;
7702		}
7703
7704		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7705		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7706			if (ctx->bc->chip_class >= EVERGREEN) {
7707				int mytmp = r600_get_temp(ctx);
7708				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7709				alu.op = ALU_OP1_MOV;
7710				alu.src[0].sel = ctx->temp_reg;
7711				alu.src[0].chan = 3;
7712				alu.dst.sel = mytmp;
7713				alu.dst.chan = 0;
7714				alu.dst.write = 1;
7715				alu.last = 1;
7716				r = r600_bytecode_add_alu(ctx->bc, &alu);
7717				if (r)
7718					return r;
7719
7720				/* Evaluate the array index according to floor(idx + 0.5). This
7721				 * needs to be done before merging the face select value, because
7722				 * otherwise the fractional part of the array index will interfere
7723				 * with the face select value */
7724				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7725				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7726				alu.op = ALU_OP1_RNDNE;
7727				alu.dst.sel = ctx->temp_reg;
7728				alu.dst.chan = 3;
7729				alu.dst.write = 1;
7730				alu.last = 1;
7731				r = r600_bytecode_add_alu(ctx->bc, &alu);
7732				if (r)
7733					return r;
7734
7735				/* Because the array slice index and the cube face index are merged
7736				 * into one value we have to make sure the array slice index is >= 0,
7737				 * otherwise the face selection will fail */
7738				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7739				alu.op = ALU_OP2_MAX;
7740				alu.src[0].sel = ctx->temp_reg;
7741				alu.src[0].chan = 3;
7742				alu.src[1].sel = V_SQ_ALU_SRC_0;
7743				alu.dst.sel = ctx->temp_reg;
7744				alu.dst.chan = 3;
7745				alu.dst.write = 1;
7746				alu.last = 1;
7747				r = r600_bytecode_add_alu(ctx->bc, &alu);
7748				if (r)
7749					return r;
7750
7751				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7752				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7753				alu.op = ALU_OP3_MULADD;
7754				alu.is_op3 = 1;
7755				alu.src[0].sel = ctx->temp_reg;
7756				alu.src[0].chan = 3;
7757				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7758				alu.src[1].chan = 0;
7759				alu.src[1].value = u_bitcast_f2u(8.0f);
7760				alu.src[2].sel = mytmp;
7761				alu.src[2].chan = 0;
7762				alu.dst.sel = ctx->temp_reg;
7763				alu.dst.chan = 3;
7764				alu.dst.write = 1;
7765				alu.last = 1;
7766				r = r600_bytecode_add_alu(ctx->bc, &alu);
7767				if (r)
7768					return r;
7769			} else if (ctx->bc->chip_class < EVERGREEN) {
7770				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7771				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7772				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7773				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7774				tex.src_gpr = r600_get_temp(ctx);
7775				tex.src_sel_x = 0;
7776				tex.src_sel_y = 0;
7777				tex.src_sel_z = 0;
7778				tex.src_sel_w = 0;
7779				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7780				tex.coord_type_x = 1;
7781				tex.coord_type_y = 1;
7782				tex.coord_type_z = 1;
7783				tex.coord_type_w = 1;
7784				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7785				alu.op = ALU_OP1_MOV;
7786				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7787				alu.dst.sel = tex.src_gpr;
7788				alu.dst.chan = 0;
7789				alu.last = 1;
7790				alu.dst.write = 1;
7791				r = r600_bytecode_add_alu(ctx->bc, &alu);
7792				if (r)
7793					return r;
7794
7795				r = r600_bytecode_add_tex(ctx->bc, &tex);
7796				if (r)
7797					return r;
7798			}
7799
7800		}
7801
7802		/* for cube forms of lod and bias we need to route things */
7803		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7804		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7805		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7806		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7807			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7808			alu.op = ALU_OP1_MOV;
7809			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7810			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7811				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7812			else
7813				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7814			alu.dst.sel = ctx->temp_reg;
7815			alu.dst.chan = 2;
7816			alu.last = 1;
7817			alu.dst.write = 1;
7818			r = r600_bytecode_add_alu(ctx->bc, &alu);
7819			if (r)
7820				return r;
7821		}
7822
7823		src_loaded = TRUE;
7824		src_gpr = ctx->temp_reg;
7825	}
7826
7827	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7828		int temp_h = 0, temp_v = 0;
7829		int start_val = 0;
7830
7831		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7832		if (src_loaded == TRUE)
7833			start_val = 1;
7834		else
7835			src_loaded = TRUE;
7836		for (i = start_val; i < 3; i++) {
7837			int treg = r600_get_temp(ctx);
7838
7839			if (i == 0)
7840				src_gpr = treg;
7841			else if (i == 1)
7842				temp_h = treg;
7843			else
7844				temp_v = treg;
7845
7846			for (j = 0; j < 4; j++) {
7847				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7848				alu.op = ALU_OP1_MOV;
7849                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7850                                alu.dst.sel = treg;
7851                                alu.dst.chan = j;
7852                                if (j == 3)
7853                                   alu.last = 1;
7854                                alu.dst.write = 1;
7855                                r = r600_bytecode_add_alu(ctx->bc, &alu);
7856                                if (r)
7857                                    return r;
7858			}
7859		}
7860		for (i = 1; i < 3; i++) {
7861			/* set gradients h/v */
7862			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7863			memset(t, 0, sizeof(struct r600_bytecode_tex));
7864			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7865				FETCH_OP_SET_GRADIENTS_V;
7866			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7867			t->sampler_index_mode = sampler_index_mode;
7868			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7869			t->resource_index_mode = sampler_index_mode;
7870
7871			t->src_gpr = (i == 1) ? temp_h : temp_v;
7872			t->src_sel_x = 0;
7873			t->src_sel_y = 1;
7874			t->src_sel_z = 2;
7875			t->src_sel_w = 3;
7876
7877			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7878			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7879			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7880				t->coord_type_x = 1;
7881				t->coord_type_y = 1;
7882				t->coord_type_z = 1;
7883				t->coord_type_w = 1;
7884			}
7885		}
7886	}
7887
7888	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7889		/* Gather4 should follow the same rules as bilinear filtering, but the hardware
7890		 * incorrectly forces nearest filtering if the texture format is integer.
7891		 * The only effect it has on Gather4, which always returns 4 texels for
7892		 * bilinear filtering, is that the final coordinates are off by 0.5 of
7893		 * the texel size.
7894		 *
7895		 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7896		 * or (0.5 / size) from the normalized coordinates.
7897		 */
7898		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7899		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7900			int treg = r600_get_temp(ctx);
7901
7902			/* mov array and comparison oordinate to temp_reg if needed */
7903			if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7904			     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7905			     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7906				int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7907				for (i = 2; i <= end; i++) {
7908					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7909					alu.op = ALU_OP1_MOV;
7910					alu.dst.sel = ctx->temp_reg;
7911					alu.dst.chan = i;
7912					alu.dst.write = 1;
7913					alu.last = (i == end);
7914					r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7915					r = r600_bytecode_add_alu(ctx->bc, &alu);
7916					if (r)
7917						return r;
7918				}
7919			}
7920
7921			if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
7922			    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
7923				for (i = 0; i < 2; i++) {
7924					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7925					alu.op = ALU_OP2_ADD;
7926					alu.dst.sel = ctx->temp_reg;
7927					alu.dst.chan = i;
7928					alu.dst.write = 1;
7929					alu.last = i == 1;
7930					if (src_loaded) {
7931						alu.src[0].sel = ctx->temp_reg;
7932						alu.src[0].chan = i;
7933					} else
7934						r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7935					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7936					alu.src[1].neg = 1;
7937					r = r600_bytecode_add_alu(ctx->bc, &alu);
7938					if (r)
7939						return r;
7940				}
7941			} else {
7942				/* execute a TXQ */
7943				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7944				tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
7945				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7946				tex.sampler_index_mode = sampler_index_mode;
7947				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7948				tex.resource_index_mode = sampler_index_mode;
7949				tex.dst_gpr = treg;
7950				tex.src_sel_x = 4;
7951				tex.src_sel_y = 4;
7952				tex.src_sel_z = 4;
7953				tex.src_sel_w = 4;
7954				tex.dst_sel_x = 0;
7955				tex.dst_sel_y = 1;
7956				tex.dst_sel_z = 7;
7957				tex.dst_sel_w = 7;
7958				r = r600_bytecode_add_tex(ctx->bc, &tex);
7959				if (r)
7960					return r;
7961
7962				/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
7963				if (ctx->bc->chip_class == CAYMAN) {
7964					/* */
7965					for (i = 0; i < 2; i++) {
7966						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7967						alu.op = ALU_OP1_INT_TO_FLT;
7968						alu.dst.sel = treg;
7969						alu.dst.chan = i;
7970						alu.dst.write = 1;
7971						alu.src[0].sel = treg;
7972						alu.src[0].chan = i;
7973						alu.last = (i == 1) ? 1 : 0;
7974						r = r600_bytecode_add_alu(ctx->bc, &alu);
7975						if (r)
7976							return r;
7977					}
7978					for (j = 0; j < 2; j++) {
7979						for (i = 0; i < 3; i++) {
7980							memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7981							alu.op = ALU_OP1_RECIP_IEEE;
7982							alu.src[0].sel = treg;
7983							alu.src[0].chan = j;
7984							alu.dst.sel = treg;
7985							alu.dst.chan = i;
7986							if (i == 2)
7987								alu.last = 1;
7988							if (i == j)
7989								alu.dst.write = 1;
7990							r = r600_bytecode_add_alu(ctx->bc, &alu);
7991							if (r)
7992								return r;
7993						}
7994					}
7995				} else {
7996					for (i = 0; i < 2; i++) {
7997						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7998						alu.op = ALU_OP1_INT_TO_FLT;
7999						alu.dst.sel = treg;
8000						alu.dst.chan = i;
8001						alu.dst.write = 1;
8002						alu.src[0].sel = treg;
8003						alu.src[0].chan = i;
8004						alu.last = 1;
8005						r = r600_bytecode_add_alu(ctx->bc, &alu);
8006						if (r)
8007							return r;
8008					}
8009					for (i = 0; i < 2; i++) {
8010						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8011						alu.op = ALU_OP1_RECIP_IEEE;
8012						alu.src[0].sel = treg;
8013						alu.src[0].chan = i;
8014						alu.dst.sel = treg;
8015						alu.dst.chan = i;
8016						alu.last = 1;
8017						alu.dst.write = 1;
8018						r = r600_bytecode_add_alu(ctx->bc, &alu);
8019						if (r)
8020							return r;
8021					}
8022				}
8023				for (i = 0; i < 2; i++) {
8024					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8025					alu.op = ALU_OP3_MULADD;
8026					alu.is_op3 = 1;
8027					alu.dst.sel = ctx->temp_reg;
8028					alu.dst.chan = i;
8029					alu.dst.write = 1;
8030					alu.last = i == 1;
8031					alu.src[0].sel = treg;
8032					alu.src[0].chan = i;
8033					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8034					alu.src[1].neg = 1;
8035					if (src_loaded) {
8036						alu.src[2].sel = ctx->temp_reg;
8037						alu.src[2].chan = i;
8038					} else
8039						r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8040					r = r600_bytecode_add_alu(ctx->bc, &alu);
8041					if (r)
8042						return r;
8043				}
8044			}
8045			src_loaded = TRUE;
8046			src_gpr = ctx->temp_reg;
8047		}
8048	}
8049
8050	if (src_requires_loading && !src_loaded) {
8051		for (i = 0; i < 4; i++) {
8052			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8053			alu.op = ALU_OP1_MOV;
8054			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8055			alu.dst.sel = ctx->temp_reg;
8056			alu.dst.chan = i;
8057			if (i == 3)
8058				alu.last = 1;
8059			alu.dst.write = 1;
8060			r = r600_bytecode_add_alu(ctx->bc, &alu);
8061			if (r)
8062				return r;
8063		}
8064		src_loaded = TRUE;
8065		src_gpr = ctx->temp_reg;
8066	}
8067
8068	/* get offset values */
8069	if (inst->Texture.NumOffsets) {
8070		assert(inst->Texture.NumOffsets == 1);
8071
8072		/* The texture offset feature doesn't work with the TXF instruction
8073		 * and must be emulated by adding the offset to the texture coordinates. */
8074		if (txf_add_offsets) {
8075			const struct tgsi_texture_offset *off = inst->TexOffsets;
8076
8077			switch (inst->Texture.Texture) {
8078			case TGSI_TEXTURE_3D:
8079				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8080				alu.op = ALU_OP2_ADD_INT;
8081				alu.src[0].sel = src_gpr;
8082				alu.src[0].chan = 2;
8083				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8084				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8085				alu.dst.sel = src_gpr;
8086				alu.dst.chan = 2;
8087				alu.dst.write = 1;
8088				alu.last = 1;
8089				r = r600_bytecode_add_alu(ctx->bc, &alu);
8090				if (r)
8091					return r;
8092				/* fall through */
8093
8094			case TGSI_TEXTURE_2D:
8095			case TGSI_TEXTURE_SHADOW2D:
8096			case TGSI_TEXTURE_RECT:
8097			case TGSI_TEXTURE_SHADOWRECT:
8098			case TGSI_TEXTURE_2D_ARRAY:
8099			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8100				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8101				alu.op = ALU_OP2_ADD_INT;
8102				alu.src[0].sel = src_gpr;
8103				alu.src[0].chan = 1;
8104				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8105				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8106				alu.dst.sel = src_gpr;
8107				alu.dst.chan = 1;
8108				alu.dst.write = 1;
8109				alu.last = 1;
8110				r = r600_bytecode_add_alu(ctx->bc, &alu);
8111				if (r)
8112					return r;
8113				/* fall through */
8114
8115			case TGSI_TEXTURE_1D:
8116			case TGSI_TEXTURE_SHADOW1D:
8117			case TGSI_TEXTURE_1D_ARRAY:
8118			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8119				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8120				alu.op = ALU_OP2_ADD_INT;
8121				alu.src[0].sel = src_gpr;
8122				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8123				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8124				alu.dst.sel = src_gpr;
8125				alu.dst.write = 1;
8126				alu.last = 1;
8127				r = r600_bytecode_add_alu(ctx->bc, &alu);
8128				if (r)
8129					return r;
8130				break;
8131				/* texture offsets do not apply to other texture targets */
8132			}
8133		} else {
8134			switch (inst->Texture.Texture) {
8135			case TGSI_TEXTURE_3D:
8136				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8137				/* fallthrough */
8138			case TGSI_TEXTURE_2D:
8139			case TGSI_TEXTURE_SHADOW2D:
8140			case TGSI_TEXTURE_RECT:
8141			case TGSI_TEXTURE_SHADOWRECT:
8142			case TGSI_TEXTURE_2D_ARRAY:
8143			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8144				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8145				/* fallthrough */
8146			case TGSI_TEXTURE_1D:
8147			case TGSI_TEXTURE_SHADOW1D:
8148			case TGSI_TEXTURE_1D_ARRAY:
8149			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8150				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8151			}
8152		}
8153	}
8154
8155	/* Obtain the sample index for reading a compressed MSAA color texture.
8156	 * To read the FMASK, we use the ldfptr instruction, which tells us
8157	 * where the samples are stored.
8158	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8159	 * which is the identity mapping. Each nibble says which physical sample
8160	 * should be fetched to get that sample.
8161	 *
8162	 * Assume src.z contains the sample index. It should be modified like this:
8163	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8164	 * Then fetch the texel with src.
8165	 */
8166	if (read_compressed_msaa) {
8167		unsigned sample_chan = 3;
8168		unsigned temp = r600_get_temp(ctx);
8169		assert(src_loaded);
8170
8171		/* temp.w = ldfptr() */
8172		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8173		tex.op = FETCH_OP_LD;
8174		tex.inst_mod = 1; /* to indicate this is ldfptr */
8175		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8176		tex.sampler_index_mode = sampler_index_mode;
8177		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8178		tex.resource_index_mode = sampler_index_mode;
8179		tex.src_gpr = src_gpr;
8180		tex.dst_gpr = temp;
8181		tex.dst_sel_x = 7; /* mask out these components */
8182		tex.dst_sel_y = 7;
8183		tex.dst_sel_z = 7;
8184		tex.dst_sel_w = 0; /* store X */
8185		tex.src_sel_x = 0;
8186		tex.src_sel_y = 1;
8187		tex.src_sel_z = 2;
8188		tex.src_sel_w = 3;
8189		tex.offset_x = offset_x;
8190		tex.offset_y = offset_y;
8191		tex.offset_z = offset_z;
8192		r = r600_bytecode_add_tex(ctx->bc, &tex);
8193		if (r)
8194			return r;
8195
8196		/* temp.x = sample_index*4 */
8197		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8198		alu.op = ALU_OP2_MULLO_INT;
8199		alu.src[0].sel = src_gpr;
8200		alu.src[0].chan = sample_chan;
8201		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8202		alu.src[1].value = 4;
8203		alu.dst.sel = temp;
8204		alu.dst.chan = 0;
8205		alu.dst.write = 1;
8206		r = emit_mul_int_op(ctx->bc, &alu);
8207		if (r)
8208			return r;
8209
8210		/* sample_index = temp.w >> temp.x */
8211		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8212		alu.op = ALU_OP2_LSHR_INT;
8213		alu.src[0].sel = temp;
8214		alu.src[0].chan = 3;
8215		alu.src[1].sel = temp;
8216		alu.src[1].chan = 0;
8217		alu.dst.sel = src_gpr;
8218		alu.dst.chan = sample_chan;
8219		alu.dst.write = 1;
8220		alu.last = 1;
8221		r = r600_bytecode_add_alu(ctx->bc, &alu);
8222		if (r)
8223			return r;
8224
8225		/* sample_index & 0xF */
8226		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8227		alu.op = ALU_OP2_AND_INT;
8228		alu.src[0].sel = src_gpr;
8229		alu.src[0].chan = sample_chan;
8230		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8231		alu.src[1].value = 0xF;
8232		alu.dst.sel = src_gpr;
8233		alu.dst.chan = sample_chan;
8234		alu.dst.write = 1;
8235		alu.last = 1;
8236		r = r600_bytecode_add_alu(ctx->bc, &alu);
8237		if (r)
8238			return r;
8239#if 0
8240		/* visualize the FMASK */
8241		for (i = 0; i < 4; i++) {
8242			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8243			alu.op = ALU_OP1_INT_TO_FLT;
8244			alu.src[0].sel = src_gpr;
8245			alu.src[0].chan = sample_chan;
8246			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8247			alu.dst.chan = i;
8248			alu.dst.write = 1;
8249			alu.last = 1;
8250			r = r600_bytecode_add_alu(ctx->bc, &alu);
8251			if (r)
8252				return r;
8253		}
8254		return 0;
8255#endif
8256	}
8257
8258	/* does this shader want a num layers from TXQ for a cube array? */
8259	if (has_txq_cube_array_z) {
8260		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8261
8262		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8263		alu.op = ALU_OP1_MOV;
8264
8265		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8266		if (ctx->bc->chip_class >= EVERGREEN) {
8267			/* with eg each dword is number of cubes */
8268			alu.src[0].sel += id / 4;
8269			alu.src[0].chan = id % 4;
8270		} else {
8271			/* r600 we have them at channel 2 of the second dword */
8272			alu.src[0].sel += (id * 2) + 1;
8273			alu.src[0].chan = 2;
8274		}
8275		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8276		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8277		alu.last = 1;
8278		r = r600_bytecode_add_alu(ctx->bc, &alu);
8279		if (r)
8280			return r;
8281		/* disable writemask from texture instruction */
8282		inst->Dst[0].Register.WriteMask &= ~4;
8283	}
8284
8285	opcode = ctx->inst_info->op;
8286	if (opcode == FETCH_OP_GATHER4 &&
8287		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8288		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8289		struct r600_bytecode_tex *t;
8290		opcode = FETCH_OP_GATHER4_O;
8291
8292		/* GATHER4_O/GATHER4_C_O use offset values loaded by
8293		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
8294		   encoded in the instruction are ignored. */
8295		t = &grad_offs[n_grad_offs++];
8296		memset(t, 0, sizeof(struct r600_bytecode_tex));
8297		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8298		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8299		t->sampler_index_mode = sampler_index_mode;
8300		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8301		t->resource_index_mode = sampler_index_mode;
8302
8303		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8304		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8305		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8306		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8307			 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
8308			/* make sure array index selector is 0, this is just a safety
8309			 * precausion because TGSI seems to emit something strange here */
8310			t->src_sel_z = 4;
8311		else
8312			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8313
8314		t->src_sel_w = 4;
8315
8316		t->dst_sel_x = 7;
8317		t->dst_sel_y = 7;
8318		t->dst_sel_z = 7;
8319		t->dst_sel_w = 7;
8320	}
8321
8322	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8323	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8324	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8325	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8326	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8327	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8328	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8329		switch (opcode) {
8330		case FETCH_OP_SAMPLE:
8331			opcode = FETCH_OP_SAMPLE_C;
8332			break;
8333		case FETCH_OP_SAMPLE_L:
8334			opcode = FETCH_OP_SAMPLE_C_L;
8335			break;
8336		case FETCH_OP_SAMPLE_LB:
8337			opcode = FETCH_OP_SAMPLE_C_LB;
8338			break;
8339		case FETCH_OP_SAMPLE_G:
8340			opcode = FETCH_OP_SAMPLE_C_G;
8341			break;
8342		/* Texture gather variants */
8343		case FETCH_OP_GATHER4:
8344			opcode = FETCH_OP_GATHER4_C;
8345			break;
8346		case FETCH_OP_GATHER4_O:
8347			opcode = FETCH_OP_GATHER4_C_O;
8348			break;
8349		}
8350	}
8351
8352	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8353	tex.op = opcode;
8354
8355	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8356	tex.sampler_index_mode = sampler_index_mode;
8357	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8358	tex.resource_index_mode = sampler_index_mode;
8359	tex.src_gpr = src_gpr;
8360	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8361
8362	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8363		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8364		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8365	}
8366
8367	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8368		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8369		tex.inst_mod = texture_component_select;
8370
8371		if (ctx->bc->chip_class == CAYMAN) {
8372			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8373			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8374			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8375			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8376		} else {
8377			/* GATHER4 result order is different from TGSI TG4 */
8378			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8379			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8380			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8381			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8382		}
8383	}
8384	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8385		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8386		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8387		tex.dst_sel_z = 7;
8388		tex.dst_sel_w = 7;
8389	}
8390	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8391		tex.dst_sel_x = 3;
8392		tex.dst_sel_y = 7;
8393		tex.dst_sel_z = 7;
8394		tex.dst_sel_w = 7;
8395	}
8396	else {
8397		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8398		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8399		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8400		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8401	}
8402
8403
8404	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8405		tex.src_sel_x = 4;
8406		tex.src_sel_y = 4;
8407		tex.src_sel_z = 4;
8408		tex.src_sel_w = 4;
8409	} else if (src_loaded) {
8410		tex.src_sel_x = 0;
8411		tex.src_sel_y = 1;
8412		tex.src_sel_z = 2;
8413		tex.src_sel_w = 3;
8414	} else {
8415		tex.src_sel_x = ctx->src[0].swizzle[0];
8416		tex.src_sel_y = ctx->src[0].swizzle[1];
8417		tex.src_sel_z = ctx->src[0].swizzle[2];
8418		tex.src_sel_w = ctx->src[0].swizzle[3];
8419		tex.src_rel = ctx->src[0].rel;
8420	}
8421
8422	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8423	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8424	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8425	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8426		tex.src_sel_x = 1;
8427		tex.src_sel_y = 0;
8428		tex.src_sel_z = 3;
8429		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8430	}
8431
8432	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8433	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8434		tex.coord_type_x = 1;
8435		tex.coord_type_y = 1;
8436	}
8437	tex.coord_type_z = 1;
8438	tex.coord_type_w = 1;
8439
8440	tex.offset_x = offset_x;
8441	tex.offset_y = offset_y;
8442	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8443		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8444		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8445		tex.offset_z = 0;
8446	}
8447	else {
8448		tex.offset_z = offset_z;
8449	}
8450
8451	/* Put the depth for comparison in W.
8452	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8453	 * Some instructions expect the depth in Z. */
8454	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8455	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8456	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8457	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8458	    opcode != FETCH_OP_SAMPLE_C_L &&
8459	    opcode != FETCH_OP_SAMPLE_C_LB) {
8460		tex.src_sel_w = tex.src_sel_z;
8461	}
8462
8463	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8464	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8465		if (opcode == FETCH_OP_SAMPLE_C_L ||
8466		    opcode == FETCH_OP_SAMPLE_C_LB) {
8467			/* the array index is read from Y */
8468			tex.coord_type_y = 0;
8469			array_index_offset_channel = tex.src_sel_y;
8470		} else {
8471			/* the array index is read from Z */
8472			tex.coord_type_z = 0;
8473			tex.src_sel_z = tex.src_sel_y;
8474			array_index_offset_channel = tex.src_sel_z;
8475		}
8476	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8477		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8478		tex.coord_type_z = 0;
8479		array_index_offset_channel = tex.src_sel_z;
8480	} else if  ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8481		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8482		    (ctx->bc->chip_class >= EVERGREEN))
8483		/* the array index is read from Z, coordinate will be corrected elsewhere  */
8484		tex.coord_type_z = 0;
8485
8486	/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8487	 * evaluate the array index  */
8488	if (array_index_offset_channel >= 0 &&
8489		 opcode != FETCH_OP_LD &&
8490		 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8491		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8492		alu.src[0].sel =  tex.src_gpr;
8493		alu.src[0].chan =  array_index_offset_channel;
8494		alu.src[0].rel = tex.src_rel;
8495		alu.op = ALU_OP1_RNDNE;
8496		alu.dst.sel = tex.src_gpr;
8497		alu.dst.chan = array_index_offset_channel;
8498		alu.dst.rel = tex.src_rel;
8499		alu.dst.write = 1;
8500		alu.last = 1;
8501		r = r600_bytecode_add_alu(ctx->bc, &alu);
8502		if (r)
8503			return r;
8504	}
8505
8506	/* mask unused source components */
8507	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8508		switch (inst->Texture.Texture) {
8509		case TGSI_TEXTURE_2D:
8510		case TGSI_TEXTURE_RECT:
8511			tex.src_sel_z = 7;
8512			tex.src_sel_w = 7;
8513			break;
8514		case TGSI_TEXTURE_1D_ARRAY:
8515			tex.src_sel_y = 7;
8516			tex.src_sel_w = 7;
8517			break;
8518		case TGSI_TEXTURE_1D:
8519			tex.src_sel_y = 7;
8520			tex.src_sel_z = 7;
8521			tex.src_sel_w = 7;
8522			break;
8523		}
8524	}
8525
8526	/* Emit set gradient and offset instructions. */
8527	for (i = 0; i < n_grad_offs; ++i) {
8528		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8529		if (r)
8530			return r;
8531	}
8532
8533	r = r600_bytecode_add_tex(ctx->bc, &tex);
8534	if (r)
8535		return r;
8536
8537	/* add shadow ambient support  - gallium doesn't do it yet */
8538	return 0;
8539}
8540
8541static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8542				  struct tgsi_full_src_register *src)
8543{
8544	unsigned i;
8545
8546	if (src->Register.Indirect) {
8547		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8548			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8549				return ctx->shader->atomics[i].hw_idx;
8550		}
8551	} else {
8552		uint32_t index = src->Register.Index;
8553		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8554			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8555				continue;
8556			if (index > ctx->shader->atomics[i].end)
8557				continue;
8558			if (index < ctx->shader->atomics[i].start)
8559				continue;
8560			uint32_t offset = (index - ctx->shader->atomics[i].start);
8561			return ctx->shader->atomics[i].hw_idx + offset;
8562		}
8563	}
8564	assert(0);
8565	return -1;
8566}
8567
/* Resolve the GDS uav id and index mode for the atomic-counter operand in
 * Src[0].  On Cayman the uav id is not encoded in the GDS instruction, so
 * the counter's byte offset (uav_id * 4, plus the indirect index shifted to
 * bytes when present) is written to ctx->temp_reg.x instead.
 * Returns 0 on success or a bytecode-emission error. */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp.x = indirect index << 2 (counter index -> byte offset) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp.x += uav_id * 4 (base byte offset of the range) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			/* non-Cayman: let the GDS instruction use relative
			 * indexing instead of a precomputed address */
			uav_index_mode = 2;
	} else if (is_cm) {
		/* direct access on Cayman: temp.x = constant byte offset */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}
8615
/* Implement TGSI LOAD from a hardware atomic counter via a GDS read.
 * tgsi_set_gds_temp() computes the counter address; on Cayman the address is
 * taken from ctx->temp_reg.x, on other chips the uav id / index mode are
 * encoded directly in the GDS instruction. */
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* Cayman encodes the counter address in the source GPR instead. */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	/* src_sel 4 masks the component; Cayman reads the address from .x */
	gds.src_sel_x = (is_cm) ? 0 : 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	/* only .x of the result is kept (dst_sel 7 = masked) */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}
8651
8652/* this fixes up 1D arrays properly */
8653static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8654{
8655	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8656	int r, i;
8657	struct r600_bytecode_alu alu;
8658	int temp_reg = r600_get_temp(ctx);
8659
8660	for (i = 0; i < 4; i++) {
8661		bool def_val = true, write_zero = false;
8662		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8663		alu.op = ALU_OP1_MOV;
8664		alu.dst.sel = temp_reg;
8665		alu.dst.chan = i;
8666
8667		switch (inst->Memory.Texture) {
8668		case TGSI_TEXTURE_BUFFER:
8669		case TGSI_TEXTURE_1D:
8670			if (i == 1 || i == 2 || i == 3) {
8671				write_zero = true;
8672			}
8673			break;
8674		case TGSI_TEXTURE_1D_ARRAY:
8675			if (i == 1 || i == 3)
8676				write_zero = true;
8677			else if (i == 2) {
8678				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8679				def_val = false;
8680			}
8681			break;
8682		case TGSI_TEXTURE_2D:
8683			if (i == 2 || i == 3)
8684				write_zero = true;
8685			break;
8686		default:
8687			if (i == 3)
8688				write_zero = true;
8689			break;
8690		}
8691
8692		if (write_zero) {
8693			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8694			alu.src[0].value = 0;
8695		} else if (def_val) {
8696			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8697		}
8698
8699		if (i == 3)
8700			alu.last = 1;
8701		alu.dst.write = 1;
8702		r = r600_bytecode_add_alu(ctx->bc, &alu);
8703		if (r)
8704			return r;
8705	}
8706	*idx_gpr = temp_reg;
8707	return 0;
8708}
8709
8710static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8711			     int temp_reg)
8712{
8713	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8714	int r;
8715	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8716		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8717		r = single_alu_op2(ctx, ALU_OP1_MOV,
8718				   temp_reg, 0,
8719				   V_SQ_ALU_SRC_LITERAL, value >> 2,
8720				   0, 0);
8721		if (r)
8722			return r;
8723	} else {
8724		struct r600_bytecode_alu alu;
8725		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8726		alu.op = ALU_OP2_LSHR_INT;
8727		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8728		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8729		alu.src[1].value = 2;
8730		alu.dst.sel = temp_reg;
8731		alu.dst.write = 1;
8732		alu.last = 1;
8733		r = r600_bytecode_add_alu(ctx->bc, &alu);
8734		if (r)
8735			return r;
8736	}
8737	return 0;
8738}
8739
8740static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8741{
8742	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8743	/* have to work out the offset into the RAT immediate return buffer */
8744	struct r600_bytecode_vtx vtx;
8745	struct r600_bytecode_cf *cf;
8746	int r;
8747	int temp_reg = r600_get_temp(ctx);
8748	unsigned rat_index_mode;
8749	unsigned base;
8750
8751	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8752	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8753
8754	r = load_buffer_coord(ctx, 1, temp_reg);
8755	if (r)
8756		return r;
8757	ctx->bc->cf_last->barrier = 1;
8758	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8759	vtx.op = FETCH_OP_VFETCH;
8760	vtx.buffer_id = inst->Src[0].Register.Index + base;
8761	vtx.buffer_index_mode = rat_index_mode;
8762	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8763	vtx.src_gpr = temp_reg;
8764	vtx.src_sel_x = 0;
8765	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8766	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
8767	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
8768	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
8769	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
8770	vtx.num_format_all = 1;
8771	vtx.format_comp_all = 1;
8772	vtx.srf_mode_all = 0;
8773
8774	if (inst->Dst[0].Register.WriteMask & 8) {
8775		vtx.data_format = FMT_32_32_32_32;
8776		vtx.use_const_fields = 0;
8777	} else if (inst->Dst[0].Register.WriteMask & 4) {
8778		vtx.data_format = FMT_32_32_32;
8779		vtx.use_const_fields = 0;
8780	} else if (inst->Dst[0].Register.WriteMask & 2) {
8781		vtx.data_format = FMT_32_32;
8782		vtx.use_const_fields = 0;
8783	} else {
8784		vtx.data_format = FMT_32;
8785		vtx.use_const_fields = 0;
8786	}
8787
8788	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8789	if (r)
8790		return r;
8791	cf = ctx->bc->cf_last;
8792	cf->barrier = 1;
8793	return 0;
8794}
8795
/* Implement TGSI LOAD from an image (RAT): issue a NOP_RTN RAT memory op
 * whose result lands in the immediate return buffer, wait for the ack, then
 * vfetch the texel from that buffer with the image format's swizzle and
 * conversion into the destination GPR. */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* normalize the image coordinate into a temp (idx_gpr) */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* RAT read-with-return: no data is modified, the texel is written to
	 * the immediate return buffer for this thread */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* wait for the RAT return data before fetching it */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	/* fetch the returned texel from the immediate buffer, applying the
	 * image format's component swizzle */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	/* index taken from thread_id_gpr.y -- TODO confirm which component
	 * holds the per-thread return-buffer slot */
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
8868
8869static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8870{
8871	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8872	struct r600_bytecode_alu alu;
8873	int r;
8874	int temp_reg = r600_get_temp(ctx);
8875
8876	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8877	alu.op = ALU_OP1_MOV;
8878	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8879	alu.dst.sel = temp_reg;
8880	alu.dst.write = 1;
8881	alu.last = 1;
8882	r = r600_bytecode_add_alu(ctx->bc, &alu);
8883	if (r)
8884		return r;
8885
8886	r = do_lds_fetch_values(ctx, temp_reg,
8887				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8888	if (r)
8889		return r;
8890	return 0;
8891}
8892
8893static int tgsi_load(struct r600_shader_ctx *ctx)
8894{
8895	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8896	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8897		return tgsi_load_rat(ctx);
8898	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8899		return tgsi_load_gds(ctx);
8900	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8901		return tgsi_load_buffer(ctx);
8902	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8903		return tgsi_load_lds(ctx);
8904	return 0;
8905}
8906
/* Implement TGSI STORE to a buffer resource.  The RAT store writes one
 * element at a time, so each enabled destination component is emitted as a
 * separate STORE_TYPED: the element index is the base dword coordinate plus
 * the component number, and only .x of the value register is written. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* treg2.x = destination byte offset converted to a dword index */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* clear all four channels of temp_reg so the unused address
	 * components of the export are zero */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* temp_reg.x = element index of component i (base + i) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* ctx->temp_reg.x = the value to store for this component */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* emit one single-component RAT store */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* buffers live after the images in the RAT space */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;	/* only .x is written */
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}
8980
/* Implement TGSI STORE to an image (RAT) with a single STORE_TYPED export:
 * the coordinate is normalized by load_index_src() and the value is copied
 * into ctx->temp_reg unless it already lives in a plain temporary GPR. */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	/* normalize the image coordinate into a temp (idx_gpr) */
	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* the export reads a raw GPR, so non-temporary sources (immediates,
	 * inputs, constants) must first be copied to ctx->temp_reg */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;	/* write all four components */
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}
9037
/* Implement TGSI STORE to shared memory (LDS).
 * temp_reg.x receives the base byte address (src0.x); channels 1..lasti get
 * base + 4*i for each enabled component.  Adjacent enabled component pairs
 * (xy or zw) are combined into one LDS_WRITE_REL writing two dwords; the
 * remaining components use single LDS_WRITE ops. */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp_reg.[i] = base address + 4*i for each remaining component */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* xy or zw both enabled: write two consecutive dwords at once */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			/* src0 = address, src1/src2 = the two values */
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			/* second value lands one dword past the address --
			 * NOTE(review): assumed from lds_idx = 1; confirm */
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* component i+1 was covered by this write */
			continue;
		}
		/* single-dword write: src0 = address, src1 = value */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9106
9107static int tgsi_store(struct r600_shader_ctx *ctx)
9108{
9109	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9110	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9111		return tgsi_store_buffer_rat(ctx);
9112	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9113		return tgsi_store_lds(ctx);
9114	else
9115		return tgsi_store_rat(ctx);
9116}
9117
/* Emit an atomic operation on a RAT (image or SSBO).
 * The operand(s) are staged in ctx->thread_id_gpr, a MEM_RAT CF
 * instruction performs the atomic, and the pre-op value is then read
 * back from the immediate return buffer with a vertex fetch. */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	/* Buffer (SSBO) RATs are allocated after all image RATs, so bias
	 * both the immediate-return resource id and the RAT id by the
	 * image count. */
        if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG needs two operands: src[3] goes to channel 0 and
		 * src[2] to the high channel (2 on cayman, 3 otherwise).
		 * NOTE(review): presumably chan 0 carries the swap value
		 * and the high channel the compare value — confirm against
		 * the RAT CMPXCHG operand layout in the ISA docs. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* Single-operand atomics: stage the value in channel 0. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Issue the atomic as a MEM_RAT CF instruction; mark=1 pairs with
	 * the CF_OP_WAIT_ACK emitted just below — NOTE(review): presumably
	 * to wait until the atomic's return data has landed; confirm. */
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	/* Fetch the pre-op value back from the immediate return buffer.
	 * Images read back in their declared format; buffers are raw
	 * 32-bit.  Only the .x result is kept (y/z/w masked with 7). */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
9251
9252static int get_gds_op(int opcode)
9253{
9254	switch (opcode) {
9255	case TGSI_OPCODE_ATOMUADD:
9256		return FETCH_OP_GDS_ADD_RET;
9257	case TGSI_OPCODE_ATOMAND:
9258		return FETCH_OP_GDS_AND_RET;
9259	case TGSI_OPCODE_ATOMOR:
9260		return FETCH_OP_GDS_OR_RET;
9261	case TGSI_OPCODE_ATOMXOR:
9262		return FETCH_OP_GDS_XOR_RET;
9263	case TGSI_OPCODE_ATOMUMIN:
9264		return FETCH_OP_GDS_MIN_UINT_RET;
9265	case TGSI_OPCODE_ATOMUMAX:
9266		return FETCH_OP_GDS_MAX_UINT_RET;
9267	case TGSI_OPCODE_ATOMXCHG:
9268		return FETCH_OP_GDS_XCHG_RET;
9269	case TGSI_OPCODE_ATOMCAS:
9270		return FETCH_OP_GDS_CMP_XCHG_RET;
9271	default:
9272		return -1;
9273	}
9274}
9275
/* Emit an atomic on a HW atomic counter, which lives in GDS.  The
 * operand(s) are staged in ctx->temp_reg; the channel layout and the
 * UAV alloc/consume mechanism differ between evergreen and cayman. */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	/* CMP_XCHG needs a second operand: stage src[3] in the channel
	 * after the value channel (2 on cayman, 1 on evergreen).
	 * NOTE(review): presumably src[3] is the swap value with src[2]
	 * below as the compare — confirm GDS_CMP_XCHG operand order. */
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	/* Stage the value operand.  An ATOMUADD of a negative immediate is
	 * rewritten as a GDS SUB of its absolute value. */
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	/* Cayman dropped the alloc/consume UAV mechanism: uav fields are
	 * forced to 0 and operands simply start at channel 0/1/2, while
	 * evergreen reads the value from sel 4.  Only dst.x (the pre-op
	 * value) is kept; y/z/w are masked with 7. */
	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	gds.src_sel_x = is_cm ? 0 : 4;
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
9378
9379static int get_lds_op(int opcode)
9380{
9381	switch (opcode) {
9382	case TGSI_OPCODE_ATOMUADD:
9383		return LDS_OP2_LDS_ADD_RET;
9384	case TGSI_OPCODE_ATOMAND:
9385		return LDS_OP2_LDS_AND_RET;
9386	case TGSI_OPCODE_ATOMOR:
9387		return LDS_OP2_LDS_OR_RET;
9388	case TGSI_OPCODE_ATOMXOR:
9389		return LDS_OP2_LDS_XOR_RET;
9390	case TGSI_OPCODE_ATOMUMIN:
9391		return LDS_OP2_LDS_MIN_UINT_RET;
9392	case TGSI_OPCODE_ATOMUMAX:
9393		return LDS_OP2_LDS_MAX_UINT_RET;
9394	case TGSI_OPCODE_ATOMIMIN:
9395		return LDS_OP2_LDS_MIN_INT_RET;
9396	case TGSI_OPCODE_ATOMIMAX:
9397		return LDS_OP2_LDS_MAX_INT_RET;
9398	case TGSI_OPCODE_ATOMXCHG:
9399		return LDS_OP2_LDS_XCHG_RET;
9400	case TGSI_OPCODE_ATOMCAS:
9401		return LDS_OP3_LDS_CMP_XCHG_RET;
9402	default:
9403		return -1;
9404	}
9405}
9406
9407static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9408{
9409	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9410	int lds_op = get_lds_op(inst->Instruction.Opcode);
9411	int r;
9412
9413	struct r600_bytecode_alu alu;
9414	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9415	alu.op = lds_op;
9416	alu.is_lds_idx_op = true;
9417	alu.last = 1;
9418	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9419	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9420	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9421		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9422	else
9423		alu.src[2].sel = V_SQ_ALU_SRC_0;
9424	r = r600_bytecode_add_alu(ctx->bc, &alu);
9425	if (r)
9426		return r;
9427
9428	/* then read from LDS_OQ_A_POP */
9429	memset(&alu, 0, sizeof(alu));
9430
9431	alu.op = ALU_OP1_MOV;
9432	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9433	alu.src[0].chan = 0;
9434	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9435	alu.dst.write = 1;
9436	alu.last = 1;
9437	r = r600_bytecode_add_alu(ctx->bc, &alu);
9438	if (r)
9439		return r;
9440
9441	return 0;
9442}
9443
9444static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9445{
9446	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9447	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9448		return tgsi_atomic_op_rat(ctx);
9449	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9450		return tgsi_atomic_op_gds(ctx);
9451	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9452		return tgsi_atomic_op_rat(ctx);
9453	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9454		return tgsi_atomic_op_lds(ctx);
9455	return 0;
9456}
9457
/* RESQ: query the size of an image/buffer resource.  Buffers (and
 * buffer images) are answered from a driver constant buffer; real
 * textures use a TXQ-style texture instruction. */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->chip_class < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		/* SSBO resources follow the image resources. */
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	/* Cube arrays report their layer count (the .z result) from a
	 * driver constant buffer rather than from TXQ. */
	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	/* Emit the TXQ-style fetch; dst_sel_* mask out (7) every channel
	 * not requested by the writemask. */
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
9530
/* LRP: dst = src0 * src1 + (1 - src0) * src2, per channel.
 * Computed as tmp = (1 - src0) * src2 followed by
 * MULADD(src0, src1, tmp), with a fast path for src0 == 0.5. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[2][4];
	unsigned i;
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			/* (src1 + src2) / 2 — omod = 3 is the divide-by-two
			 * output modifier on the ADD result. */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */

	/* Copy src0/src1 into forms legal for 3-operand ALU slots.
	 * NOTE(review): presumably tgsi_make_src_for_op3 works around
	 * op3 source restrictions — see its definition. */
	for (i = 0; i < 2; i++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[i], &ctx->src[i]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[1][i];
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9638
/* CMP: dst = (src0 < 0.0) ? src1 : src2, per channel.  Implemented as
 * CNDGE(src0, src2, src1), i.e. src0 >= 0 ? src2 : src1. */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[3][4];

	unsigned op;

	/* With abs+neg the operand is -|x|, which is >= 0 only for x == 0;
	 * that is exactly CNDE (compare-with-zero) on the unmodified
	 * source, so drop the modifiers and switch opcodes. */
	if (ctx->src[0].abs && ctx->src[0].neg) {
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* Copy all sources into forms legal for 3-operand ALU slots. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		/* Note the deliberate swap: src[1] takes TGSI src2 and
		 * src[2] takes TGSI src1, inverting the TGSI condition
		 * to match CNDGE/CNDE semantics. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9686
9687static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9688{
9689	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9690	struct r600_bytecode_alu alu;
9691	int i, r;
9692	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9693
9694	for (i = 0; i < lasti + 1; i++) {
9695		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9696			continue;
9697
9698		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9699		alu.op = ALU_OP3_CNDE_INT;
9700		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9701		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9702		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9703		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9704		alu.dst.chan = i;
9705		alu.dst.write = 1;
9706		alu.is_op3 = 1;
9707		if (i == lasti)
9708			alu.last = 1;
9709		r = r600_bytecode_add_alu(ctx->bc, &alu);
9710		if (r)
9711			return r;
9712	}
9713	return 0;
9714}
9715
9716static int tgsi_exp(struct r600_shader_ctx *ctx)
9717{
9718	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9719	struct r600_bytecode_alu alu;
9720	int r;
9721	unsigned i;
9722
9723	/* result.x = 2^floor(src); */
9724	if (inst->Dst[0].Register.WriteMask & 1) {
9725		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9726
9727		alu.op = ALU_OP1_FLOOR;
9728		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9729
9730		alu.dst.sel = ctx->temp_reg;
9731		alu.dst.chan = 0;
9732		alu.dst.write = 1;
9733		alu.last = 1;
9734		r = r600_bytecode_add_alu(ctx->bc, &alu);
9735		if (r)
9736			return r;
9737
9738		if (ctx->bc->chip_class == CAYMAN) {
9739			for (i = 0; i < 3; i++) {
9740				alu.op = ALU_OP1_EXP_IEEE;
9741				alu.src[0].sel = ctx->temp_reg;
9742				alu.src[0].chan = 0;
9743
9744				alu.dst.sel = ctx->temp_reg;
9745				alu.dst.chan = i;
9746				alu.dst.write = i == 0;
9747				alu.last = i == 2;
9748				r = r600_bytecode_add_alu(ctx->bc, &alu);
9749				if (r)
9750					return r;
9751			}
9752		} else {
9753			alu.op = ALU_OP1_EXP_IEEE;
9754			alu.src[0].sel = ctx->temp_reg;
9755			alu.src[0].chan = 0;
9756
9757			alu.dst.sel = ctx->temp_reg;
9758			alu.dst.chan = 0;
9759			alu.dst.write = 1;
9760			alu.last = 1;
9761			r = r600_bytecode_add_alu(ctx->bc, &alu);
9762			if (r)
9763				return r;
9764		}
9765	}
9766
9767	/* result.y = tmp - floor(tmp); */
9768	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9769		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9770
9771		alu.op = ALU_OP1_FRACT;
9772		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9773
9774		alu.dst.sel = ctx->temp_reg;
9775#if 0
9776		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9777		if (r)
9778			return r;
9779#endif
9780		alu.dst.write = 1;
9781		alu.dst.chan = 1;
9782
9783		alu.last = 1;
9784
9785		r = r600_bytecode_add_alu(ctx->bc, &alu);
9786		if (r)
9787			return r;
9788	}
9789
9790	/* result.z = RoughApprox2ToX(tmp);*/
9791	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9792		if (ctx->bc->chip_class == CAYMAN) {
9793			for (i = 0; i < 3; i++) {
9794				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9795				alu.op = ALU_OP1_EXP_IEEE;
9796				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9797
9798				alu.dst.sel = ctx->temp_reg;
9799				alu.dst.chan = i;
9800				if (i == 2) {
9801					alu.dst.write = 1;
9802					alu.last = 1;
9803				}
9804
9805				r = r600_bytecode_add_alu(ctx->bc, &alu);
9806				if (r)
9807					return r;
9808			}
9809		} else {
9810			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9811			alu.op = ALU_OP1_EXP_IEEE;
9812			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9813
9814			alu.dst.sel = ctx->temp_reg;
9815			alu.dst.write = 1;
9816			alu.dst.chan = 2;
9817
9818			alu.last = 1;
9819
9820			r = r600_bytecode_add_alu(ctx->bc, &alu);
9821			if (r)
9822				return r;
9823		}
9824	}
9825
9826	/* result.w = 1.0;*/
9827	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9828		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9829
9830		alu.op = ALU_OP1_MOV;
9831		alu.src[0].sel = V_SQ_ALU_SRC_1;
9832		alu.src[0].chan = 0;
9833
9834		alu.dst.sel = ctx->temp_reg;
9835		alu.dst.chan = 3;
9836		alu.dst.write = 1;
9837		alu.last = 1;
9838		r = r600_bytecode_add_alu(ctx->bc, &alu);
9839		if (r)
9840			return r;
9841	}
9842	return tgsi_helper_copy(ctx, inst);
9843}
9844
9845static int tgsi_log(struct r600_shader_ctx *ctx)
9846{
9847	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9848	struct r600_bytecode_alu alu;
9849	int r;
9850	unsigned i;
9851
9852	/* result.x = floor(log2(|src|)); */
9853	if (inst->Dst[0].Register.WriteMask & 1) {
9854		if (ctx->bc->chip_class == CAYMAN) {
9855			for (i = 0; i < 3; i++) {
9856				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9857
9858				alu.op = ALU_OP1_LOG_IEEE;
9859				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9860				r600_bytecode_src_set_abs(&alu.src[0]);
9861
9862				alu.dst.sel = ctx->temp_reg;
9863				alu.dst.chan = i;
9864				if (i == 0)
9865					alu.dst.write = 1;
9866				if (i == 2)
9867					alu.last = 1;
9868				r = r600_bytecode_add_alu(ctx->bc, &alu);
9869				if (r)
9870					return r;
9871			}
9872
9873		} else {
9874			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9875
9876			alu.op = ALU_OP1_LOG_IEEE;
9877			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9878			r600_bytecode_src_set_abs(&alu.src[0]);
9879
9880			alu.dst.sel = ctx->temp_reg;
9881			alu.dst.chan = 0;
9882			alu.dst.write = 1;
9883			alu.last = 1;
9884			r = r600_bytecode_add_alu(ctx->bc, &alu);
9885			if (r)
9886				return r;
9887		}
9888
9889		alu.op = ALU_OP1_FLOOR;
9890		alu.src[0].sel = ctx->temp_reg;
9891		alu.src[0].chan = 0;
9892
9893		alu.dst.sel = ctx->temp_reg;
9894		alu.dst.chan = 0;
9895		alu.dst.write = 1;
9896		alu.last = 1;
9897
9898		r = r600_bytecode_add_alu(ctx->bc, &alu);
9899		if (r)
9900			return r;
9901	}
9902
9903	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9904	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9905
9906		if (ctx->bc->chip_class == CAYMAN) {
9907			for (i = 0; i < 3; i++) {
9908				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9909
9910				alu.op = ALU_OP1_LOG_IEEE;
9911				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9912				r600_bytecode_src_set_abs(&alu.src[0]);
9913
9914				alu.dst.sel = ctx->temp_reg;
9915				alu.dst.chan = i;
9916				if (i == 1)
9917					alu.dst.write = 1;
9918				if (i == 2)
9919					alu.last = 1;
9920
9921				r = r600_bytecode_add_alu(ctx->bc, &alu);
9922				if (r)
9923					return r;
9924			}
9925		} else {
9926			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9927
9928			alu.op = ALU_OP1_LOG_IEEE;
9929			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9930			r600_bytecode_src_set_abs(&alu.src[0]);
9931
9932			alu.dst.sel = ctx->temp_reg;
9933			alu.dst.chan = 1;
9934			alu.dst.write = 1;
9935			alu.last = 1;
9936
9937			r = r600_bytecode_add_alu(ctx->bc, &alu);
9938			if (r)
9939				return r;
9940		}
9941
9942		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9943
9944		alu.op = ALU_OP1_FLOOR;
9945		alu.src[0].sel = ctx->temp_reg;
9946		alu.src[0].chan = 1;
9947
9948		alu.dst.sel = ctx->temp_reg;
9949		alu.dst.chan = 1;
9950		alu.dst.write = 1;
9951		alu.last = 1;
9952
9953		r = r600_bytecode_add_alu(ctx->bc, &alu);
9954		if (r)
9955			return r;
9956
9957		if (ctx->bc->chip_class == CAYMAN) {
9958			for (i = 0; i < 3; i++) {
9959				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9960				alu.op = ALU_OP1_EXP_IEEE;
9961				alu.src[0].sel = ctx->temp_reg;
9962				alu.src[0].chan = 1;
9963
9964				alu.dst.sel = ctx->temp_reg;
9965				alu.dst.chan = i;
9966				if (i == 1)
9967					alu.dst.write = 1;
9968				if (i == 2)
9969					alu.last = 1;
9970
9971				r = r600_bytecode_add_alu(ctx->bc, &alu);
9972				if (r)
9973					return r;
9974			}
9975		} else {
9976			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9977			alu.op = ALU_OP1_EXP_IEEE;
9978			alu.src[0].sel = ctx->temp_reg;
9979			alu.src[0].chan = 1;
9980
9981			alu.dst.sel = ctx->temp_reg;
9982			alu.dst.chan = 1;
9983			alu.dst.write = 1;
9984			alu.last = 1;
9985
9986			r = r600_bytecode_add_alu(ctx->bc, &alu);
9987			if (r)
9988				return r;
9989		}
9990
9991		if (ctx->bc->chip_class == CAYMAN) {
9992			for (i = 0; i < 3; i++) {
9993				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9994				alu.op = ALU_OP1_RECIP_IEEE;
9995				alu.src[0].sel = ctx->temp_reg;
9996				alu.src[0].chan = 1;
9997
9998				alu.dst.sel = ctx->temp_reg;
9999				alu.dst.chan = i;
10000				if (i == 1)
10001					alu.dst.write = 1;
10002				if (i == 2)
10003					alu.last = 1;
10004
10005				r = r600_bytecode_add_alu(ctx->bc, &alu);
10006				if (r)
10007					return r;
10008			}
10009		} else {
10010			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10011			alu.op = ALU_OP1_RECIP_IEEE;
10012			alu.src[0].sel = ctx->temp_reg;
10013			alu.src[0].chan = 1;
10014
10015			alu.dst.sel = ctx->temp_reg;
10016			alu.dst.chan = 1;
10017			alu.dst.write = 1;
10018			alu.last = 1;
10019
10020			r = r600_bytecode_add_alu(ctx->bc, &alu);
10021			if (r)
10022				return r;
10023		}
10024
10025		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10026
10027		alu.op = ALU_OP2_MUL;
10028
10029		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10030		r600_bytecode_src_set_abs(&alu.src[0]);
10031
10032		alu.src[1].sel = ctx->temp_reg;
10033		alu.src[1].chan = 1;
10034
10035		alu.dst.sel = ctx->temp_reg;
10036		alu.dst.chan = 1;
10037		alu.dst.write = 1;
10038		alu.last = 1;
10039
10040		r = r600_bytecode_add_alu(ctx->bc, &alu);
10041		if (r)
10042			return r;
10043	}
10044
10045	/* result.z = log2(|src|);*/
10046	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10047		if (ctx->bc->chip_class == CAYMAN) {
10048			for (i = 0; i < 3; i++) {
10049				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10050
10051				alu.op = ALU_OP1_LOG_IEEE;
10052				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10053				r600_bytecode_src_set_abs(&alu.src[0]);
10054
10055				alu.dst.sel = ctx->temp_reg;
10056				if (i == 2)
10057					alu.dst.write = 1;
10058				alu.dst.chan = i;
10059				if (i == 2)
10060					alu.last = 1;
10061
10062				r = r600_bytecode_add_alu(ctx->bc, &alu);
10063				if (r)
10064					return r;
10065			}
10066		} else {
10067			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10068
10069			alu.op = ALU_OP1_LOG_IEEE;
10070			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10071			r600_bytecode_src_set_abs(&alu.src[0]);
10072
10073			alu.dst.sel = ctx->temp_reg;
10074			alu.dst.write = 1;
10075			alu.dst.chan = 2;
10076			alu.last = 1;
10077
10078			r = r600_bytecode_add_alu(ctx->bc, &alu);
10079			if (r)
10080				return r;
10081		}
10082	}
10083
10084	/* result.w = 1.0; */
10085	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10086		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10087
10088		alu.op = ALU_OP1_MOV;
10089		alu.src[0].sel = V_SQ_ALU_SRC_1;
10090		alu.src[0].chan = 0;
10091
10092		alu.dst.sel = ctx->temp_reg;
10093		alu.dst.chan = 3;
10094		alu.dst.write = 1;
10095		alu.last = 1;
10096
10097		r = r600_bytecode_add_alu(ctx->bc, &alu);
10098		if (r)
10099			return r;
10100	}
10101
10102	return tgsi_helper_copy(ctx, inst);
10103}
10104
10105static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
10106{
10107	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10108	struct r600_bytecode_alu alu;
10109	int r;
10110	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10111	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
10112
10113	assert(inst->Dst[0].Register.Index < 3);
10114	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10115
10116	switch (inst->Instruction.Opcode) {
10117	case TGSI_OPCODE_ARL:
10118		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
10119		break;
10120	case TGSI_OPCODE_ARR:
10121		alu.op = ALU_OP1_FLT_TO_INT;
10122		break;
10123	case TGSI_OPCODE_UARL:
10124		alu.op = ALU_OP1_MOV;
10125		break;
10126	default:
10127		assert(0);
10128		return -1;
10129	}
10130
10131	for (i = 0; i <= lasti; ++i) {
10132		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10133			continue;
10134		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10135		alu.last = i == lasti;
10136		alu.dst.sel = reg;
10137	        alu.dst.chan = i;
10138		alu.dst.write = 1;
10139		r = r600_bytecode_add_alu(ctx->bc, &alu);
10140		if (r)
10141			return r;
10142	}
10143
10144	if (inst->Dst[0].Register.Index > 0)
10145		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10146	else
10147		ctx->bc->ar_loaded = 0;
10148
10149	return 0;
10150}
/* Pre-evergreen ARL/ARR/UARL: compute the integer index into
 * ctx->bc->ar_reg and invalidate the cached AR value so it is reloaded
 * before the next indirect access.  r600/r700 have no FLT_TO_INT_FLOOR,
 * so ARL is an explicit FLOOR followed by FLT_TO_INT. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* Pass 1: floor the source into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* Pass 2: convert ar_reg to int in place.
		 * NOTE(review): unlike pass 1 this loop ignores the
		 * writemask and converts every channel up to lasti —
		 * masked channels were never floored; confirm whether
		 * those channels can ever be read. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds, which FLT_TO_INT does directly. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* Source is already an integer: plain MOV. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Force a reload of the now-stale cached AR value. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
10227
10228static int tgsi_opdst(struct r600_shader_ctx *ctx)
10229{
10230	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10231	struct r600_bytecode_alu alu;
10232	int i, r = 0;
10233
10234	for (i = 0; i < 4; i++) {
10235		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10236
10237		alu.op = ALU_OP2_MUL;
10238		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10239
10240		if (i == 0 || i == 3) {
10241			alu.src[0].sel = V_SQ_ALU_SRC_1;
10242		} else {
10243			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10244		}
10245
10246		if (i == 0 || i == 2) {
10247			alu.src[1].sel = V_SQ_ALU_SRC_1;
10248		} else {
10249			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10250		}
10251		if (i == 3)
10252			alu.last = 1;
10253		r = r600_bytecode_add_alu(ctx->bc, &alu);
10254		if (r)
10255			return r;
10256	}
10257	return 0;
10258}
10259
10260static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10261			   struct r600_bytecode_alu_src *src)
10262{
10263	struct r600_bytecode_alu alu;
10264	int r;
10265
10266	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10267	alu.op = opcode;
10268	alu.execute_mask = 1;
10269	alu.update_pred = 1;
10270
10271	alu.dst.sel = ctx->temp_reg;
10272	alu.dst.write = 1;
10273	alu.dst.chan = 0;
10274
10275	alu.src[0] = *src;
10276	alu.src[1].sel = V_SQ_ALU_SRC_0;
10277	alu.src[1].chan = 0;
10278
10279	alu.last = 1;
10280
10281	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10282	if (r)
10283		return r;
10284	return 0;
10285}
10286
/* Unwind 'pops' entries from the hardware branch stack.  Where possible the
 * pop is folded into the preceding ALU clause by promoting it to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise an explicit POP CF instruction
 * is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts pops the last CF already performs: 0 for plain
		 * ALU, 1 for ALU_POP_AFTER; 3 is a sentinel meaning "no foldable
		 * last CF" (pushes the total past 2 below). */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		/* the ALU CF encodings support at most 2 folded pops */
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			force_pop = 1;
		}
	}

	if (force_pop) {
		/* standalone POP: jump target is the CF right after it */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
10319
/* Recompute the worst-case hardware stack usage given the current nesting
 * counters and the reason for the most recent push, applying per-generation
 * reservation quirks.  Updates stack->max_entries (used for STACK_SIZE) and
 * returns the element count before division into entries. */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* each loop/WQM level consumes a full entry; plain pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}
10385
10386static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10387{
10388	switch(reason) {
10389	case FC_PUSH_VPM:
10390		--ctx->bc->stack.push;
10391		assert(ctx->bc->stack.push >= 0);
10392		break;
10393	case FC_PUSH_WQM:
10394		--ctx->bc->stack.push_wqm;
10395		assert(ctx->bc->stack.push_wqm >= 0);
10396		break;
10397	case FC_LOOP:
10398		--ctx->bc->stack.loop;
10399		assert(ctx->bc->stack.loop >= 0);
10400		break;
10401	default:
10402		assert(0);
10403		break;
10404	}
10405}
10406
10407static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10408{
10409	switch (reason) {
10410	case FC_PUSH_VPM:
10411		++ctx->bc->stack.push;
10412		break;
10413	case FC_PUSH_WQM:
10414		++ctx->bc->stack.push_wqm;
10415		break;
10416	case FC_LOOP:
10417		++ctx->bc->stack.loop;
10418		break;
10419	default:
10420		assert(0);
10421	}
10422
10423	return callstack_update_max_depth(ctx, reason);
10424}
10425
10426static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10427{
10428	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10429
10430	sp->mid = realloc((void *)sp->mid,
10431						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10432	sp->mid[sp->num_mid] = ctx->bc->cf_last;
10433	sp->num_mid++;
10434}
10435
10436static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10437{
10438	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10439	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10440	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10441	ctx->bc->fc_sp++;
10442}
10443
10444static void fc_poplevel(struct r600_shader_ctx *ctx)
10445{
10446	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10447	free(sp->mid);
10448	sp->mid = NULL;
10449	sp->num_mid = 0;
10450	sp->start = NULL;
10451	sp->type = 0;
10452	ctx->bc->fc_sp--;
10453}
10454
#if 0
/* NOTE: dead code — unfinished subroutine-return / break-on-flag support.
 * If this block is ever revived, beware the stray ')' after the
 * r600_bytecode_add_cfinst() calls in emit_return() and
 * emit_jump_to_offset(); they would not compile as written. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
10502
/* Open an IF: emit the predicate compare (opcode applied to src vs 0),
 * push a stack level, emit the not-taken JUMP and open an FC_IF level.
 * The JUMP's target is patched later by tgsi_else()/tgsi_endif(). */
static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	/* 8xx: pushes that land exactly on an entry boundary also need the
	 * split PUSH + ALU form */
	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type, src);

	/* jump over the IF body when the predicate fails; target fixed up at
	 * ELSE/ENDIF time */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}
10539
10540static int tgsi_if(struct r600_shader_ctx *ctx)
10541{
10542	struct r600_bytecode_alu_src alu_src;
10543	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10544
10545	return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10546}
10547
10548static int tgsi_uif(struct r600_shader_ctx *ctx)
10549{
10550	struct r600_bytecode_alu_src alu_src;
10551	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10552	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10553}
10554
10555static int tgsi_else(struct r600_shader_ctx *ctx)
10556{
10557	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10558	ctx->bc->cf_last->pop_count = 1;
10559
10560	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10561	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10562	return 0;
10563}
10564
10565static int tgsi_endif(struct r600_shader_ctx *ctx)
10566{
10567	int offset = 2;
10568	pops(ctx, 1);
10569	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10570		R600_ERR("if/endif unbalanced in shader\n");
10571		return -1;
10572	}
10573
10574	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
10575	if (ctx->bc->cf_last->eg_alu_extended)
10576			offset += 2;
10577
10578	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10579		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
10580		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10581	} else {
10582		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
10583	}
10584	fc_poplevel(ctx);
10585
10586	callstack_pop(ctx, FC_PUSH_VPM);
10587	return 0;
10588}
10589
10590static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10591{
10592	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10593	 * limited to 4096 iterations, like the other LOOP_* instructions. */
10594	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10595
10596	fc_pushlevel(ctx, FC_LOOP);
10597
10598	/* check stack depth */
10599	callstack_push(ctx, FC_LOOP);
10600	return 0;
10601}
10602
10603static int tgsi_endloop(struct r600_shader_ctx *ctx)
10604{
10605	int i;
10606
10607	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10608
10609	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10610		R600_ERR("loop/endloop in shader code are not paired.\n");
10611		return -EINVAL;
10612	}
10613
10614	/* fixup loop pointers - from r600isa
10615	   LOOP END points to CF after LOOP START,
10616	   LOOP START point to CF after LOOP END
10617	   BRK/CONT point to LOOP END CF
10618	*/
10619	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10620
10621	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10622
10623	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10624		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10625	}
10626	/* XXX add LOOPRET support */
10627	fc_poplevel(ctx);
10628	callstack_pop(ctx, FC_LOOP);
10629	return 0;
10630}
10631
10632static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10633{
10634	unsigned int fscp;
10635
10636	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10637	{
10638		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10639			break;
10640	}
10641
10642	if (fscp == 0) {
10643		R600_ERR("Break not inside loop/endloop pair\n");
10644		return -EINVAL;
10645	}
10646
10647	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10648
10649	fc_set_mid(ctx, fscp - 1);
10650
10651	return 0;
10652}
10653
10654static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10655{
10656	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10657	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10658	int r;
10659
10660	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10661		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10662
10663	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10664	if (!r) {
10665		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10666		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10667			return emit_inc_ring_offset(ctx, stream, TRUE);
10668	}
10669	return r;
10670}
10671
10672static int tgsi_umad(struct r600_shader_ctx *ctx)
10673{
10674	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10675	struct r600_bytecode_alu alu;
10676	int i, j, r;
10677	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10678
10679	/* src0 * src1 */
10680	for (i = 0; i < lasti + 1; i++) {
10681		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10682			continue;
10683
10684		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10685
10686		alu.dst.chan = i;
10687		alu.dst.sel = ctx->temp_reg;
10688		alu.dst.write = 1;
10689
10690		alu.op = ALU_OP2_MULLO_UINT;
10691		for (j = 0; j < 2; j++) {
10692			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10693		}
10694
10695		alu.last = 1;
10696		r = emit_mul_int_op(ctx->bc, &alu);
10697		if (r)
10698			return r;
10699	}
10700
10701
10702	for (i = 0; i < lasti + 1; i++) {
10703		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10704			continue;
10705
10706		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10707		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10708
10709		alu.op = ALU_OP2_ADD_INT;
10710
10711		alu.src[0].sel = ctx->temp_reg;
10712		alu.src[0].chan = i;
10713
10714		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10715		if (i == lasti) {
10716			alu.last = 1;
10717		}
10718		r = r600_bytecode_add_alu(ctx->bc, &alu);
10719		if (r)
10720			return r;
10721	}
10722	return 0;
10723}
10724
/* PK2H: pack two float32 channels into one 32-bit word of paired float16s
 * (src.x in the low half, src.y in the high half), broadcast to all enabled
 * destination channels. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* deliberately no memset: reuse the same setup for the second
	 * conversion, changing only the channel and source swizzle */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	/* (MULADD_UINT24 suffices: both halves fit in 16 bits) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10772
10773static int tgsi_up2h(struct r600_shader_ctx *ctx)
10774{
10775	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10776	struct r600_bytecode_alu alu;
10777	int r, i;
10778	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10779
10780	/* temp.x = src.x */
10781	/* note: no need to mask out the high bits */
10782	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10783	alu.op = ALU_OP1_MOV;
10784	alu.dst.chan = 0;
10785	alu.dst.sel = ctx->temp_reg;
10786	alu.dst.write = 1;
10787	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10788	r = r600_bytecode_add_alu(ctx->bc, &alu);
10789	if (r)
10790		return r;
10791
10792	/* temp.y = src.x >> 16 */
10793	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10794	alu.op = ALU_OP2_LSHR_INT;
10795	alu.dst.chan = 1;
10796	alu.dst.sel = ctx->temp_reg;
10797	alu.dst.write = 1;
10798	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10799	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10800	alu.src[1].value = 16;
10801	alu.last = 1;
10802	r = r600_bytecode_add_alu(ctx->bc, &alu);
10803	if (r)
10804		return r;
10805
10806	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10807	for (i = 0; i < lasti + 1; i++) {
10808		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10809			continue;
10810		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10811		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10812		alu.op = ALU_OP1_FLT16_TO_FLT32;
10813		alu.src[0].sel = ctx->temp_reg;
10814		alu.src[0].chan = i % 2;
10815		alu.last = i == lasti;
10816		r = r600_bytecode_add_alu(ctx->bc, &alu);
10817		if (r)
10818			return r;
10819	}
10820
10821	return 0;
10822}
10823
/* BFE (bit-field extract): the main op is emitted via tgsi_op3_dst(), then
 * patched so channels whose width operand (src2) is >= 32 return src0
 * unmodified instead of the hardware BFE result. */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	/* if the destination aliases src0 or src2 we must keep the BFE result
	 * in a temp, since both are re-read by the patch-up pass below */
	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	/* emit the BFE itself (into dst if aliased, else the real dest) */
	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp.chan = (src2 >= 32) per channel */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dest = (src2 >= 32) ? src0 : BFE result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10882
10883static int tgsi_clock(struct r600_shader_ctx *ctx)
10884{
10885	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10886	struct r600_bytecode_alu alu;
10887	int r;
10888
10889	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10890	alu.op = ALU_OP1_MOV;
10891	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10892	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10893	r = r600_bytecode_add_alu(ctx->bc, &alu);
10894	if (r)
10895		return r;
10896	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10897	alu.op = ALU_OP1_MOV;
10898	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10899	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10900	alu.last = 1;
10901	r = r600_bytecode_add_alu(ctx->bc, &alu);
10902	if (r)
10903		return r;
10904	return 0;
10905}
10906
10907static int emit_u64add(struct r600_shader_ctx *ctx, int op,
10908		       int treg,
10909		       int src0_sel, int src0_chan,
10910		       int src1_sel, int src1_chan)
10911{
10912	struct r600_bytecode_alu alu;
10913	int r;
10914	int opc;
10915
10916	if (op == ALU_OP2_ADD_INT)
10917		opc = ALU_OP2_ADDC_UINT;
10918	else
10919		opc = ALU_OP2_SUBB_UINT;
10920
10921	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10922	alu.op = op;            ;
10923	alu.dst.sel = treg;
10924	alu.dst.chan = 0;
10925	alu.dst.write = 1;
10926	alu.src[0].sel = src0_sel;
10927	alu.src[0].chan = src0_chan + 0;
10928	alu.src[1].sel = src1_sel;
10929	alu.src[1].chan = src1_chan + 0;
10930	alu.src[1].neg = 0;
10931	r = r600_bytecode_add_alu(ctx->bc, &alu);
10932	if (r)
10933		return r;
10934
10935	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10936	alu.op = op;
10937	alu.dst.sel = treg;
10938	alu.dst.chan = 1;
10939	alu.dst.write = 1;
10940	alu.src[0].sel = src0_sel;
10941	alu.src[0].chan = src0_chan + 1;
10942	alu.src[1].sel = src1_sel;
10943	alu.src[1].chan = src1_chan + 1;
10944	alu.src[1].neg = 0;
10945	r = r600_bytecode_add_alu(ctx->bc, &alu);
10946	if (r)
10947		return r;
10948
10949	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10950	alu.op = opc;
10951	alu.dst.sel = treg;
10952	alu.dst.chan = 2;
10953	alu.dst.write = 1;
10954	alu.last = 1;
10955	alu.src[0].sel = src0_sel;
10956	alu.src[0].chan = src0_chan + 0;
10957	alu.src[1].sel = src1_sel;
10958	alu.src[1].chan = src1_chan + 0;
10959	alu.src[1].neg = 0;
10960	r = r600_bytecode_add_alu(ctx->bc, &alu);
10961	if (r)
10962		return r;
10963
10964	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10965	alu.op = op;
10966	alu.dst.sel = treg;
10967	alu.dst.chan = 1;
10968	alu.dst.write = 1;
10969	alu.src[0].sel = treg;
10970	alu.src[0].chan = 1;
10971	alu.src[1].sel = treg;
10972	alu.src[1].chan = 2;
10973	alu.last = 1;
10974	r = r600_bytecode_add_alu(ctx->bc, &alu);
10975	if (r)
10976		return r;
10977	return 0;
10978}
10979
10980static int egcm_u64add(struct r600_shader_ctx *ctx)
10981{
10982	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10983	struct r600_bytecode_alu alu;
10984	int r;
10985	int treg = ctx->temp_reg;
10986	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
10987
10988	if (ctx->src[1].neg) {
10989		op = ALU_OP2_SUB_INT;
10990		opc = ALU_OP2_SUBB_UINT;
10991	}
10992	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10993	alu.op = op;            ;
10994	alu.dst.sel = treg;
10995	alu.dst.chan = 0;
10996	alu.dst.write = 1;
10997	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10998	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10999	alu.src[1].neg = 0;
11000	r = r600_bytecode_add_alu(ctx->bc, &alu);
11001	if (r)
11002		return r;
11003
11004	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11005	alu.op = op;
11006	alu.dst.sel = treg;
11007	alu.dst.chan = 1;
11008	alu.dst.write = 1;
11009	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11010	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11011	alu.src[1].neg = 0;
11012	r = r600_bytecode_add_alu(ctx->bc, &alu);
11013	if (r)
11014		return r;
11015
11016	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11017	alu.op = opc              ;
11018	alu.dst.sel = treg;
11019	alu.dst.chan = 2;
11020	alu.dst.write = 1;
11021	alu.last = 1;
11022	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11023	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11024	alu.src[1].neg = 0;
11025	r = r600_bytecode_add_alu(ctx->bc, &alu);
11026	if (r)
11027		return r;
11028
11029	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11030	alu.op = op;
11031	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11032	alu.src[0].sel = treg;
11033	alu.src[0].chan = 1;
11034	alu.src[1].sel = treg;
11035	alu.src[1].chan = 2;
11036	alu.last = 1;
11037	r = r600_bytecode_add_alu(ctx->bc, &alu);
11038	if (r)
11039		return r;
11040	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11041	alu.op = ALU_OP1_MOV;
11042	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11043	alu.src[0].sel = treg;
11044	alu.src[0].chan = 0;
11045	alu.last = 1;
11046	r = r600_bytecode_add_alu(ctx->bc, &alu);
11047	if (r)
11048		return r;
11049	return 0;
11050}
11051
/* U64MUL: low 64 bits of the product of two 64-bit values (xy = lo/hi):
 *   dst.x = lo32(a.x * b.x)
 *   dst.y = hi32(a.x * b.x) + lo32(a.x * b.y) + lo32(a.y * b.x)
 * The a.y * b.y term and any carries above bit 63 fall outside the 64-bit
 * result and are dropped. */
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y (first cross term) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x (second cross term) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w (sum of cross terms) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z (high word = mul_hi + cross terms) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y (last in the group with the MOV above) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
11164
/* 64-bit unsigned set-on-greater-equal.  Operands' lo/hi words are at
 * channels src*_base_chan / src*_base_chan + 1.  The boolean result lands
 * in treg.x; treg.y and treg.z are scratch. */
static int emit_u64sge(struct r600_shader_ctx *ctx,
		       int treg,
		       int src0_sel, int src0_base_chan,
		       int src1_sel, int src1_base_chan)
{
	int r;
	/* for 64-bit sge */
	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && (src0.x >= src1.x)) */
	/* treg.y = src0.hi > src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
			   treg, 1,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = src0.lo >= src1.lo */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 0,
			   src0_sel, src0_base_chan,
			   src1_sel, src1_base_chan);
	if (r)
		return r;

	/* treg.z = src0.hi == src1.hi */
	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
			   treg, 2,
			   src0_sel, src0_base_chan + 1,
			   src1_sel, src1_base_chan + 1);
	if (r)
		return r;

	/* treg.x = (hi equal) && (lo >= ) */
	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
			   treg, 0,
			   treg, 0,
			   treg, 2);
	if (r)
		return r;

	/* treg.x |= (hi greater) */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   treg, 0,
			   treg, 0,
			   treg, 1);
	if (r)
		return r;
	return 0;
}
11209
11210/* this isn't a complete div it's just enough for qbo shader to work */
11211static int egcm_u64div(struct r600_shader_ctx *ctx)
11212{
11213	struct r600_bytecode_alu alu;
11214	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
11215	int r, i;
11216	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11217
	/* make sure we are dividing by a const with 0 in the high bits */
11219	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
11220		return -1;
11221	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
11222		return -1;
11223	/* make sure we are doing one division */
11224	if (inst->Dst[0].Register.WriteMask != 0x3)
11225		return -1;
11226
11227	/* emit_if uses ctx->temp_reg so we can't */
11228	int treg = r600_get_temp(ctx);
11229	int tmp_num = r600_get_temp(ctx);
11230	int sub_tmp = r600_get_temp(ctx);
11231
11232	/* tmp quot are tmp_num.zw */
11233	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
11234	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
11235	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
11236	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);
11237
11238	/* MOV tmp_num.xy, numerator */
11239	r = single_alu_op2(ctx, ALU_OP1_MOV,
11240			   tmp_num, 0,
11241			   alu_num_lo.sel, alu_num_lo.chan,
11242			   0, 0);
11243	if (r)
11244		return r;
11245	r = single_alu_op2(ctx, ALU_OP1_MOV,
11246			   tmp_num, 1,
11247			   alu_num_hi.sel, alu_num_hi.chan,
11248			   0, 0);
11249	if (r)
11250		return r;
11251
11252	r = single_alu_op2(ctx, ALU_OP1_MOV,
11253			   tmp_num, 2,
11254			   V_SQ_ALU_SRC_LITERAL, 0,
11255			   0, 0);
11256	if (r)
11257		return r;
11258
11259	r = single_alu_op2(ctx, ALU_OP1_MOV,
11260			   tmp_num, 3,
11261			   V_SQ_ALU_SRC_LITERAL, 0,
11262			   0, 0);
11263	if (r)
11264		return r;
11265
11266	/* treg 0 is log2_denom */
11267	/* normally this gets the MSB for the denom high value
11268	   - however we know this will always be 0 here. */
11269	r = single_alu_op2(ctx,
11270			   ALU_OP1_MOV,
11271			   treg, 0,
11272			   V_SQ_ALU_SRC_LITERAL, 32,
11273			   0, 0);
11274	if (r)
11275		return r;
11276
11277	/* normally check demon hi for 0, but we know it is already */
11278	/* t0.z = num_hi >= denom_lo */
11279	r = single_alu_op2(ctx,
11280			   ALU_OP2_SETGE_UINT,
11281			   treg, 1,
11282			   alu_num_hi.sel, alu_num_hi.chan,
11283			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11284	if (r)
11285		return r;
11286
11287	memset(&alu_src, 0, sizeof(alu_src));
11288	alu_src.sel = treg;
11289	alu_src.chan = 1;
11290	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11291	if (r)
11292		return r;
11293
11294	/* for loops in here */
11295	/* get msb t0.x = msb(src[1].x) first */
11296	int msb_lo = util_last_bit(alu_denom_lo.value);
11297	r = single_alu_op2(ctx, ALU_OP1_MOV,
11298			   treg, 0,
11299			   V_SQ_ALU_SRC_LITERAL, msb_lo,
11300			   0, 0);
11301	if (r)
11302		return r;
11303
11304	/* unroll the asm here */
11305	for (i = 0; i < 31; i++) {
11306		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11307				   treg, 2,
11308				   V_SQ_ALU_SRC_LITERAL, i,
11309				   treg, 0);
11310		if (r)
11311			return r;
11312
11313		/* we can do this on the CPU */
11314		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
11315		/* t0.z = tmp_num.y >= t0.z */
11316		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11317				   treg, 1,
11318				   tmp_num, 1,
11319				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11320		if (r)
11321			return r;
11322
11323		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11324				   treg, 1,
11325				   treg, 1,
11326				   treg, 2);
11327		if (r)
11328			return r;
11329
11330		memset(&alu_src, 0, sizeof(alu_src));
11331		alu_src.sel = treg;
11332		alu_src.chan = 1;
11333		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11334		if (r)
11335			return r;
11336
11337		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11338				   tmp_num, 1,
11339				   tmp_num, 1,
11340				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11341		if (r)
11342			return r;
11343
11344		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11345				   tmp_num, 3,
11346				   tmp_num, 3,
11347				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11348		if (r)
11349			return r;
11350
11351		r = tgsi_endif(ctx);
11352		if (r)
11353			return r;
11354	}
11355
11356	/* log2_denom is always <= 31, so manually peel the last loop
11357	 * iteration.
11358	 */
11359	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11360			   treg, 1,
11361			   tmp_num, 1,
11362			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11363	if (r)
11364		return r;
11365
11366	memset(&alu_src, 0, sizeof(alu_src));
11367	alu_src.sel = treg;
11368	alu_src.chan = 1;
11369	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11370	if (r)
11371		return r;
11372
11373	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11374			   tmp_num, 1,
11375			   tmp_num, 1,
11376			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11377	if (r)
11378		return r;
11379
11380	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11381			   tmp_num, 3,
11382			   tmp_num, 3,
11383			   V_SQ_ALU_SRC_LITERAL, 1U);
11384	if (r)
11385		return r;
11386	r = tgsi_endif(ctx);
11387	if (r)
11388		return r;
11389
11390	r = tgsi_endif(ctx);
11391	if (r)
11392		return r;
11393
11394	/* onto the second loop to unroll */
11395	for (i = 0; i < 31; i++) {
11396		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11397				   treg, 1,
11398				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
11399				   treg, 0);
11400		if (r)
11401			return r;
11402
11403		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
11404		r = single_alu_op2(ctx, ALU_OP1_MOV,
11405				   treg, 2,
11406				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11407				   0, 0);
11408		if (r)
11409			return r;
11410
11411		r = single_alu_op2(ctx, ALU_OP1_MOV,
11412				   treg, 3,
11413				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11414				   0, 0);
11415		if (r)
11416			return r;
11417
11418		r = emit_u64sge(ctx, sub_tmp,
11419				tmp_num, 0,
11420				treg, 2);
11421		if (r)
11422			return r;
11423
11424		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11425				   treg, 1,
11426				   treg, 1,
11427				   sub_tmp, 0);
11428		if (r)
11429			return r;
11430
11431		memset(&alu_src, 0, sizeof(alu_src));
11432		alu_src.sel = treg;
11433		alu_src.chan = 1;
11434		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11435		if (r)
11436			return r;
11437
11438
11439		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11440				sub_tmp,
11441				tmp_num, 0,
11442				treg, 2);
11443		if (r)
11444			return r;
11445
11446		r = single_alu_op2(ctx, ALU_OP1_MOV,
11447				   tmp_num, 0,
11448				   sub_tmp, 0,
11449				   0, 0);
11450		if (r)
11451			return r;
11452
11453		r = single_alu_op2(ctx, ALU_OP1_MOV,
11454				   tmp_num, 1,
11455				   sub_tmp, 1,
11456				   0, 0);
11457		if (r)
11458			return r;
11459
11460		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11461				   tmp_num, 2,
11462				   tmp_num, 2,
11463				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11464		if (r)
11465			return r;
11466
11467		r = tgsi_endif(ctx);
11468		if (r)
11469			return r;
11470	}
11471
11472	/* log2_denom is always <= 63, so manually peel the last loop
11473	 * iteration.
11474	 */
11475	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
11476	r = single_alu_op2(ctx, ALU_OP1_MOV,
11477			   treg, 2,
11478			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11479			   0, 0);
11480	if (r)
11481		return r;
11482
11483	r = single_alu_op2(ctx, ALU_OP1_MOV,
11484			   treg, 3,
11485			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11486			   0, 0);
11487	if (r)
11488		return r;
11489
11490	r = emit_u64sge(ctx, sub_tmp,
11491			tmp_num, 0,
11492			treg, 2);
11493	if (r)
11494		return r;
11495
11496	memset(&alu_src, 0, sizeof(alu_src));
11497	alu_src.sel = sub_tmp;
11498	alu_src.chan = 0;
11499	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11500	if (r)
11501		return r;
11502
11503	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11504			sub_tmp,
11505			tmp_num, 0,
11506			treg, 2);
11507	if (r)
11508		return r;
11509
11510	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11511			   tmp_num, 2,
11512			   tmp_num, 2,
11513			   V_SQ_ALU_SRC_LITERAL, 1U);
11514	if (r)
11515		return r;
11516	r = tgsi_endif(ctx);
11517	if (r)
11518		return r;
11519
11520	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11521	alu.op = ALU_OP1_MOV;
11522	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11523	alu.src[0].sel = tmp_num;
11524	alu.src[0].chan = 2;
11525	r = r600_bytecode_add_alu(ctx->bc, &alu);
11526	if (r)
11527		return r;
11528
11529	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11530	alu.op = ALU_OP1_MOV;
11531	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11532	alu.src[0].sel = tmp_num;
11533	alu.src[0].chan = 3;
11534	alu.last = 1;
11535	r = r600_bytecode_add_alu(ctx->bc, &alu);
11536	if (r)
11537		return r;
11538	return 0;
11539}
11540
11541static int egcm_u64sne(struct r600_shader_ctx *ctx)
11542{
11543	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11544	struct r600_bytecode_alu alu;
11545	int r;
11546	int treg = ctx->temp_reg;
11547
11548	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11549	alu.op = ALU_OP2_SETNE_INT;
11550	alu.dst.sel = treg;
11551	alu.dst.chan = 0;
11552	alu.dst.write = 1;
11553	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11554	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11555	r = r600_bytecode_add_alu(ctx->bc, &alu);
11556	if (r)
11557		return r;
11558
11559	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11560	alu.op = ALU_OP2_SETNE_INT;
11561	alu.dst.sel = treg;
11562	alu.dst.chan = 1;
11563	alu.dst.write = 1;
11564	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11565	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11566	alu.last = 1;
11567	r = r600_bytecode_add_alu(ctx->bc, &alu);
11568	if (r)
11569		return r;
11570
11571	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11572	alu.op = ALU_OP2_OR_INT;
11573	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11574	alu.src[0].sel = treg;
11575	alu.src[0].chan = 0;
11576	alu.src[1].sel = treg;
11577	alu.src[1].chan = 1;
11578	alu.last = 1;
11579	r = r600_bytecode_add_alu(ctx->bc, &alu);
11580	if (r)
11581		return r;
11582	return 0;
11583}
11584
/* TGSI opcode -> hardware translation table for R600/R700, indexed by
 * TGSI_OPCODE_*.  Each entry pairs a hardware ALU/fetch/CF opcode with
 * the emit callback that lowers that TGSI instruction; bare numeric
 * indices are retired/unassigned TGSI opcode slots and all map to
 * tgsi_unsupported. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
11785
/* TGSI opcode -> hardware translation table for Evergreen, indexed by
 * TGSI_OPCODE_*.  Same layout as r600_shader_tgsi_instruction above, but
 * with Evergreen-capable lowerings (doubles, images, atomics, barriers,
 * 64-bit integer ops); bare numeric indices are retired/unassigned TGSI
 * opcode slots. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
12011
/*
 * TGSI -> r600 ISA dispatch table for Cayman ("cm") GPUs.
 *
 * Indexed by TGSI opcode.  Each entry pairs a hardware opcode
 * (ALU_OP* / FETCH_OP* / CF_OP* / V_RAT_INST_*) with the emit callback
 * that lowers that TGSI instruction.  Entries using cayman_* callbacks
 * replicate former t-slot-only operations across all vector slots
 * (see the CAYMAN notes at the top of this file).  Bare numeric
 * indices ([21], [22], ...) are holes left by removed/unassigned TGSI
 * opcodes and are explicitly routed to tgsi_unsupported.
 */
12012	static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12013	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
12014	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
12015	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
12016	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12017	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12018	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
12019	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
12020	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
12021	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
12022	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12023	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12024	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
12025	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
12026	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	/* SLT/SLE have no direct HW op: lowered via the inverse compare
	 * with swapped source operands (tgsi_op2_swap). */
12027	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
12028	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
12029	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
12030	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
12031	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
12032	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
12033	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
12034	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
12035	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
12036	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
12037	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
12038	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
12039	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
12040	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12041	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12042	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
12043	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
12044	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
12045	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
12046	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
12047	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
12048	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
12049	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12050	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12051	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
12052	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
12053	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12054	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12055	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12056	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
12057	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
12058	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
12059	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
12060	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
12061	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
12062	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
12063	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
12064	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
12065	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
12066	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
12067	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
12068	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12069	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12070	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12071	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
12072	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
12073	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
12074	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* Subroutines are not supported by this backend. */
12075	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
12076	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
12077	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
12078	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
12079	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
12080	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12081	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
12082	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
12083	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12084	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* Control flow maps to CF-level ops or the tgsi_if/else/endif helpers. */
12085	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12086	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
12087	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
12088	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
12089	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
12090	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* NOTE(review): *_FINE derivatives use the same fetch ops as the
	 * coarse ones; any fine/coarse distinction would have to be made
	 * inside tgsi_tex — confirm there. */
12091	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12092	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12093	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
12094	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
12095	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
12096	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
12097	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
12098	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
12099	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
12100	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
12101	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
12102	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
12103	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
12104	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
12105	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
12106	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12107	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12108	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12109	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12110	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
12111	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12112	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
12113	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12114	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12115	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12116	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12117	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
12118	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
12119	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
12120	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
12121	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12122	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12123	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12124	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
12125	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
12126	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
12127	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
12128	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
12129	/* Refer below for TGSI_OPCODE_DFMA */
12130	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
12131	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
12132	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
12133	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
12134	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
12135	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
12136	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
12137	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12138	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12139	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12140	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
12141	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
12142	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
12143	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
12144	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
12145	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
12146	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12147	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
12148	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
12149	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
12150	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12151	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
12152	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12153	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
12154	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
12155	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* D3D-style SAMPLE* opcodes are not implemented by this backend. */
12156	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
12157	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
12158	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
12159	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
12160	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
12161	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
12162	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
12163	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
12164	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
12165	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
12166	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
12167	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
12168	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12169	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
12170	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
12171	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
12172	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
12173	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
12174	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
12175	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
12176	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
12177	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	/* RAT atomics: all use the _RTN (returning) instruction variants. */
12178	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12179	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12180	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12181	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12182	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12183	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12184	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12185	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12186	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12187	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12188	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
12189	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12190	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* High-half multiplies go through cayman_mul_int_instr (broadcast
	 * across all vector slots) rather than the eg tgsi_op2_trans path. */
12191	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
12192	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
12193	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
12194	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
12195	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
12196	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
12197	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
12198	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
12199	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
12200	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
12201	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
12202	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
12203	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12204	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12205	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* Double-precision (FP64) opcodes. */
12206	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12207	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	/* NOTE(review): DABS/DNEG are emitted as MOV/ADD_64; the abs/neg
	 * effect presumably comes from source modifiers applied by the
	 * handlers — confirm in tgsi_op2_64 / tgsi_dneg. */
12208	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
12209	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
12210	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
12211	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
12212	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
12213	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
12214	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	/* DSLT: no SETLT_64 op; uses SETGT_64 with the swapped-operand
	 * single-dest handler (_s suffix). */
12215	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12216	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12217	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12218	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12219	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12220	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12221	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12222	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12223	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
12224	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
12225	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
12226	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12227	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12228	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12229	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12230	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	/* 64-bit integer ops are synthesized entirely in the handlers. */
12231	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
12232	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
12233	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
12234	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
	/* Sentinel: anything at or past TGSI_OPCODE_LAST is unsupported. */
12235	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
12236	};
12237