1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_formats.h"
25#include "r600_opcodes.h"
26#include "r600_shader.h"
27#include "r600_dump.h"
28#include "r600d.h"
29#include "sfn/sfn_nir.h"
30
31#include "sb/sb_public.h"
32
33#include "pipe/p_shader_tokens.h"
34#include "tgsi/tgsi_info.h"
35#include "tgsi/tgsi_parse.h"
36#include "tgsi/tgsi_scan.h"
37#include "tgsi/tgsi_dump.h"
38#include "tgsi/tgsi_from_mesa.h"
39#include "nir/tgsi_to_nir.h"
40#include "nir/nir_to_tgsi_info.h"
41#include "compiler/nir/nir.h"
42#include "util/u_bitcast.h"
43#include "util/u_memory.h"
44#include "util/u_math.h"
45#include <stdio.h>
46#include <errno.h>
47
48/* CAYMAN notes
49Why CAYMAN got loops for lots of instructions is explained here.
50
51-These 8xx t-slot only ops are implemented in all vector slots.
52MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
53These 8xx t-slot only opcodes become vector ops, with all four
54slots expecting the arguments on sources a and b. Result is
55broadcast to all channels.
56MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
57These 8xx t-slot only opcodes become vector ops in the z, y, and
58x slots.
59EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
60RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
61SQRT_IEEE/_64
62SIN/COS
63The w slot may have an independent co-issued operation, or if the
64result is required to be in the w slot, the opcode above may be
65issued in the w slot as well.
66The compiler must issue the source argument to slots z, y, and x
67*/
68
69/* Contents of r0 on entry to various shaders
70
71 VS - .x = VertexID
72      .y = RelVertexID (??)
73      .w = InstanceID
74
75 GS - r0.xyw, r1.xyz = per-vertex offsets
76      r0.z = PrimitiveID
77
78 TCS - .x = PatchID
79       .y = RelPatchID (??)
80       .z = InvocationID
81       .w = tess factor base.
82
83 TES - .x = TessCoord.x
84     - .y = TessCoord.y
85     - .z = RelPatchID (??)
86     - .w = PrimitiveID
87
88 PS - face_gpr.z = SampleMask
89      face_gpr.w = SampleID
90*/
91#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
92static int r600_shader_from_tgsi(struct r600_context *rctx,
93				 struct r600_pipe_shader *pipeshader,
94				 union r600_shader_key key);
95
96static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
97                           int size, unsigned comp_mask) {
98
99	if (!size)
100		return;
101
102	if (ps->num_arrays == ps->max_arrays) {
103		ps->max_arrays += 64;
104		ps->arrays = realloc(ps->arrays, ps->max_arrays *
105		                     sizeof(struct r600_shader_array));
106	}
107
108	int n = ps->num_arrays;
109	++ps->num_arrays;
110
111	ps->arrays[n].comp_mask = comp_mask;
112	ps->arrays[n].gpr_start = start_gpr;
113	ps->arrays[n].gpr_count = size;
114}
115
116static void r600_dump_streamout(struct pipe_stream_output_info *so)
117{
118	unsigned i;
119
120	fprintf(stderr, "STREAMOUT\n");
121	for (i = 0; i < so->num_outputs; i++) {
122		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
123				so->output[i].start_component;
124		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
125			i,
126			so->output[i].stream,
127			so->output[i].output_buffer,
128			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
129			so->output[i].register_index,
130			mask & 1 ? "x" : "",
131		        mask & 2 ? "y" : "",
132		        mask & 4 ? "z" : "",
133		        mask & 8 ? "w" : "",
134			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
135	}
136}
137
138static int store_shader(struct pipe_context *ctx,
139			struct r600_pipe_shader *shader)
140{
141	struct r600_context *rctx = (struct r600_context *)ctx;
142	uint32_t *ptr, i;
143
144	if (shader->bo == NULL) {
145		shader->bo = (struct r600_resource*)
146			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
147		if (shader->bo == NULL) {
148			return -ENOMEM;
149		}
150		ptr = r600_buffer_map_sync_with_rings(
151			&rctx->b, shader->bo,
152			PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
153		if (R600_BIG_ENDIAN) {
154			for (i = 0; i < shader->shader.bc.ndw; ++i) {
155				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
156			}
157		} else {
158			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
159		}
160		rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
161	}
162
163	return 0;
164}
165
166extern const struct nir_shader_compiler_options r600_nir_options;
167static int nshader = 0;
168int r600_pipe_shader_create(struct pipe_context *ctx,
169			    struct r600_pipe_shader *shader,
170			    union r600_shader_key key)
171{
172	struct r600_context *rctx = (struct r600_context *)ctx;
173	struct r600_pipe_shader_selector *sel = shader->selector;
174	int r;
175	struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;
176
177	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
178		tgsi_get_processor_type(sel->tokens):
179		pipe_shader_type_from_mesa(sel->nir->info.stage);
180
181	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
182	unsigned use_sb = !(rctx->screen->b.debug_flags & (DBG_NO_SB | DBG_NIR)) ||
183                          (rctx->screen->b.debug_flags & DBG_NIR_SB);
184	unsigned sb_disasm;
185	unsigned export_shader;
186
187	shader->shader.bc.isa = rctx->isa;
188
189	if (!(rscreen->b.debug_flags & DBG_NIR_PREFERRED)) {
190		assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
191		r = r600_shader_from_tgsi(rctx, shader, key);
192		if (r) {
193			R600_ERR("translation from TGSI failed !\n");
194			goto error;
195		}
196	} else {
197		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
198			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
199                        const nir_shader_compiler_options *nir_options =
200                              (const nir_shader_compiler_options *)
201                              ctx->screen->get_compiler_options(ctx->screen,
202                                                                PIPE_SHADER_IR_NIR,
203                                                                shader->shader.processor_type);
204                        /* Lower int64 ops because we have some r600 build-in shaders that use it */
205			if (nir_options->lower_int64_options) {
206				NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
207				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
208				NIR_PASS_V(sel->nir, nir_lower_int64);
209				NIR_PASS_V(sel->nir, nir_opt_vectorize, NULL, NULL);
210			}
211			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
212		}
213		nir_tgsi_scan_shader(sel->nir, &sel->info, true);
214
215		r = r600_shader_from_nir(rctx, shader, &key);
216		if (r) {
217			fprintf(stderr, "--Failed shader--------------------------------------------------\n");
218
219			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
220				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
221				tgsi_dump(sel->tokens, 0);
222			}
223
224			if (rscreen->b.debug_flags & (DBG_NIR_PREFERRED)) {
225				fprintf(stderr, "--NIR --------------------------------------------------------\n");
226				nir_print_shader(sel->nir, stderr);
227			}
228
229			R600_ERR("translation from NIR failed !\n");
230			goto error;
231		}
232	}
233
234	if (dump) {
235		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
236			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
237			tgsi_dump(sel->tokens, 0);
238		}
239
240		if (sel->so.num_outputs) {
241			r600_dump_streamout(&sel->so);
242		}
243	}
244
245	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
246		/* only disable for vertex shaders in tess paths */
247		if (key.vs.as_ls)
248			use_sb = 0;
249	}
250	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
251	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
252	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
253
254	/* disable SB for shaders using doubles */
255	use_sb &= !shader->shader.uses_doubles;
256
257	use_sb &= !shader->shader.uses_atomics;
258	use_sb &= !shader->shader.uses_images;
259	use_sb &= !shader->shader.uses_helper_invocation;
260
261	/* Check if the bytecode has already been built. */
262	if (!shader->shader.bc.bytecode) {
263		r = r600_bytecode_build(&shader->shader.bc);
264		if (r) {
265			R600_ERR("building bytecode failed !\n");
266			goto error;
267		}
268	}
269
270	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
271	if (dump && !sb_disasm) {
272		fprintf(stderr, "--------------------------------------------------------------\n");
273		r600_bytecode_disasm(&shader->shader.bc);
274		fprintf(stderr, "______________________________________________________________\n");
275	} else if ((dump && sb_disasm) || use_sb) {
276                r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
277		                             dump, use_sb);
278		if (r) {
279			R600_ERR("r600_sb_bytecode_process failed !\n");
280			goto error;
281		}
282	}
283
284        if (dump) {
285           FILE *f;
286           char fname[1024];
287           snprintf(fname, 1024, "shader_from_%s_%d.cpp",
288                    (sel->ir_type == PIPE_SHADER_IR_TGSI ?
289                        (rscreen->b.debug_flags & DBG_NIR_PREFERRED ? "tgsi-nir" : "tgsi")
290                      : "nir"), nshader);
291           f = fopen(fname, "w");
292           print_shader_info(f, nshader++, &shader->shader);
293           print_shader_info(stderr, nshader++, &shader->shader);
294           print_pipe_info(stderr, &sel->info);
295           if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
296              fprintf(f, "/****TGSI**********************************\n");
297              tgsi_dump_to_file(sel->tokens, 0, f);
298           }
299
300           if (rscreen->b.debug_flags & DBG_NIR_PREFERRED){
301              fprintf(f, "/****NIR **********************************\n");
302              nir_print_shader(sel->nir, f);
303           }
304           fprintf(f, "******************************************/\n");
305           fclose(f);
306        }
307
308	if (shader->gs_copy_shader) {
309		if (dump) {
310			// dump copy shader
311			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
312						     &shader->gs_copy_shader->shader, dump, 0);
313			if (r)
314				goto error;
315		}
316
317		if ((r = store_shader(ctx, shader->gs_copy_shader)))
318			goto error;
319	}
320
321	/* Store the shader in a buffer. */
322	if ((r = store_shader(ctx, shader)))
323		goto error;
324
325	/* Build state. */
326	switch (shader->shader.processor_type) {
327	case PIPE_SHADER_TESS_CTRL:
328		evergreen_update_hs_state(ctx, shader);
329		break;
330	case PIPE_SHADER_TESS_EVAL:
331		if (key.tes.as_es)
332			evergreen_update_es_state(ctx, shader);
333		else
334			evergreen_update_vs_state(ctx, shader);
335		break;
336	case PIPE_SHADER_GEOMETRY:
337		if (rctx->b.chip_class >= EVERGREEN) {
338			evergreen_update_gs_state(ctx, shader);
339			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
340		} else {
341			r600_update_gs_state(ctx, shader);
342			r600_update_vs_state(ctx, shader->gs_copy_shader);
343		}
344		break;
345	case PIPE_SHADER_VERTEX:
346		export_shader = key.vs.as_es;
347		if (rctx->b.chip_class >= EVERGREEN) {
348			if (key.vs.as_ls)
349				evergreen_update_ls_state(ctx, shader);
350			else if (key.vs.as_es)
351				evergreen_update_es_state(ctx, shader);
352			else
353				evergreen_update_vs_state(ctx, shader);
354		} else {
355			if (export_shader)
356				r600_update_es_state(ctx, shader);
357			else
358				r600_update_vs_state(ctx, shader);
359		}
360		break;
361	case PIPE_SHADER_FRAGMENT:
362		if (rctx->b.chip_class >= EVERGREEN) {
363			evergreen_update_ps_state(ctx, shader);
364		} else {
365			r600_update_ps_state(ctx, shader);
366		}
367		break;
368	case PIPE_SHADER_COMPUTE:
369		evergreen_update_ls_state(ctx, shader);
370		break;
371	default:
372		r = -EINVAL;
373		goto error;
374	}
375	return 0;
376
377error:
378	r600_pipe_shader_destroy(ctx, shader);
379	return r;
380}
381
/* Free everything owned by a compiled shader: the GPU bytecode buffer, the
 * in-memory bytecode (if its CF list was ever initialized) and the command
 * buffer. Safe to call on a partially-built shader (the error path in
 * r600_pipe_shader_create relies on this). */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	/* Only clear bytecode whose CF list is linked; a shader that failed
	 * early may never have initialized it. */
	if (list_is_linked(&shader->shader.bc.cf))
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
389
390/*
391 * tgsi -> r600 shader
392 */
393struct r600_shader_tgsi_instruction;
394
/* Decoded TGSI source operand: register/constant selection, per-channel
 * swizzle, source modifiers and — for immediates — the literal values. */
struct r600_shader_src {
	unsigned				sel;		/* register / special-value select */
	unsigned				swizzle[4];	/* per-channel swizzle */
	unsigned				neg;		/* negate modifier */
	unsigned				abs;		/* absolute-value modifier */
	unsigned				rel;		/* relative (indirect) addressing */
	unsigned				kc_bank;	/* constant-cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];	/* literal payload when sel is a literal */
};
405
/* State of one evergreen barycentric interpolator slot; see
 * r600_shader_ctx::eg_interpolators for the slot indexing scheme. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;	/* index of the i/j coordinate pair used for interpolation */
};
410
/* Transient per-compilation state for translating one TGSI shader into
 * r600 bytecode. Created for a single run of the translator; not shared. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_array_info			*array_infos;
	/* flag for each tgsi temp array if its been spilled or not */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;		/* PIPE_SHADER_* stage */
	unsigned				file_offset[TGSI_FILE_COUNT];	/* GPR base per TGSI register file */
	unsigned				temp_reg;	/* first driver-reserved temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info;	/* handler for the instruction being translated */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];		/* decoded sources of the current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;	/* temps handed out past temp_reg (see r600_get_temp) */
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int					helper_invoc_reg;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess input offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
};
455
/* One entry of the per-family TGSI opcode dispatch tables: the hardware ALU
 * op plus the callback that emits bytecode for the opcode. */
struct r600_shader_tgsi_instruction {
	unsigned	op;		/* hardware opcode (ALU_OP*, family-specific) */
	int (*process)(struct r600_shader_ctx *ctx);	/* emitter; returns 0 or negative errno */
};
460
461static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
462static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
463static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
464static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
465static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
466static int tgsi_else(struct r600_shader_ctx *ctx);
467static int tgsi_endif(struct r600_shader_ctx *ctx);
468static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
469static int tgsi_endloop(struct r600_shader_ctx *ctx);
470static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
471static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
472                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
473                                unsigned int dst_reg);
474static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
475			const struct r600_shader_src *shader_src,
476			unsigned chan);
477static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
478			       unsigned dst_reg, unsigned mask);
479
480static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
481{
482	if (ctx->bc->family == CHIP_HEMLOCK ||
483	    ctx->bc->family == CHIP_CYPRESS ||
484	    ctx->bc->family == CHIP_JUNIPER)
485		return false;
486	return true;
487}
488
/* Return the index of the highest channel set in a 4-bit writemask
 * (0 when the mask is empty). */
static int tgsi_last_instruction(unsigned writemask)
{
	for (int chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
500
/* Reject TGSI instructions this backend cannot translate.
 *
 * Returns 0 if supported, -EINVAL otherwise. Checks:
 *  - multiple destinations (only DFRACEXP may have two),
 *  - 2-D (dimensioned) source registers, allowed only for constants,
 *    HW atomics, stage-appropriate inputs, and TCS outputs,
 *  - 2-D destination registers, allowed only in TCS. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				/* GS/TCS/TES inputs are per-vertex, hence 2-D. */
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				FALLTHROUGH;
			case TGSI_FILE_OUTPUT:
				/* TCS can read back its own per-vertex outputs. */
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				FALLTHROUGH;
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* Only TCS writes dimensioned (per-vertex) outputs. */
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
550
551int eg_get_interpolator_index(unsigned interpolate, unsigned location)
552{
553	if (interpolate == TGSI_INTERPOLATE_COLOR ||
554		interpolate == TGSI_INTERPOLATE_LINEAR ||
555		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
556	{
557		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
558		int loc;
559
560		switch(location) {
561		case TGSI_INTERPOLATE_LOC_CENTER:
562			loc = 1;
563			break;
564		case TGSI_INTERPOLATE_LOC_CENTROID:
565			loc = 2;
566			break;
567		case TGSI_INTERPOLATE_LOC_SAMPLE:
568		default:
569			loc = 0; break;
570		}
571
572		return is_linear * 3 + loc;
573	}
574
575	return -1;
576}
577
578static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
579		int input)
580{
581	int i = eg_get_interpolator_index(
582		ctx->shader->input[input].interpolate,
583		ctx->shader->input[input].interpolate_location);
584	assert(i >= 0);
585	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
586}
587
/* Emit the evergreen barycentric interpolation sequence for one input:
 * two 4-slot ALU groups (INTERP_ZW then INTERP_XY) that together produce
 * the interpolated .xyzw of the input's GPR.
 * Returns 0 on success or the r600_bytecode_add_alu error. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* Two i/j pairs are packed per GPR; base_chan points at the second
	 * channel of the pair so (base_chan - (i % 2)) below alternates
	 * between the two channels of the pair. */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* Slots 0-3 form the INTERP_ZW group, slots 4-7 INTERP_XY. */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* Only slots 2,3 (the .zw results) and 4,5 (the .xy results)
		 * write to the input's GPR; the rest are issued for the
		 * group but their results are discarded. */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		/* Parameter (attribute) to interpolate, by LDS position. */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
628
629static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
630{
631	int i, r;
632	struct r600_bytecode_alu alu;
633
634	for (i = 0; i < 4; i++) {
635		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
636
637		alu.op = ALU_OP1_INTERP_LOAD_P0;
638
639		alu.dst.sel = ctx->shader->input[input].gpr;
640		alu.dst.write = 1;
641
642		alu.dst.chan = i;
643
644		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
645		alu.src[0].chan = i;
646
647		if (i == 3)
648			alu.last = 1;
649		r = r600_bytecode_add_alu(ctx->bc, &alu);
650		if (r)
651			return r;
652	}
653	return 0;
654}
655
656/*
657 * Special export handling in shaders
658 *
659 * shader export ARRAY_BASE for EXPORT_POS:
660 * 60 is position
661 * 61 is misc vector
662 * 62, 63 are clip distance vectors
663 *
664 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
665 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
666 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
667 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
668 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
669 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
670 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
671 * exclusive from render target index)
672 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
673 *
674 *
675 * shader export ARRAY_BASE for EXPORT_PIXEL:
676 * 0-7 CB targets
677 * 61 computed Z vector
678 *
679 * The use of the values exported in the computed Z vector are controlled
680 * by DB_SHADER_CONTROL:
681 * Z_EXPORT_ENABLE - Z as a float in RED
682 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
683 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
684 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
685 * DB_SOURCE_FORMAT - export control restrictions
686 *
687 */
688
689
690/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
691static int r600_spi_sid(struct r600_shader_io * io)
692{
693	int index, name = io->name;
694
695	/* These params are handled differently, they don't need
696	 * semantic indices, so we'll use 0 for them.
697	 */
698	if (name == TGSI_SEMANTIC_POSITION ||
699	    name == TGSI_SEMANTIC_PSIZE ||
700	    name == TGSI_SEMANTIC_EDGEFLAG ||
701	    name == TGSI_SEMANTIC_FACE ||
702	    name == TGSI_SEMANTIC_SAMPLEMASK)
703		index = 0;
704	else {
705		if (name == TGSI_SEMANTIC_GENERIC) {
706			/* For generic params simply use sid from tgsi */
707			index = 9 + io->sid;
708		} else if (name == TGSI_SEMANTIC_TEXCOORD) {
709			index = io->sid;
710		} else {
711			/* For non-generic params - pack name and sid into 8 bits */
712			index = 0x80 | (name<<3) | (io->sid);
713		}
714
715		/* Make sure that all really used indices have nonzero value, so
716		 * we can just compare it to 0 later instead of comparing the name
717		 * with different values to detect special cases. */
718		index++;
719	}
720
721	return index;
722};
723
724/* we need this to get a common lds index for vs/tcs/tes input/outputs */
725int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
726{
727	switch (semantic_name) {
728	case TGSI_SEMANTIC_POSITION:
729		return 0;
730	case TGSI_SEMANTIC_PSIZE:
731		return 1;
732	case TGSI_SEMANTIC_CLIPDIST:
733		assert(index <= 1);
734		return 2 + index;
735	case TGSI_SEMANTIC_TEXCOORD:
736		return 4 + index;
737	case TGSI_SEMANTIC_GENERIC:
738		if (index <= 63-4)
739			return 4 + index;
740		else
741			/* same explanation as in the default statement,
742			 * the only user hitting this is st/nine.
743			 */
744			return 0;
745
746	/* patch indices are completely separate and thus start from 0 */
747	case TGSI_SEMANTIC_TESSOUTER:
748		return 0;
749	case TGSI_SEMANTIC_TESSINNER:
750		return 1;
751	case TGSI_SEMANTIC_PATCH:
752		return 2 + index;
753
754	default:
755		/* Don't fail here. The result of this function is only used
756		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
757		 * occur, but this function is called for all vertex shaders
758		 * before it's known whether LS will be compiled or not.
759		 */
760		return 0;
761	}
762}
763
764/* turn input into interpolate on EG */
765static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
766{
767	int r = 0;
768
769	if (ctx->shader->input[index].spi_sid) {
770		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
771		if (ctx->shader->input[index].interpolate > 0) {
772			evergreen_interp_assign_ij_index(ctx, index);
773			r = evergreen_interp_alu(ctx, index);
774		} else {
775			r = evergreen_interp_flat(ctx, index);
776		}
777	}
778	return r;
779}
780
781static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
782{
783	struct r600_bytecode_alu alu;
784	int i, r;
785	int gpr_front = ctx->shader->input[front].gpr;
786	int gpr_back = ctx->shader->input[back].gpr;
787
788	for (i = 0; i < 4; i++) {
789		memset(&alu, 0, sizeof(alu));
790		alu.op = ALU_OP3_CNDGT;
791		alu.is_op3 = 1;
792		alu.dst.write = 1;
793		alu.dst.sel = gpr_front;
794		alu.src[0].sel = ctx->face_gpr;
795		alu.src[1].sel = gpr_front;
796		alu.src[2].sel = gpr_back;
797
798		alu.dst.chan = i;
799		alu.src[1].chan = i;
800		alu.src[2].chan = i;
801		alu.last = (i==3);
802
803		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
804			return r;
805	}
806
807	return 0;
808}
809
810/* execute a single slot ALU calculation */
811static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
812			  int dst_sel, int dst_chan,
813			  int src0_sel, unsigned src0_chan_val,
814			  int src1_sel, unsigned src1_chan_val)
815{
816	struct r600_bytecode_alu alu;
817	int r, i;
818
819	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
820		for (i = 0; i < 4; i++) {
821			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
822			alu.op = op;
823			alu.src[0].sel = src0_sel;
824			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
825				alu.src[0].value = src0_chan_val;
826			else
827				alu.src[0].chan = src0_chan_val;
828			alu.src[1].sel = src1_sel;
829			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
830				alu.src[1].value = src1_chan_val;
831			else
832				alu.src[1].chan = src1_chan_val;
833			alu.dst.sel = dst_sel;
834			alu.dst.chan = i;
835			alu.dst.write = i == dst_chan;
836			alu.last = (i == 3);
837			r = r600_bytecode_add_alu(ctx->bc, &alu);
838			if (r)
839				return r;
840		}
841		return 0;
842	}
843
844	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
845	alu.op = op;
846	alu.src[0].sel = src0_sel;
847	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
848		alu.src[0].value = src0_chan_val;
849	else
850		alu.src[0].chan = src0_chan_val;
851	alu.src[1].sel = src1_sel;
852	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
853		alu.src[1].value = src1_chan_val;
854	else
855		alu.src[1].chan = src1_chan_val;
856	alu.dst.sel = dst_sel;
857	alu.dst.chan = dst_chan;
858	alu.dst.write = 1;
859	alu.last = 1;
860	r = r600_bytecode_add_alu(ctx->bc, &alu);
861	if (r)
862		return r;
863	return 0;
864}
865
866/* execute a single slot ALU calculation */
867static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
868			  int dst_sel, int dst_chan,
869			  int src0_sel, unsigned src0_chan_val,
870			  int src1_sel, unsigned src1_chan_val,
871			  int src2_sel, unsigned src2_chan_val)
872{
873	struct r600_bytecode_alu alu;
874	int r;
875
876	/* validate this for other ops */
877	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
878	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
879	alu.op = op;
880	alu.src[0].sel = src0_sel;
881	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
882		alu.src[0].value = src0_chan_val;
883	else
884		alu.src[0].chan = src0_chan_val;
885	alu.src[1].sel = src1_sel;
886	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
887		alu.src[1].value = src1_chan_val;
888	else
889		alu.src[1].chan = src1_chan_val;
890	alu.src[2].sel = src2_sel;
891	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
892		alu.src[2].value = src2_chan_val;
893	else
894		alu.src[2].chan = src2_chan_val;
895	alu.dst.sel = dst_sel;
896	alu.dst.chan = dst_chan;
897	alu.is_op3 = 1;
898	alu.last = 1;
899	r = r600_bytecode_add_alu(ctx->bc, &alu);
900	if (r)
901		return r;
902	return 0;
903}
904
905/* put it in temp_reg.x */
906static int get_lds_offset0(struct r600_shader_ctx *ctx,
907			   int rel_patch_chan,
908			   int temp_reg, bool is_patch_var)
909{
910	int r;
911
912	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
913	/* ADD
914	   Dimension - patch0_offset (input_vals.z),
915	   Non-dim - patch0_data_offset (input_vals.w)
916	*/
917	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
918			   temp_reg, 0,
919			   ctx->tess_output_info, 0,
920			   0, rel_patch_chan,
921			   ctx->tess_output_info, is_patch_var ? 3 : 2);
922	if (r)
923		return r;
924	return 0;
925}
926
927static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
928{
929	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
930}
931
932static int r600_get_temp(struct r600_shader_ctx *ctx)
933{
934	return ctx->temp_reg + ctx->max_driver_temp_used++;
935}
936
937static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
938{
939	int i;
940	i = ctx->shader->noutput++;
941	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
942	ctx->shader->output[i].sid = 0;
943	ctx->shader->output[i].gpr = 0;
944	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
945	ctx->shader->output[i].write_mask = 0x4;
946	ctx->shader->output[i].spi_sid = prim_id_sid;
947
948	return 0;
949}
950
951static int tgsi_barrier(struct r600_shader_ctx *ctx)
952{
953	struct r600_bytecode_alu alu;
954	int r;
955
956	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
957	alu.op = ctx->inst_info->op;
958	alu.last = 1;
959
960	r = r600_bytecode_add_alu(ctx->bc, &alu);
961	if (r)
962		return r;
963	return 0;
964}
965
/* Greedily spill TGSI temp arrays to scratch until the GPR count fits the
 * hardware budget. On return *regno has been reduced by the spilled sizes
 * and *scratch_space_needed holds the total spilled size (in vec4 slots). */
static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	// pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; // assumed calloc:ed

	*scratch_space_needed = 0;
	/* 124 appears to be the usable GPR budget here — NOTE(review):
	 * confirm against the register allocator's limit. */
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		/* Linear scan for the largest not-yet-spilled array. */
		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		/* Decrement unconditionally so the loop always terminates. */
		narrays_left --;
	}

	/* All arrays spilled: temporaries are no longer addressed
	 * indirectly through GPRs. */
	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}
998
/* Take spilled temp arrays into account when translating tgsi register
 * indexes into r600 gprs if spilled is false, or scratch array offset if
 * spilled is true */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	/* running total of spilled-array slots that precede tgsi_reg_index;
	 * doubles as the scratch offset for spilled hits and as the number
	 * of GPR "holes" to compact away for regular registers */
	unsigned spilled_size = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		/* early-out relies on array_infos being sorted by ascending
		 * range.First — TODO confirm the scan pass guarantees this */
		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}
1033
1034/* look up spill area base offset and array size for a spilled temp array */
1035static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
1036	unsigned *array_base, unsigned *array_size)
1037{
1038	unsigned i;
1039	unsigned offset = 0;
1040
1041	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
1042		if (ctx->spilled_arrays[i]) {
1043			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1044
1045			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
1046				*array_base = offset;
1047				*array_size = size - 1; /* hw counts from 1 */
1048
1049				return;
1050			}
1051
1052			offset += size;
1053		}
1054	}
1055}
1056
/* Process one TGSI declaration token.
 *
 * Populates the r600 shader's input/output/atomic tables, records GPR
 * arrays for indirectly addressed temporaries (unless spilled), and for
 * some system values emits the code that materializes them (tess
 * factors via LDS fetch, tess coordinates via ALU ops into GPR1).
 * Returns 0 on success or a negative errno on unsupported files. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		/* one table entry per declared register in the range */
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* track which of the misc-vector outputs this
				 * shader writes so the export state can be set up */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				/* only non-spilled arrays occupy a GPR range */
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	/* these files need no per-declaration bookkeeping here */
	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		/* record one atomic-counter range per declaration */
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* fetch the tess factors from LDS into a fixed GPR:
			 * inner -> GPR3, outer -> GPR2 */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			/* advance to the factor's slot (16 bytes per vec4 param) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* build (u, v, 1-u-v) in GPR1 from the hw-provided
			 * coords in GPR0 */
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			/* NOTE(review): alu is deliberately reused without a
			 * fresh memset; every field that differs from the
			 * previous instruction is overwritten below */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1278
/* Scan the TGSI tokens and reserve GPRs for sample-related system values.
 *
 * Two candidate inputs are tracked: SAMPLEMASK (shares the front-face
 * GPR, value in .z) and SAMPLEID/SAMPLEPOS (fixed-point position GPR,
 * sample id in .w).  The scan also enables the evergreen interpolators
 * required by interpolateAt* opcodes, since those influence how many
 * GPRs the interpolants consume ahead of the system-value registers.
 *
 * Returns the first free GPR after the reserved ones
 * (gpr_offset + interpolant GPRs on evergreen + system-value GPRs). */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;           /* where to store the allocated GPR */
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* enable the interpolator this instruction needs */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* reading the sample mask with per-sample shading also requires the
	 * fixed-point position register */
	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		/* two barycentric (i,j) pairs pack into one GPR */
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1378
1379/*
1380 * for evergreen we need to scan the shader to find the number of GPRs we need to
1381 * reserve for interpolation and system values
1382 *
1383 * we need to know if we are going to emit any sample or centroid inputs
1384 * if perspective and linear are required
1385*/
1386static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1387{
1388	unsigned i;
1389
1390	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1391
1392	/*
1393	 * Could get this information from the shader info. But right now
1394	 * we interpolate all declared inputs, whereas the shader info will
1395	 * only contain the bits if the inputs are actually used, so it might
1396	 * not be safe...
1397	 */
1398	for (i = 0; i < ctx->info.num_inputs; i++) {
1399		int k;
1400		/* skip position/face/mask/sampleid */
1401		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1402		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1403		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1404		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1405			continue;
1406
1407		k = eg_get_interpolator_index(
1408			ctx->info.input_interpolate[i],
1409			ctx->info.input_interpolate_loc[i]);
1410		if (k >= 0)
1411			ctx->eg_interpolators[k].enabled = TRUE;
1412	}
1413
1414	/* XXX PULL MODEL and LINE STIPPLE */
1415
1416	return allocate_system_value_inputs(ctx, 0);
1417}
1418
1419/* sample_id_sel == NULL means fetch for current sample */
1420static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1421{
1422	struct r600_bytecode_vtx vtx;
1423	int r, t1;
1424
1425	t1 = r600_get_temp(ctx);
1426
1427	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1428	vtx.op = FETCH_OP_VFETCH;
1429	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1430	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1431	if (sample_id == NULL) {
1432		assert(ctx->fixed_pt_position_gpr != -1);
1433
1434		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1435		vtx.src_sel_x = 3;
1436	}
1437	else {
1438		struct r600_bytecode_alu alu;
1439
1440		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1441		alu.op = ALU_OP1_MOV;
1442		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1443		alu.dst.sel = t1;
1444		alu.dst.write = 1;
1445		alu.last = 1;
1446		r = r600_bytecode_add_alu(ctx->bc, &alu);
1447		if (r)
1448			return r;
1449
1450		vtx.src_gpr = t1;
1451		vtx.src_sel_x = 0;
1452	}
1453	vtx.mega_fetch_count = 16;
1454	vtx.dst_gpr = t1;
1455	vtx.dst_sel_x = 0;
1456	vtx.dst_sel_y = 1;
1457	vtx.dst_sel_z = 2;
1458	vtx.dst_sel_w = 3;
1459	vtx.data_format = FMT_32_32_32_32_FLOAT;
1460	vtx.num_format_all = 2;
1461	vtx.format_comp_all = 1;
1462	vtx.use_const_fields = 0;
1463	vtx.offset = 0;
1464	vtx.endian = r600_endian_swap(32);
1465	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1466
1467	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1468	if (r)
1469		return r;
1470
1471	return t1;
1472}
1473
/* Compute gl_HelperInvocation on evergreen.
 *
 * First set helper_invoc_reg.x to ~0 for every lane, then issue a
 * resinfo vertex fetch in valid-pixel mode (vpm); only non-helper
 * lanes execute the fetch, overwriting their .x, so helper lanes keep
 * the ~0 marker.  Returns 0 on success or a bytecode error. */
static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* do a vtx fetch with wqm set on the vtx fetch */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4;		/* SEL_0 — presumably writes constant 0 to .x; confirm against ISA */
	vtx.dst_sel_y = 7;		/* SEL_Y */
	vtx.dst_sel_z = 7;		/* SEL_Z */
	vtx.dst_sel_w = 7;		/* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	/* restrict the fetch to valid (non-helper) pixels */
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
1511
1512static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1513{
1514	int r;
1515	struct r600_bytecode_alu alu;
1516
1517	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1518	alu.op = ALU_OP1_MOV;
1519	alu.dst.sel = ctx->helper_invoc_reg;
1520	alu.dst.chan = 0;
1521	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1522	alu.src[0].value = 0xffffffff;
1523	alu.dst.write = 1;
1524	alu.last = 1;
1525	r = r600_bytecode_add_alu(ctx->bc, &alu);
1526	if (r)
1527		return r;
1528
1529	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1530	alu.op = ALU_OP1_MOV;
1531	alu.dst.sel = ctx->helper_invoc_reg;
1532	alu.dst.chan = 0;
1533	alu.src[0].sel = V_SQ_ALU_SRC_0;
1534	alu.dst.write = 1;
1535	alu.last = 1;
1536	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1537	if (r)
1538		return r;
1539
1540	return ctx->helper_invoc_reg;
1541}
1542
1543static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1544{
1545	struct r600_bytecode_vtx vtx;
1546	int r, t1;
1547
1548	if (ctx->cs_block_size_loaded)
1549		return ctx->cs_block_size_reg;
1550	if (ctx->cs_grid_size_loaded)
1551		return ctx->cs_grid_size_reg;
1552
1553	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1554	struct r600_bytecode_alu alu;
1555	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1556	alu.op = ALU_OP1_MOV;
1557	alu.src[0].sel = V_SQ_ALU_SRC_0;
1558	alu.dst.sel = t1;
1559	alu.dst.write = 1;
1560	alu.last = 1;
1561	r = r600_bytecode_add_alu(ctx->bc, &alu);
1562	if (r)
1563		return r;
1564
1565	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1566	vtx.op = FETCH_OP_VFETCH;
1567	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1568	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1569	vtx.src_gpr = t1;
1570	vtx.src_sel_x = 0;
1571
1572	vtx.mega_fetch_count = 16;
1573	vtx.dst_gpr = t1;
1574	vtx.dst_sel_x = 0;
1575	vtx.dst_sel_y = 1;
1576	vtx.dst_sel_z = 2;
1577	vtx.dst_sel_w = 7;
1578	vtx.data_format = FMT_32_32_32_32;
1579	vtx.num_format_all = 1;
1580	vtx.format_comp_all = 0;
1581	vtx.use_const_fields = 0;
1582	vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1583	vtx.endian = r600_endian_swap(32);
1584	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1585
1586	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1587	if (r)
1588		return r;
1589
1590	if (load_block)
1591		ctx->cs_block_size_loaded = true;
1592	else
1593		ctx->cs_grid_size_loaded = true;
1594	return t1;
1595}
1596
/* Translate a TGSI source operand into an r600_shader_src.
 *
 * Copies swizzle/negate/absolute, then resolves the register file:
 * - TEMPORARY: map to a GPR, or emit a scratch read (R600: export-read
 *   CF, R700+: READ_SCRATCH vertex fetch) when the array was spilled.
 * - IMMEDIATE: try to match a hardware inline constant, else attach the
 *   literal values.
 * - SYSTEM_VALUE: pick the fixed GPR/channel that holds each semantic.
 * - everything else: index plus the file's GPR offset.
 *
 * NOTE(review): this function returns void, so bytecode-emission
 * failures in the spill path are silently dropped. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			/* read the spilled value from scratch memory into a
			 * fresh temp GPR */
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				/* R600: scratch reads go through an export-read CF */
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					/* indexed read: AR supplies the offset */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					/* direct read at a fixed element */
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				/* R700+: scratch reads use a vertex fetch */
				struct r600_bytecode_vtx vtx;

				/* flush any pending scratch write before reading */
				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					/* NOTE(review): this r is overwritten below
					 * without being checked */
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			/* NOTE(review): error is swallowed — the function
			 * returns void */
			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* a splat immediate may match one of the hardware's inline
		 * constants (0, 1, 0.5, ...) and need no literal slot */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* each system value lives in a fixed GPR/channel dictated by
		 * the hardware ABI of the current shader stage */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TESS_CTRL invocation id lives in GPR0.z instead */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer index (and possible
		 * indirect buffer indexing) in the Dimension token */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1809
1810static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1811                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1812                                unsigned int dst_reg)
1813{
1814	struct r600_bytecode_vtx vtx;
1815	unsigned int ar_reg;
1816	int r;
1817
1818	if (offset) {
1819		struct r600_bytecode_alu alu;
1820
1821		memset(&alu, 0, sizeof(alu));
1822
1823		alu.op = ALU_OP2_ADD_INT;
1824		alu.src[0].sel = ctx->bc->ar_reg;
1825		alu.src[0].chan = ar_chan;
1826
1827		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1828		alu.src[1].value = offset;
1829
1830		alu.dst.sel = dst_reg;
1831		alu.dst.chan = ar_chan;
1832		alu.dst.write = 1;
1833		alu.last = 1;
1834
1835		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1836			return r;
1837
1838		ar_reg = dst_reg;
1839	} else {
1840		ar_reg = ctx->bc->ar_reg;
1841	}
1842
1843	memset(&vtx, 0, sizeof(vtx));
1844	vtx.buffer_id = cb_idx;
1845	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1846	vtx.src_gpr = ar_reg;
1847	vtx.src_sel_x = ar_chan;
1848	vtx.mega_fetch_count = 16;
1849	vtx.dst_gpr = dst_reg;
1850	vtx.dst_sel_x = 0;		/* SEL_X */
1851	vtx.dst_sel_y = 1;		/* SEL_Y */
1852	vtx.dst_sel_z = 2;		/* SEL_Z */
1853	vtx.dst_sel_w = 3;		/* SEL_W */
1854	vtx.data_format = FMT_32_32_32_32_FLOAT;
1855	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
1856	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
1857	vtx.endian = r600_endian_swap(32);
1858	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1859
1860	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1861		return r;
1862
1863	return 0;
1864}
1865
/* Fetch one per-vertex GS input (a full vec4) from the ESGS ring into
 * dst_reg.xyzw via a VTX fetch.
 *
 * Handles indirect vertex indexing (src->Dimension.Indirect) and
 * indirect register indexing (src->Register.Indirect).  Returns 0 on
 * success or an error code from the bytecode emitters. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	/* three per-vertex ring offsets live in each (rotated) input GPR */
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* R0.z is taken by PrimitiveID, so vertex 2 of the first register
	 * lives in .w instead of .z */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	/* a scratch temp is needed for any kind of indirect addressing below */
	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;	/* NOTE: this r shadows the outer one */
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		/* For address registers other than ADDR[0], mirror the value
		 * into bc->ar_reg, which rel=1 GPR addressing reads from. */
		if (src->DimIndirect.Index > 0) {
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		/* scatter the three per-vertex ring offsets into the .x
		 * channels of three consecutive temps */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;	/* skip .z (PrimitiveID) */
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x -- select the offset of the addressed vertex */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		/* first register of the indirectly addressed array */
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		/* t2.y = addr + first */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* t2.x = t2.y * 4 + vertex base offset
		 * (note(review): the *4 scale suggests the ring index is in
		 * dword units -- confirm against the VTX fetch semantics) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		/* the constant part of the fetch offset is now relative to
		 * the start of the array */
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		/* EG+: data format comes from the fetch constant */
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1982
1983static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1984{
1985	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1986	unsigned i;
1987
1988	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1989		struct tgsi_full_src_register *src = &inst->Src[i];
1990
1991		if (src->Register.File == TGSI_FILE_INPUT) {
1992			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1993				/* primitive id is in R0.z */
1994				ctx->src[i].sel = 0;
1995				ctx->src[i].swizzle[0] = 2;
1996			}
1997		}
1998		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1999			int treg = r600_get_temp(ctx);
2000
2001			fetch_gs_input(ctx, src, treg);
2002			ctx->src[i].sel = treg;
2003			ctx->src[i].rel = 0;
2004		}
2005	}
2006	return 0;
2007}
2008
2009
2010/* Tessellation shaders pass outputs to the next shader using LDS.
2011 *
2012 * LS outputs = TCS(HS) inputs
2013 * TCS(HS) outputs = TES(DS) inputs
2014 *
2015 * The LDS layout is:
2016 * - TCS inputs for patch 0
2017 * - TCS inputs for patch 1
2018 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
2019 * - ...
2020 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
2021 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
2022 * - TCS outputs for patch 1
2023 * - Per-patch TCS outputs for patch 1
2024 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
2025 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2026 * - ...
2027 *
2028 * All three shaders VS(LS), TCS, TES share the same LDS space.
2029 */
/* this will return with the byte address in temp_reg.x */
/* Compute the LDS byte address of a tessellation input/output register.
 *
 * Exactly one of dst/src describes the register (the address math is
 * identical for both).  On entry temp_reg.x holds the base address of
 * the relevant LDS area; on return temp_reg.x additionally includes the
 * vertex offset (for 2D registers), the indirect element offset, and
 * 16 bytes per LDS parameter slot.  Returns 0 or an emitter error;
 * -1 for an unsupported register file. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += stride_bytes * vertex_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* select the semantic tables for the register file */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x += 16 * addr (16 bytes per vec4 slot) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
2132
/* Read up to four dwords from LDS into dst_reg.
 *
 * temp_reg.x must hold the byte address of component 0.  For every
 * channel set in mask: compute the channel address (base + 4*i), issue
 * an LDS_READ_RET, then pop the result from the LDS output queue into
 * dst_reg's channel.  Returns 0 or an emitter error. */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* note(review): start a fresh CF clause when the current one is
	 * nearly full (ndw>>1 = used instruction slots), presumably so the
	 * read/pop sequence is not split across a clause boundary */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	/* per-channel addresses: temp_reg.i = temp_reg.x + 4*i
	 * (channel 0 already holds the base, so start at 1) */
	lasti = tgsi_last_instruction(mask);
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* results come back in issue order through output queue A */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2192
2193static int fetch_mask(struct tgsi_src_register *reg)
2194{
2195	int mask = 0;
2196	mask |= 1 << reg->SwizzleX;
2197	mask |= 1 << reg->SwizzleY;
2198	mask |= 1 << reg->SwizzleZ;
2199	mask |= 1 << reg->SwizzleW;
2200	return mask;
2201}
2202
2203static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2204{
2205	int r;
2206	unsigned temp_reg = r600_get_temp(ctx);
2207
2208	r = get_lds_offset0(ctx, 2, temp_reg,
2209			    src->Register.Dimension ? false : true);
2210	if (r)
2211		return r;
2212
2213	/* the base address is now in temp.x */
2214	r = r600_get_byte_address(ctx, temp_reg,
2215				  NULL, src, ctx->tess_output_info, 1);
2216	if (r)
2217		return r;
2218
2219	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2220	if (r)
2221		return r;
2222	return 0;
2223}
2224
2225static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2226{
2227	int r;
2228	unsigned temp_reg = r600_get_temp(ctx);
2229
2230	/* t.x = ips * r0.y */
2231	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2232			   temp_reg, 0,
2233			   ctx->tess_input_info, 0,
2234			   0, 1);
2235
2236	if (r)
2237		return r;
2238
2239	/* the base address is now in temp.x */
2240	r = r600_get_byte_address(ctx, temp_reg,
2241				  NULL, src, ctx->tess_input_info, 1);
2242	if (r)
2243		return r;
2244
2245	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2246	if (r)
2247		return r;
2248	return 0;
2249}
2250
2251static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2252{
2253	int r;
2254	unsigned temp_reg = r600_get_temp(ctx);
2255
2256	r = get_lds_offset0(ctx, 1, temp_reg,
2257			    src->Register.Dimension ? false : true);
2258	if (r)
2259		return r;
2260	/* the base address is now in temp.x */
2261	r = r600_get_byte_address(ctx, temp_reg,
2262				  NULL, src,
2263				  ctx->tess_output_info, 1);
2264	if (r)
2265		return r;
2266
2267	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2268	if (r)
2269		return r;
2270	return 0;
2271}
2272
2273static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2274{
2275	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2276	unsigned i;
2277
2278	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2279		struct tgsi_full_src_register *src = &inst->Src[i];
2280
2281		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2282			int treg = r600_get_temp(ctx);
2283			fetch_tes_input(ctx, src, treg);
2284			ctx->src[i].sel = treg;
2285			ctx->src[i].rel = 0;
2286		}
2287		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2288			int treg = r600_get_temp(ctx);
2289			fetch_tcs_input(ctx, src, treg);
2290			ctx->src[i].sel = treg;
2291			ctx->src[i].rel = 0;
2292		}
2293		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2294			int treg = r600_get_temp(ctx);
2295			fetch_tcs_output(ctx, src, treg);
2296			ctx->src[i].sel = treg;
2297			ctx->src[i].rel = 0;
2298		}
2299	}
2300	return 0;
2301}
2302
/* Lower constant-file operands that the ALU cannot read directly.
 *
 * Fills ctx->src[] from the TGSI operands, then: relatively addressed
 * constants are always fetched into a temp; when an instruction has
 * several direct constant sources, all but the last are MOVed into
 * temps (note(review): presumably to respect the HW limit on constant
 * reads per instruction group -- confirm against the ISA docs).
 * Returns 0 or an emitter error. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* count constant operands and translate all operands */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts the constants still to lower; the last one may stay */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* relative constant: fetch the addressed element
			 * into a temp (sel - 512 = constant index) */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy all four channels of the constant to a temp */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
2357
2358/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2359static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2360{
2361	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2362	struct r600_bytecode_alu alu;
2363	int i, j, k, nliteral, r;
2364
2365	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2366		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2367			nliteral++;
2368		}
2369	}
2370	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2371		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2372			int treg = r600_get_temp(ctx);
2373			for (k = 0; k < 4; k++) {
2374				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2375				alu.op = ALU_OP1_MOV;
2376				alu.src[0].sel = ctx->src[i].sel;
2377				alu.src[0].chan = k;
2378				alu.src[0].value = ctx->src[i].value[k];
2379				alu.dst.sel = treg;
2380				alu.dst.chan = k;
2381				alu.dst.write = 1;
2382				if (k == 3)
2383					alu.last = 1;
2384				r = r600_bytecode_add_alu(ctx->bc, &alu);
2385				if (r)
2386					return r;
2387			}
2388			ctx->src[i].sel = treg;
2389			j--;
2390		}
2391	}
2392	return 0;
2393}
2394
2395static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2396{
2397	int i, r, count = ctx->shader->ninput;
2398
2399	for (i = 0; i < count; i++) {
2400		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2401			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2402			if (r)
2403				return r;
2404		}
2405	}
2406	return 0;
2407}
2408
/* Emit MEM_STREAM exports writing shader outputs to the streamout
 * (transform feedback) buffers.
 *
 * stream == -1 writes outputs of every stream; otherwise outputs whose
 * stream does not match are skipped.  Also accumulates
 * ctx->enabled_stream_buffers_mask.  Returns 0 on success, -EINVAL on
 * malformed streamout state, or an emitter error. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* shift the components down so they start at channel 0 */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		/* pick the CF opcode for the target buffer (and, on EG+,
		 * the target stream) */
		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			/* STREAMx_BUFy opcodes are laid out in groups of 4 */
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* pre-EG: one opcode per buffer, stream 0 only */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
					break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2530
2531static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2532{
2533	struct r600_bytecode_alu alu;
2534	unsigned reg;
2535
2536	if (!ctx->shader->vs_out_edgeflag)
2537		return;
2538
2539	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2540
2541	/* clamp(x, 0, 1) */
2542	memset(&alu, 0, sizeof(alu));
2543	alu.op = ALU_OP1_MOV;
2544	alu.src[0].sel = reg;
2545	alu.dst.sel = reg;
2546	alu.dst.write = 1;
2547	alu.dst.clamp = 1;
2548	alu.last = 1;
2549	r600_bytecode_add_alu(ctx->bc, &alu);
2550
2551	memset(&alu, 0, sizeof(alu));
2552	alu.op = ALU_OP1_FLT_TO_INT;
2553	alu.src[0].sel = reg;
2554	alu.dst.sel = reg;
2555	alu.dst.write = 1;
2556	alu.last = 1;
2557	r600_bytecode_add_alu(ctx->bc, &alu);
2558}
2559
/* Build the "GS copy shader": a small VS that runs after a geometry
 * shader, reads the emitted vertices back from the GSVS ring and does
 * the position/parameter exports (and streamout) a VS normally would.
 *
 * Returns the result of r600_bytecode_build(); note(review): allocation
 * failure of the copy shader also returns 0 here, indistinguishable
 * from success to the caller. */
int generate_gs_copy_shader(struct r600_context *rctx,
                            struct r600_pipe_shader *gs,
                            struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x packs the ring offset (low 30 bits) and the stream id
	 * (top 2 bits); split them apart.  src/dst default to R0.x
	 * because the alu struct is zeroed. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* streamout: one predicated block per used stream, guarded by
	 * "R0.y == ring" so only the matching stream's writes execute */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		/* ring 0 always gets a block; higher rings only if used */
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		/* jump target patched on the next iteration or after loop */
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* skip outputs captured only on non-zero streams; only
		 * stream 0 is passed on to rasterization */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* make sure at least one position export was emitted */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* ... and at least one parameter export */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* mark the final exports of each kind as DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the last predicated streamout block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one stack slot for the stream predicate push */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2870
2871static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2872{
2873	if (ind) {
2874		struct r600_bytecode_alu alu;
2875		int r;
2876
2877		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2878		alu.op = ALU_OP2_ADD_INT;
2879		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2880		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2881		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2882		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2883		alu.dst.write = 1;
2884		alu.last = 1;
2885		r = r600_bytecode_add_alu(ctx->bc, &alu);
2886		if (r)
2887			return r;
2888	}
2889	return 0;
2890}
2891
/* Write the shader's outputs for the current vertex to the GS ring.
 *
 * Used both by a GS (MEM_RINGx export per stream) and by an ES feeding
 * a GS (ctx->gs_for_vs set), in which case each output's ring offset is
 * looked up in the GS input layout by semantic name/sid.  `ind` selects
 * indirect ring addressing via the per-stream export offset register.
 * Always returns 0. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS: skip it */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* one MEM_RING opcode per target stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			/* indirect: constant part in array_base, variable part
			 * in the per-stream export offset register */
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	/* note(review): return value of r600_bytecode_add_output is not
	 * checked here */
	++ctx->gs_next_vertex;
	return 0;
}
2962
2963
2964static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2965{
2966	int r;
2967	struct r600_bytecode_vtx vtx;
2968	int temp_val = ctx->temp_reg;
2969	/* need to store the TCS output somewhere */
2970	r = single_alu_op2(ctx, ALU_OP1_MOV,
2971			   temp_val, 0,
2972			   V_SQ_ALU_SRC_LITERAL, 0,
2973			   0, 0);
2974	if (r)
2975		return r;
2976
2977	/* used by VS/TCS */
2978	if (ctx->tess_input_info) {
2979		/* fetch tcs input values into resv space */
2980		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2981		vtx.op = FETCH_OP_VFETCH;
2982		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2983		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2984		vtx.mega_fetch_count = 16;
2985		vtx.data_format = FMT_32_32_32_32;
2986		vtx.num_format_all = 2;
2987		vtx.format_comp_all = 1;
2988		vtx.use_const_fields = 0;
2989		vtx.endian = r600_endian_swap(32);
2990		vtx.srf_mode_all = 1;
2991		vtx.offset = 0;
2992		vtx.dst_gpr = ctx->tess_input_info;
2993		vtx.dst_sel_x = 0;
2994		vtx.dst_sel_y = 1;
2995		vtx.dst_sel_z = 2;
2996		vtx.dst_sel_w = 3;
2997		vtx.src_gpr = temp_val;
2998		vtx.src_sel_x = 0;
2999
3000		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3001		if (r)
3002			return r;
3003	}
3004
3005	/* used by TCS/TES */
3006	if (ctx->tess_output_info) {
3007		/* fetch tcs output values into resv space */
3008		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3009		vtx.op = FETCH_OP_VFETCH;
3010		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3011		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3012		vtx.mega_fetch_count = 16;
3013		vtx.data_format = FMT_32_32_32_32;
3014		vtx.num_format_all = 2;
3015		vtx.format_comp_all = 1;
3016		vtx.use_const_fields = 0;
3017		vtx.endian = r600_endian_swap(32);
3018		vtx.srf_mode_all = 1;
3019		vtx.offset = 16;
3020		vtx.dst_gpr = ctx->tess_output_info;
3021		vtx.dst_sel_x = 0;
3022		vtx.dst_sel_y = 1;
3023		vtx.dst_sel_z = 2;
3024		vtx.dst_sel_w = 3;
3025		vtx.src_gpr = temp_val;
3026		vtx.src_sel_x = 0;
3027
3028		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3029		if (r)
3030			return r;
3031	}
3032	return 0;
3033}
3034
/* Write all VS (as LS) outputs into LDS for a following TCS stage.
 * Each output vec4 occupies a 16-byte slot, addressed as
 * vertex_index * vertex_dw_stride + slot * 16; values are stored in
 * two LDS_WRITE_REL instructions of two dwords each (xy then zw).
 * Returns 0 on success or a negative error from bytecode emission. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		/* LDS vec4 slot of this output within the per-vertex layout */
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
						      ctx->shader->output[i].sid);

		/* temp.y = base(temp.x) + slot * 16 bytes; for slot 0 the
		 * base address in temp.x is used directly */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (zw), 8 bytes on */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* j==0 writes .xy at the slot address, j==1 writes .zw at +8 */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;        /* byte address */
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;          /* first dword of the pair */
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;    /* second dword of the pair */
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;                  /* second value goes to addr + 4 */
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3101
/* Store the current TGSI instruction's destination to LDS when it is a
 * TCS output.  Computes the LDS byte address from the patch layout
 * (per-patch vs per-vertex chosen by the Dimension flag), then writes
 * each enabled channel; adjacent xy or zw pairs are combined into one
 * two-dword LDS_WRITE_REL, remaining single channels use LDS_WRITE.
 * Returns 0 (also for non-OUTPUT destinations) or a negative error. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* no Dimension means a per-patch output */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* temp.i = temp.x + 4*i for every written channel; channel 0
	 * reuses the base address already in temp.x */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* xy or zw both written: emit one two-dword write and
		 * skip the second channel of the pair below */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;   /* byte address for this channel */
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;             /* second value goes to addr + 4 */
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* single-channel write */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		/* third operand unused by the 2-op write; pin it to zero */
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3185
3186static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3187				 int output_idx, int nc)
3188{
3189	int param;
3190	unsigned temp_reg = r600_get_temp(ctx);
3191	unsigned name = ctx->shader->output[output_idx].name;
3192	int dreg = ctx->shader->output[output_idx].gpr;
3193	int r;
3194
3195	param = r600_get_lds_unique_index(name, 0);
3196	r = get_lds_offset0(ctx, 1, temp_reg, true);
3197	if (r)
3198		return r;
3199
3200	if (param) {
3201		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3202				   temp_reg, 0,
3203				   temp_reg, 0,
3204				   V_SQ_ALU_SRC_LITERAL, param * 16);
3205		if (r)
3206			return r;
3207	}
3208
3209	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3210	return 0;
3211}
3212
/* Emit the tessellation-factor writes at the end of a TCS shader:
 * predicate so only one invocation per patch runs, gather the
 * TESSOUTER/TESSINNER values from LDS, compute the per-patch address
 * in the TF buffer, and issue GDS TF_WRITE pairs (index, value).
 * Returns 0 on success, a negative error otherwise. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.x, 0 */
	/* NOTE(review): chan 2 (.z) of GPR0 is what is actually compared
	 * here, not .x as the comment above says — verify register layout.
	 * src[1] stays literal 0 from the memset. */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	/* skip everything below for the non-zero invocations; the jump
	 * target is patched up once the POP is emitted */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* each treg holds up to two (index, value) pairs for TF_WRITE;
	 * allocate as many as the factor count of the prim mode needs */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	/* outer factors are mandatory */
	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* lay out (index, value) pairs: treg[i/2].x/.y for even i,
	 * treg[i/2].z/.w for odd i; outer factors first, then inner */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isolines store their two outer factors swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		/* TF buffer index for this component: base + 4*i bytes */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		/* the factor value itself, next channel up */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* one TF_WRITE per component, consuming the pairs built above */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);      /* index */
		gds.src_sel_y = 1 + (2 * (i % 2)); /* value */
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;                /* 7 = masked, no writeback */
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
3356
3357/*
3358 * We have to work out the thread ID for load and atomic
3359 * operations, which store the returned value to an index
3360 * in an intermediate buffer.
3361 * The index is calculated by taking the thread id,
3362 * calculated from the MBCNT instructions.
3363 * Then the shader engine ID is multiplied by 256,
3364 * and the wave id is added.
3365 * Then the result is multipled by 64 and thread id is
3366 * added.
3367 */
3368static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3369{
3370	struct r600_bytecode_alu alu;
3371	int r;
3372
3373	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3374	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3375	alu.dst.sel = ctx->temp_reg;
3376	alu.dst.chan = 0;
3377	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3378	alu.src[0].value = 0xffffffff;
3379	alu.dst.write = 1;
3380	r = r600_bytecode_add_alu(ctx->bc, &alu);
3381	if (r)
3382		return r;
3383
3384	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3385	alu.op = ALU_OP1_MBCNT_32HI_INT;
3386	alu.dst.sel = ctx->temp_reg;
3387	alu.dst.chan = 1;
3388	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3389	alu.src[0].value = 0xffffffff;
3390	alu.dst.write = 1;
3391	r = r600_bytecode_add_alu(ctx->bc, &alu);
3392	if (r)
3393		return r;
3394
3395	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3396	alu.op = ALU_OP3_MULADD_UINT24;
3397	alu.dst.sel = ctx->temp_reg;
3398	alu.dst.chan = 2;
3399	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3400	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3401	alu.src[1].value = 256;
3402	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3403	alu.dst.write = 1;
3404	alu.is_op3 = 1;
3405	alu.last = 1;
3406	r = r600_bytecode_add_alu(ctx->bc, &alu);
3407	if (r)
3408		return r;
3409
3410	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3411			   ctx->thread_id_gpr, 1,
3412			   ctx->temp_reg, 2,
3413			   V_SQ_ALU_SRC_LITERAL, 0x40,
3414			   ctx->temp_reg, 0);
3415	if (r)
3416		return r;
3417	return 0;
3418}
3419
3420static int r600_shader_from_tgsi(struct r600_context *rctx,
3421				 struct r600_pipe_shader *pipeshader,
3422				 union r600_shader_key key)
3423{
3424	struct r600_screen *rscreen = rctx->screen;
3425	struct r600_shader *shader = &pipeshader->shader;
3426	struct tgsi_token *tokens = pipeshader->selector->tokens;
3427	struct pipe_stream_output_info so = pipeshader->selector->so;
3428	struct tgsi_full_immediate *immediate;
3429	struct r600_shader_ctx ctx;
3430	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3431	unsigned output_done, noutput;
3432	unsigned opcode;
3433	int j, k, r = 0;
3434	unsigned i;
3435	int next_param_base = 0, next_clip_base;
3436	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3437	bool indirect_gprs;
3438	bool ring_outputs = false;
3439	bool lds_outputs = false;
3440	bool lds_inputs = false;
3441	bool pos_emitted = false;
3442
3443	ctx.bc = &shader->bc;
3444	ctx.shader = shader;
3445
3446	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3447			   rscreen->has_compressed_msaa_texturing);
3448	ctx.tokens = tokens;
3449	tgsi_scan_shader(tokens, &ctx.info);
3450	shader->indirect_files = ctx.info.indirect_files;
3451
3452	int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3453	ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3454	ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3455	tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3456
3457	shader->uses_helper_invocation = false;
3458	shader->uses_doubles = ctx.info.uses_doubles;
3459	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3460	shader->nsys_inputs = 0;
3461
3462	shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3463		ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3464	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3465	tgsi_parse_init(&ctx.parse, tokens);
3466	ctx.type = ctx.info.processor;
3467	shader->processor_type = ctx.type;
3468	ctx.bc->type = shader->processor_type;
3469
3470	switch (ctx.type) {
3471	case PIPE_SHADER_VERTEX:
3472		shader->vs_as_gs_a = key.vs.as_gs_a;
3473		shader->vs_as_es = key.vs.as_es;
3474		shader->vs_as_ls = key.vs.as_ls;
3475		shader->atomic_base = key.vs.first_atomic_counter;
3476		if (shader->vs_as_es)
3477			ring_outputs = true;
3478		if (shader->vs_as_ls)
3479			lds_outputs = true;
3480		break;
3481	case PIPE_SHADER_GEOMETRY:
3482		ring_outputs = true;
3483		shader->atomic_base = key.gs.first_atomic_counter;
3484		shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3485		break;
3486	case PIPE_SHADER_TESS_CTRL:
3487		shader->tcs_prim_mode = key.tcs.prim_mode;
3488		shader->atomic_base = key.tcs.first_atomic_counter;
3489		lds_outputs = true;
3490		lds_inputs = true;
3491		break;
3492	case PIPE_SHADER_TESS_EVAL:
3493		shader->tes_as_es = key.tes.as_es;
3494		shader->atomic_base = key.tes.first_atomic_counter;
3495		lds_inputs = true;
3496		if (shader->tes_as_es)
3497			ring_outputs = true;
3498		break;
3499	case PIPE_SHADER_FRAGMENT:
3500		shader->two_side = key.ps.color_two_side;
3501		shader->atomic_base = key.ps.first_atomic_counter;
3502		shader->rat_base = key.ps.nr_cbufs;
3503		shader->image_size_const_offset = key.ps.image_size_const_offset;
3504		break;
3505	case PIPE_SHADER_COMPUTE:
3506		shader->rat_base = 0;
3507		shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3508		break;
3509	default:
3510		break;
3511	}
3512
3513	if (shader->vs_as_es || shader->tes_as_es) {
3514		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3515	} else {
3516		ctx.gs_for_vs = NULL;
3517	}
3518
3519	ctx.next_ring_offset = 0;
3520	ctx.gs_out_ring_offset = 0;
3521	ctx.gs_next_vertex = 0;
3522	ctx.gs_stream_output_info = &so;
3523
3524	ctx.thread_id_gpr = -1;
3525	ctx.face_gpr = -1;
3526	ctx.fixed_pt_position_gpr = -1;
3527	ctx.fragcoord_input = -1;
3528	ctx.colors_used = 0;
3529	ctx.clip_vertex_write = 0;
3530
3531	ctx.helper_invoc_reg = -1;
3532	ctx.cs_block_size_reg = -1;
3533	ctx.cs_grid_size_reg = -1;
3534	ctx.cs_block_size_loaded = false;
3535	ctx.cs_grid_size_loaded = false;
3536
3537	shader->nr_ps_color_exports = 0;
3538	shader->nr_ps_max_color_exports = 0;
3539
3540
3541	/* register allocations */
3542	/* Values [0,127] correspond to GPR[0..127].
3543	 * Values [128,159] correspond to constant buffer bank 0
3544	 * Values [160,191] correspond to constant buffer bank 1
3545	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3546	 * Values [256,287] correspond to constant buffer bank 2 (EG)
3547	 * Values [288,319] correspond to constant buffer bank 3 (EG)
3548	 * Other special values are shown in the list below.
3549	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3550	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3551	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3552	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3553	 * 248	SQ_ALU_SRC_0: special constant 0.0.
3554	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
3555	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
3556	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3557	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
3558	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
3559	 * 254	SQ_ALU_SRC_PV: previous vector result.
3560	 * 255	SQ_ALU_SRC_PS: previous scalar result.
3561	 */
3562	for (i = 0; i < TGSI_FILE_COUNT; i++) {
3563		ctx.file_offset[i] = 0;
3564	}
3565
3566	if (ctx.type == PIPE_SHADER_VERTEX)  {
3567
3568		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3569		if (ctx.info.num_inputs)
3570			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3571	}
3572	if (ctx.type == PIPE_SHADER_FRAGMENT) {
3573		if (ctx.bc->chip_class >= EVERGREEN)
3574			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3575		else
3576			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3577
3578		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3579			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3580				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3581				shader->uses_helper_invocation = true;
3582			}
3583		}
3584	}
3585	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3586		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3587		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3588	}
3589	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3590		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3591	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3592		bool add_tesscoord = false, add_tess_inout = false;
3593		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3594		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3595			/* if we have tesscoord save one reg */
3596			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3597				add_tesscoord = true;
3598			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3599			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3600				add_tess_inout = true;
3601		}
3602		if (add_tesscoord || add_tess_inout)
3603			ctx.file_offset[TGSI_FILE_INPUT]++;
3604		if (add_tess_inout)
3605			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3606	}
3607	if (ctx.type == PIPE_SHADER_COMPUTE) {
3608		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3609		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3610			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3611				ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3612			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3613				ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3614		}
3615	}
3616
3617	ctx.file_offset[TGSI_FILE_OUTPUT] =
3618			ctx.file_offset[TGSI_FILE_INPUT] +
3619			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3620	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3621						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3622
3623	/* Outside the GPR range. This will be translated to one of the
3624	 * kcache banks later. */
3625	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3626	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3627
3628	pipeshader->scratch_space_needed = 0;
3629	int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3630			ctx.info.file_max[TGSI_FILE_TEMPORARY];
3631	if (regno > 124) {
3632		choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3633		shader->indirect_files = ctx.info.indirect_files;
3634	}
3635	shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3636
3637	ctx.bc->ar_reg = ++regno;
3638	ctx.bc->index_reg[0] = ++regno;
3639	ctx.bc->index_reg[1] = ++regno;
3640
3641	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3642		ctx.tess_input_info = ++regno;
3643		ctx.tess_output_info = ++regno;
3644	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3645		ctx.tess_input_info = ++regno;
3646		ctx.tess_output_info = ++regno;
3647	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3648		ctx.gs_export_gpr_tregs[0] = ++regno;
3649		ctx.gs_export_gpr_tregs[1] = ++regno;
3650		ctx.gs_export_gpr_tregs[2] = ++regno;
3651		ctx.gs_export_gpr_tregs[3] = ++regno;
3652		if (ctx.shader->gs_tri_strip_adj_fix) {
3653			ctx.gs_rotated_input[0] = ++regno;
3654			ctx.gs_rotated_input[1] = ++regno;
3655		} else {
3656			ctx.gs_rotated_input[0] = 0;
3657			ctx.gs_rotated_input[1] = 1;
3658		}
3659	}
3660
3661	if (shader->uses_images) {
3662		ctx.thread_id_gpr = ++regno;
3663	}
3664	ctx.temp_reg = ++regno;
3665
3666	shader->max_arrays = 0;
3667	shader->num_arrays = 0;
3668	if (indirect_gprs) {
3669
3670		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3671			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3672			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3673			                   ctx.file_offset[TGSI_FILE_INPUT],
3674			                   0x0F);
3675		}
3676		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3677			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3678			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3679			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3680			                   0x0F);
3681		}
3682	}
3683
3684	ctx.nliterals = 0;
3685	ctx.literals = NULL;
3686	ctx.max_driver_temp_used = 0;
3687
3688	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3689			       ctx.info.colors_written == 1;
3690	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3691	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3692
3693	if (ctx.type == PIPE_SHADER_VERTEX ||
3694	    ctx.type == PIPE_SHADER_GEOMETRY ||
3695	    ctx.type == PIPE_SHADER_TESS_EVAL) {
3696		shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3697					      ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3698		shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3699		shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3700	}
3701
3702	if (shader->vs_as_gs_a)
3703		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3704
3705	if (ctx.thread_id_gpr != -1) {
3706		r = load_thread_id_gpr(&ctx);
3707		if (r)
3708			return r;
3709	}
3710
3711	if (ctx.type == PIPE_SHADER_TESS_EVAL)
3712		r600_fetch_tess_io_info(&ctx);
3713
3714	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3715		tgsi_parse_token(&ctx.parse);
3716		switch (ctx.parse.FullToken.Token.Type) {
3717		case TGSI_TOKEN_TYPE_IMMEDIATE:
3718			immediate = &ctx.parse.FullToken.FullImmediate;
3719			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3720			if(ctx.literals == NULL) {
3721				r = -ENOMEM;
3722				goto out_err;
3723			}
3724			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3725			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3726			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3727			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3728			ctx.nliterals++;
3729			break;
3730		case TGSI_TOKEN_TYPE_DECLARATION:
3731			r = tgsi_declaration(&ctx);
3732			if (r)
3733				goto out_err;
3734			break;
3735		case TGSI_TOKEN_TYPE_INSTRUCTION:
3736		case TGSI_TOKEN_TYPE_PROPERTY:
3737			break;
3738		default:
3739			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3740			r = -EINVAL;
3741			goto out_err;
3742		}
3743	}
3744
3745	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3746	shader->ring_item_sizes[1] = 0;
3747	shader->ring_item_sizes[2] = 0;
3748	shader->ring_item_sizes[3] = 0;
3749
3750	/* Process two side if needed */
3751	if (shader->two_side && ctx.colors_used) {
3752		int i, count = ctx.shader->ninput;
3753		unsigned next_lds_loc = ctx.shader->nlds;
3754
3755		/* additional inputs will be allocated right after the existing inputs,
3756		 * we won't need them after the color selection, so we don't need to
3757		 * reserve these gprs for the rest of the shader code and to adjust
3758		 * output offsets etc. */
3759		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3760				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3761
3762		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3763		if (ctx.face_gpr == -1) {
3764			i = ctx.shader->ninput++;
3765			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3766			ctx.shader->input[i].spi_sid = 0;
3767			ctx.shader->input[i].gpr = gpr++;
3768			ctx.face_gpr = ctx.shader->input[i].gpr;
3769		}
3770
3771		for (i = 0; i < count; i++) {
3772			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3773				int ni = ctx.shader->ninput++;
3774				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3775				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3776				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3777				ctx.shader->input[ni].gpr = gpr++;
3778				// TGSI to LLVM needs to know the lds position of inputs.
3779				// Non LLVM path computes it later (in process_twoside_color)
3780				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3781				ctx.shader->input[i].back_color_input = ni;
3782				if (ctx.bc->chip_class >= EVERGREEN) {
3783					if ((r = evergreen_interp_input(&ctx, ni)))
3784						return r;
3785				}
3786			}
3787		}
3788	}
3789
3790	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3791		shader->nr_ps_max_color_exports = 8;
3792
3793	if (ctx.shader->uses_helper_invocation) {
3794		if (ctx.bc->chip_class == CAYMAN)
3795			r = cm_load_helper_invocation(&ctx);
3796		else
3797			r = eg_load_helper_invocation(&ctx);
3798		if (r)
3799			return r;
3800	}
3801
3802	/*
3803	 * XXX this relies on fixed_pt_position_gpr only being present when
3804	 * this shader should be executed per sample. Should be the case for now...
3805	 */
3806	if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3807		/*
3808		 * Fix up sample mask. The hw always gives us coverage mask for
3809		 * the pixel. However, for per-sample shading, we need the
3810		 * coverage for the shader invocation only.
3811		 * Also, with disabled msaa, only the first bit should be set
3812		 * (luckily the same fixup works for both problems).
3813		 * For now, we can only do it if we know this shader is always
3814		 * executed per sample (due to usage of bits in the shader
3815		 * forcing per-sample execution).
3816		 * If the fb is not multisampled, we'd do unnecessary work but
3817		 * it should still be correct.
3818		 * It will however do nothing for sample shading according
3819		 * to MinSampleShading.
3820		 */
3821		struct r600_bytecode_alu alu;
3822		int tmp = r600_get_temp(&ctx);
3823		assert(ctx.face_gpr != -1);
3824		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3825
3826		alu.op = ALU_OP2_LSHL_INT;
3827		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3828		alu.src[0].value = 0x1;
3829		alu.src[1].sel = ctx.fixed_pt_position_gpr;
3830		alu.src[1].chan = 3;
3831		alu.dst.sel = tmp;
3832		alu.dst.chan = 0;
3833		alu.dst.write = 1;
3834		alu.last = 1;
3835		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3836			return r;
3837
3838		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3839		alu.op = ALU_OP2_AND_INT;
3840		alu.src[0].sel = tmp;
3841		alu.src[1].sel = ctx.face_gpr;
3842		alu.src[1].chan = 2;
3843		alu.dst.sel = ctx.face_gpr;
3844		alu.dst.chan = 2;
3845		alu.dst.write = 1;
3846		alu.last = 1;
3847		if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3848			return r;
3849	}
3850
3851	if (ctx.fragcoord_input >= 0) {
3852		if (ctx.bc->chip_class == CAYMAN) {
3853			for (j = 0 ; j < 4; j++) {
3854				struct r600_bytecode_alu alu;
3855				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3856				alu.op = ALU_OP1_RECIP_IEEE;
3857				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3858				alu.src[0].chan = 3;
3859
3860				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3861				alu.dst.chan = j;
3862				alu.dst.write = (j == 3);
3863				alu.last = (j == 3);
3864				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3865					return r;
3866			}
3867		} else {
3868			struct r600_bytecode_alu alu;
3869			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3870			alu.op = ALU_OP1_RECIP_IEEE;
3871			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3872			alu.src[0].chan = 3;
3873
3874			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3875			alu.dst.chan = 3;
3876			alu.dst.write = 1;
3877			alu.last = 1;
3878			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3879				return r;
3880		}
3881	}
3882
3883	if (ctx.type == PIPE_SHADER_GEOMETRY) {
3884		struct r600_bytecode_alu alu;
3885		int r;
3886
3887		/* GS thread with no output workaround - emit a cut at start of GS */
3888		if (ctx.bc->chip_class == R600)
3889			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3890
3891		for (j = 0; j < 4; j++) {
3892			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3893			alu.op = ALU_OP1_MOV;
3894			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3895			alu.src[0].value = 0;
3896			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3897			alu.dst.write = 1;
3898			alu.last = 1;
3899			r = r600_bytecode_add_alu(ctx.bc, &alu);
3900			if (r)
3901				return r;
3902		}
3903
3904		if (ctx.shader->gs_tri_strip_adj_fix) {
3905			r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3906					   ctx.gs_rotated_input[0], 2,
3907					   0, 2,
3908					   V_SQ_ALU_SRC_LITERAL, 1);
3909			if (r)
3910				return r;
3911
3912			for (i = 0; i < 6; i++) {
3913				int rotated = (i + 4) % 6;
3914				int offset_reg = i / 3;
3915				int offset_chan = i % 3;
3916				int rotated_offset_reg = rotated / 3;
3917				int rotated_offset_chan = rotated % 3;
3918
3919				if (offset_reg == 0 && offset_chan == 2)
3920					offset_chan = 3;
3921				if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3922					rotated_offset_chan = 3;
3923
3924				r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3925						   ctx.gs_rotated_input[offset_reg], offset_chan,
3926						   ctx.gs_rotated_input[0], 2,
3927						   offset_reg, offset_chan,
3928						   rotated_offset_reg, rotated_offset_chan);
3929				if (r)
3930					return r;
3931			}
3932		}
3933	}
3934
3935	if (ctx.type == PIPE_SHADER_TESS_CTRL)
3936		r600_fetch_tess_io_info(&ctx);
3937
3938	if (shader->two_side && ctx.colors_used) {
3939		if ((r = process_twoside_color_inputs(&ctx)))
3940			return r;
3941	}
3942
3943	tgsi_parse_init(&ctx.parse, tokens);
3944	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3945		tgsi_parse_token(&ctx.parse);
3946		switch (ctx.parse.FullToken.Token.Type) {
3947		case TGSI_TOKEN_TYPE_INSTRUCTION:
3948			r = tgsi_is_supported(&ctx);
3949			if (r)
3950				goto out_err;
3951			ctx.max_driver_temp_used = 0;
3952			/* reserve first tmp for everyone */
3953			r600_get_temp(&ctx);
3954
3955			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3956			if ((r = tgsi_split_constant(&ctx)))
3957				goto out_err;
3958			if ((r = tgsi_split_literal_constant(&ctx)))
3959				goto out_err;
3960			if (ctx.type == PIPE_SHADER_GEOMETRY) {
3961				if ((r = tgsi_split_gs_inputs(&ctx)))
3962					goto out_err;
3963			} else if (lds_inputs) {
3964				if ((r = tgsi_split_lds_inputs(&ctx)))
3965					goto out_err;
3966			}
3967			if (ctx.bc->chip_class == CAYMAN)
3968				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3969			else if (ctx.bc->chip_class >= EVERGREEN)
3970				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3971			else
3972				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3973
3974			ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3975
3976			r = ctx.inst_info->process(&ctx);
3977			if (r)
3978				goto out_err;
3979
3980			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3981				r = r600_store_tcs_output(&ctx);
3982				if (r)
3983					goto out_err;
3984			}
3985			break;
3986		default:
3987			break;
3988		}
3989	}
3990
3991	/* Reset the temporary register counter. */
3992	ctx.max_driver_temp_used = 0;
3993
3994	noutput = shader->noutput;
3995
3996	if (!ring_outputs && ctx.clip_vertex_write) {
3997		unsigned clipdist_temp[2];
3998
3999		clipdist_temp[0] = r600_get_temp(&ctx);
4000		clipdist_temp[1] = r600_get_temp(&ctx);
4001
4002		/* need to convert a clipvertex write into clipdistance writes and not export
4003		   the clip vertex anymore */
4004
4005		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
4006		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4007		shader->output[noutput].gpr = clipdist_temp[0];
4008		noutput++;
4009		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4010		shader->output[noutput].gpr = clipdist_temp[1];
4011		noutput++;
4012
4013		/* reset spi_sid for clipvertex output to avoid confusing spi */
4014		shader->output[ctx.cv_output].spi_sid = 0;
4015
4016		shader->clip_dist_write = 0xFF;
4017		shader->cc_dist_mask = 0xFF;
4018
4019		for (i = 0; i < 8; i++) {
4020			int oreg = i >> 2;
4021			int ochan = i & 3;
4022
4023			for (j = 0; j < 4; j++) {
4024				struct r600_bytecode_alu alu;
4025				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4026				alu.op = ALU_OP2_DOT4;
4027				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
4028				alu.src[0].chan = j;
4029
4030				alu.src[1].sel = 512 + i;
4031				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4032				alu.src[1].chan = j;
4033
4034				alu.dst.sel = clipdist_temp[oreg];
4035				alu.dst.chan = j;
4036				alu.dst.write = (j == ochan);
4037				if (j == 3)
4038					alu.last = 1;
4039				r = r600_bytecode_add_alu(ctx.bc, &alu);
4040				if (r)
4041					return r;
4042			}
4043		}
4044	}
4045
4046	/* Add stream outputs. */
4047	if (so.num_outputs) {
4048		bool emit = false;
4049		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
4050			emit = true;
4051		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
4052			emit = true;
4053		if (emit)
4054			emit_streamout(&ctx, &so, -1, NULL);
4055	}
4056	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
4057	convert_edgeflag_to_int(&ctx);
4058
4059	if (ctx.type == PIPE_SHADER_TESS_CTRL)
4060		r600_emit_tess_factor(&ctx);
4061
4062	if (lds_outputs) {
4063		if (ctx.type == PIPE_SHADER_VERTEX) {
4064			if (ctx.shader->noutput)
4065				emit_lds_vs_writes(&ctx);
4066		}
4067	} else if (ring_outputs) {
4068		if (shader->vs_as_es || shader->tes_as_es) {
4069			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
4070			ctx.gs_export_gpr_tregs[1] = -1;
4071			ctx.gs_export_gpr_tregs[2] = -1;
4072			ctx.gs_export_gpr_tregs[3] = -1;
4073
4074			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
4075		}
4076	} else {
4077		/* Export output */
4078		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
4079
4080		for (i = 0, j = 0; i < noutput; i++, j++) {
4081			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4082			output[j].gpr = shader->output[i].gpr;
4083			output[j].elem_size = 3;
4084			output[j].swizzle_x = 0;
4085			output[j].swizzle_y = 1;
4086			output[j].swizzle_z = 2;
4087			output[j].swizzle_w = 3;
4088			output[j].burst_count = 1;
4089			output[j].type = 0xffffffff;
4090			output[j].op = CF_OP_EXPORT;
4091			switch (ctx.type) {
4092			case PIPE_SHADER_VERTEX:
4093			case PIPE_SHADER_TESS_EVAL:
4094				switch (shader->output[i].name) {
4095				case TGSI_SEMANTIC_POSITION:
4096					output[j].array_base = 60;
4097					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4098					pos_emitted = true;
4099					break;
4100
4101				case TGSI_SEMANTIC_PSIZE:
4102					output[j].array_base = 61;
4103					output[j].swizzle_y = 7;
4104					output[j].swizzle_z = 7;
4105					output[j].swizzle_w = 7;
4106					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4107					pos_emitted = true;
4108					break;
4109				case TGSI_SEMANTIC_EDGEFLAG:
4110					output[j].array_base = 61;
4111					output[j].swizzle_x = 7;
4112					output[j].swizzle_y = 0;
4113					output[j].swizzle_z = 7;
4114					output[j].swizzle_w = 7;
4115					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4116					pos_emitted = true;
4117					break;
4118				case TGSI_SEMANTIC_LAYER:
4119					/* spi_sid is 0 for outputs that are
4120					 * not consumed by PS */
4121					if (shader->output[i].spi_sid) {
4122						output[j].array_base = next_param_base++;
4123						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4124						j++;
4125						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4126					}
4127					output[j].array_base = 61;
4128					output[j].swizzle_x = 7;
4129					output[j].swizzle_y = 7;
4130					output[j].swizzle_z = 0;
4131					output[j].swizzle_w = 7;
4132					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4133					pos_emitted = true;
4134					break;
4135				case TGSI_SEMANTIC_VIEWPORT_INDEX:
4136					/* spi_sid is 0 for outputs that are
4137					 * not consumed by PS */
4138					if (shader->output[i].spi_sid) {
4139						output[j].array_base = next_param_base++;
4140						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4141						j++;
4142						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4143					}
4144					output[j].array_base = 61;
4145					output[j].swizzle_x = 7;
4146					output[j].swizzle_y = 7;
4147					output[j].swizzle_z = 7;
4148					output[j].swizzle_w = 0;
4149					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4150					pos_emitted = true;
4151					break;
4152				case TGSI_SEMANTIC_CLIPVERTEX:
4153					j--;
4154					break;
4155				case TGSI_SEMANTIC_CLIPDIST:
4156					output[j].array_base = next_clip_base++;
4157					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4158					pos_emitted = true;
4159					/* spi_sid is 0 for clipdistance outputs that were generated
4160					 * for clipvertex - we don't need to pass them to PS */
4161					if (shader->output[i].spi_sid) {
4162						j++;
4163						/* duplicate it as PARAM to pass to the pixel shader */
4164						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4165						output[j].array_base = next_param_base++;
4166						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4167					}
4168					break;
4169				case TGSI_SEMANTIC_FOG:
4170					output[j].swizzle_y = 4; /* 0 */
4171					output[j].swizzle_z = 4; /* 0 */
4172					output[j].swizzle_w = 5; /* 1 */
4173					break;
4174				case TGSI_SEMANTIC_PRIMID:
4175					output[j].swizzle_x = 2;
4176					output[j].swizzle_y = 4; /* 0 */
4177					output[j].swizzle_z = 4; /* 0 */
4178					output[j].swizzle_w = 4; /* 0 */
4179					break;
4180				}
4181
4182				break;
4183			case PIPE_SHADER_FRAGMENT:
4184				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4185					/* never export more colors than the number of CBs */
4186					if (shader->output[i].sid >= max_color_exports) {
4187						/* skip export */
4188						j--;
4189						continue;
4190					}
4191					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4192					output[j].array_base = shader->output[i].sid;
4193					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4194					shader->nr_ps_color_exports++;
4195					shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4196
4197					/* If the i-th target format is set, all previous target formats must
4198					 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4199					 */
4200					if (shader->output[i].sid > 0)
4201						for (unsigned x = 0; x < shader->output[i].sid; x++)
4202							shader->ps_color_export_mask |= (1 << (x*4));
4203
4204					if (shader->output[i].sid > shader->ps_export_highest)
4205						shader->ps_export_highest = shader->output[i].sid;
4206					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4207						for (k = 1; k < max_color_exports; k++) {
4208							j++;
4209							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4210							output[j].gpr = shader->output[i].gpr;
4211							output[j].elem_size = 3;
4212							output[j].swizzle_x = 0;
4213							output[j].swizzle_y = 1;
4214							output[j].swizzle_z = 2;
4215							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4216							output[j].burst_count = 1;
4217							output[j].array_base = k;
4218							output[j].op = CF_OP_EXPORT;
4219							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4220							shader->nr_ps_color_exports++;
4221							if (k > shader->ps_export_highest)
4222								shader->ps_export_highest = k;
4223							shader->ps_color_export_mask |= (0xf << (j * 4));
4224						}
4225					}
4226				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4227					output[j].array_base = 61;
4228					output[j].swizzle_x = 2;
4229					output[j].swizzle_y = 7;
4230					output[j].swizzle_z = output[j].swizzle_w = 7;
4231					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4232				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4233					output[j].array_base = 61;
4234					output[j].swizzle_x = 7;
4235					output[j].swizzle_y = 1;
4236					output[j].swizzle_z = output[j].swizzle_w = 7;
4237					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4238				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4239					output[j].array_base = 61;
4240					output[j].swizzle_x = 7;
4241					output[j].swizzle_y = 7;
4242					output[j].swizzle_z = 0;
4243					output[j].swizzle_w = 7;
4244					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4245				} else {
4246					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4247					r = -EINVAL;
4248					goto out_err;
4249				}
4250				break;
4251			case PIPE_SHADER_TESS_CTRL:
4252				break;
4253			default:
4254				R600_ERR("unsupported processor type %d\n", ctx.type);
4255				r = -EINVAL;
4256				goto out_err;
4257			}
4258
4259			if (output[j].type == 0xffffffff) {
4260				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4261				output[j].array_base = next_param_base++;
4262			}
4263		}
4264
4265		/* add fake position export */
4266		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4267			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4268			output[j].gpr = 0;
4269			output[j].elem_size = 3;
4270			output[j].swizzle_x = 7;
4271			output[j].swizzle_y = 7;
4272			output[j].swizzle_z = 7;
4273			output[j].swizzle_w = 7;
4274			output[j].burst_count = 1;
4275			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4276			output[j].array_base = 60;
4277			output[j].op = CF_OP_EXPORT;
4278			j++;
4279		}
4280
4281		/* add fake param output for vertex shader if no param is exported */
4282		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4283			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4284			output[j].gpr = 0;
4285			output[j].elem_size = 3;
4286			output[j].swizzle_x = 7;
4287			output[j].swizzle_y = 7;
4288			output[j].swizzle_z = 7;
4289			output[j].swizzle_w = 7;
4290			output[j].burst_count = 1;
4291			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4292			output[j].array_base = 0;
4293			output[j].op = CF_OP_EXPORT;
4294			j++;
4295		}
4296
4297		/* add fake pixel export */
4298		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4299			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4300			output[j].gpr = 0;
4301			output[j].elem_size = 3;
4302			output[j].swizzle_x = 7;
4303			output[j].swizzle_y = 7;
4304			output[j].swizzle_z = 7;
4305			output[j].swizzle_w = 7;
4306			output[j].burst_count = 1;
4307			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4308			output[j].array_base = 0;
4309			output[j].op = CF_OP_EXPORT;
4310			j++;
4311			shader->nr_ps_color_exports++;
4312			shader->ps_color_export_mask = 0xf;
4313		}
4314
4315		noutput = j;
4316
4317		/* set export done on last export of each type */
4318		for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4319			if (!(output_done & (1 << output[k].type))) {
4320				output_done |= (1 << output[k].type);
4321				output[k].op = CF_OP_EXPORT_DONE;
4322			}
4323		}
4324		/* add output to bytecode */
4325		for (i = 0; i < noutput; i++) {
4326			r = r600_bytecode_add_output(ctx.bc, &output[i]);
4327			if (r)
4328				goto out_err;
4329		}
4330	}
4331
4332	/* add program end */
4333	if (ctx.bc->chip_class == CAYMAN)
4334		cm_bytecode_add_cf_end(ctx.bc);
4335	else {
4336		const struct cf_op_info *last = NULL;
4337
4338		if (ctx.bc->cf_last)
4339			last = r600_isa_cf(ctx.bc->cf_last->op);
4340
4341		/* alu clause instructions don't have EOP bit, so add NOP */
4342		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4343			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4344
4345		ctx.bc->cf_last->end_of_program = 1;
4346	}
4347
4348	/* check GPR limit - we have 124 = 128 - 4
4349	 * (4 are reserved as alu clause temporary registers) */
4350	if (ctx.bc->ngpr > 124) {
4351		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4352		r = -ENOMEM;
4353		goto out_err;
4354	}
4355
4356	if (ctx.type == PIPE_SHADER_GEOMETRY) {
4357		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4358			return r;
4359	}
4360
4361	free(ctx.spilled_arrays);
4362	free(ctx.array_infos);
4363	free(ctx.literals);
4364	tgsi_parse_free(&ctx.parse);
4365	return 0;
4366out_err:
4367	free(ctx.spilled_arrays);
4368	free(ctx.array_infos);
4369	free(ctx.literals);
4370	tgsi_parse_free(&ctx.parse);
4371	return r;
4372}
4373
4374static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4375{
4376	const unsigned tgsi_opcode =
4377		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4378	R600_ERR("%s tgsi opcode unsupported\n",
4379		 tgsi_get_opcode_name(tgsi_opcode));
4380	return -EINVAL;
4381}
4382
/* Handler for TGSI_OPCODE_END: intentionally a no-op — the epilogue work
 * (exports, end-of-program CF) is emitted by the caller after the token
 * stream has been fully parsed. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
4387
4388static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4389			const struct r600_shader_src *shader_src,
4390			unsigned chan)
4391{
4392	bc_src->sel = shader_src->sel;
4393	bc_src->chan = shader_src->swizzle[chan];
4394	bc_src->neg = shader_src->neg;
4395	bc_src->abs = shader_src->abs;
4396	bc_src->rel = shader_src->rel;
4397	bc_src->value = shader_src->value[bc_src->chan];
4398	bc_src->kc_bank = shader_src->kc_bank;
4399	bc_src->kc_rel = shader_src->kc_rel;
4400}
4401
/* Force the absolute-value modifier on an ALU source and clear any
 * pending negate flag. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
4407
/* Flip the negate modifier on an ALU source. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}
4412
/* Fill in the bytecode ALU destination for a TGSI destination operand.
 *
 * Ordinary registers simply map file/index to a GPR and apply the
 * saturate and relative-addressing flags.  TGSI temporaries may have been
 * spilled to scratch memory; in that case the write is redirected to a
 * staging GPR and a pending MEM_SCRATCH output is queued so the value is
 * actually stored once the instruction group completes.
 */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);

		if (spilled) {
			struct r600_bytecode_output cf;
			int reg = 0;
			int r;
			bool add_pending_output = true;

			memset(&cf, 0, sizeof(struct r600_bytecode_output));
			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
				&cf.array_base, &cf.array_size);

			/* If no component has spilled, reserve a register and add the spill code
			 *  ctx->bc->n_pending_outputs is cleared after each instruction group */
			if (ctx->bc->n_pending_outputs == 0) {
				reg = r600_get_temp(ctx);
			} else {
				/* If we are already spilling and the output address is the same like
				* before then just reuse the same slot */
				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
				if ((cf.array_base + idx == tmpl->array_base) ||
				    (cf.array_base == tmpl->array_base &&
				     tmpl->index_gpr == ctx->bc->ar_reg &&
				     tgsi_dst->Register.Indirect)) {
					reg = ctx->bc->pending_outputs[0].gpr;
					add_pending_output = false;
				} else {
					reg = r600_get_temp(ctx);
				}
			}

			/* The ALU instruction writes the staging GPR, not scratch. */
			r600_dst->sel = reg;
			r600_dst->chan = swizzle;
			r600_dst->write = 1;
			if (inst->Instruction.Saturate) {
				r600_dst->clamp = 1;
			}

			/* Add new outputs as pending */
			if (add_pending_output) {
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
				cf.mark = 1;
				cf.comp_mask = inst->Dst[0].Register.WriteMask;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				/* Indirect stores address scratch through the AR register;
				 * direct stores fold the element offset into array_base. */
				if (tgsi_dst->Register.Indirect) {
					if (ctx->bc->chip_class < R700)
						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
					else
						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
					cf.index_gpr = ctx->bc->ar_reg;
			}
			else {
				cf.array_base += idx;
				cf.array_size = 0;
			}

			r = r600_bytecode_add_pending_output(ctx->bc, &cf);
			if (r)
				return;

			/* On r700+ request a wait on the store acknowledge. */
			if (ctx->bc->chip_class >= R700)
				r600_bytecode_need_wait_ack(ctx->bc, true);
			}
			return;
		}
		else {
			r600_dst->sel = idx;
		}
	}
	else {
		r600_dst->sel = tgsi_dst->Register.Index;
		r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	}
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	/* TESS_CTRL outputs return early here (they are stored separately),
	 * which also skips the relative-addressing flag below. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}
4519
/* Emit a two-source ALU operation over 64-bit (double) operands.
 *
 * singledest  - the instruction produces a single 64-bit result: the TGSI
 *               writemask selects one channel and is widened here to the
 *               matching xy/zw channel pair, possibly staged via a temp.
 * swap        - exchange src0 and src1 when emitting.
 * dest_temp   - if non-zero, write results to this temp register instead of
 *               the instruction's TGSI destination.
 * op_override - if non-zero, use this ALU opcode instead of the entry from
 *               ctx->inst_info.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		/* Widen the one-channel writemask to a pair.  When the source
		 * pair and destination pair don't line up, stage through the
		 * temp register; use_tmp - 1 is the temp channel the final
		 * result is copied from in the move loop at the bottom. */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* In singledest mode only the first slot of each pair
			 * is actually written. */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS reads the source channel directly (no pair swap). */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4646
4647static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4648{
4649	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4650	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4651	/* confirm writemasking */
4652	if ((write_mask & 0x3) != 0x3 &&
4653	    (write_mask & 0xc) != 0xc) {
4654		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4655		return -1;
4656	}
4657	return tgsi_op2_64_params(ctx, false, false, 0, 0);
4658}
4659
/* Two-source 64-bit op producing a single 64-bit result (singledest=true),
 * sources in natural order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4664
/* Two-source 64-bit op producing a single 64-bit result, with the two
 * sources exchanged (swap=true). */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4669
4670static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4671{
4672	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4673	struct r600_bytecode_alu alu;
4674	int i, j, r;
4675	int lasti = 3;
4676	int tmp = r600_get_temp(ctx);
4677
4678	for (i = 0; i < lasti + 1; i++) {
4679
4680		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4681		alu.op = ctx->inst_info->op;
4682		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4683			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4684		}
4685
4686		if (inst->Dst[0].Register.WriteMask & (1 << i))
4687			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4688		else
4689			alu.dst.sel = tmp;
4690
4691		alu.dst.chan = i;
4692		alu.is_op3 = 1;
4693		if (i == lasti) {
4694			alu.last = 1;
4695		}
4696		r = r600_bytecode_add_alu(ctx->bc, &alu);
4697		if (r)
4698			return r;
4699	}
4700	return 0;
4701}
4702
/* Common emitter for two-source ALU ops, one instruction per enabled
 * destination channel.
 *
 * swap       - exchange src0/src1 when emitting.
 * trans_only - end the ALU group after every instruction (alu.last on each)
 *              and, when more than one component is written, stage results
 *              through a temp register before moving them to the dest.
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* Honor TGSI_PROPERTY_MUL_ZERO_WINS by using MUL instead of MUL_IEEE. */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	/* nir_to_tgsi lowers nir_op_isub to UADD + negate, since r600 doesn't support
	 * source modifiers with integer ops we switch back to SUB_INT */
	bool src1_neg = ctx->src[1].neg;
	if (op == ALU_OP2_ADD_INT && src1_neg) {
		src1_neg = false;
		op = ALU_OP2_SUB_INT;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
			/* Re-apply the (possibly cleared) negate on src1. */
			alu.src[1].neg = src1_neg;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4775
/* Two-source op, sources in natural order, no forced group termination. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4780
/* Two-source op with the operands exchanged (emits src1 OP src0). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4785
/* Two-source op with trans_only set: see tgsi_op2_s for the per-channel
 * group-termination and temp-staging behavior. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4790
4791static int tgsi_ineg(struct r600_shader_ctx *ctx)
4792{
4793	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4794	struct r600_bytecode_alu alu;
4795	int i, r;
4796	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4797
4798	for (i = 0; i < lasti + 1; i++) {
4799
4800		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4801			continue;
4802		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4803		alu.op = ctx->inst_info->op;
4804
4805		alu.src[0].sel = V_SQ_ALU_SRC_0;
4806
4807		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4808
4809		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4810
4811		if (i == lasti) {
4812			alu.last = 1;
4813		}
4814		r = r600_bytecode_add_alu(ctx->bc, &alu);
4815		if (r)
4816			return r;
4817	}
4818	return 0;
4819
4820}
4821
4822static int tgsi_dneg(struct r600_shader_ctx *ctx)
4823{
4824	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4825	struct r600_bytecode_alu alu;
4826	int i, r;
4827	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4828
4829	for (i = 0; i < lasti + 1; i++) {
4830
4831		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4832			continue;
4833		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4834		alu.op = ALU_OP1_MOV;
4835
4836		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4837
4838		if (i == 1 || i == 3)
4839			r600_bytecode_src_toggle_neg(&alu.src[0]);
4840		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4841
4842		if (i == lasti) {
4843			alu.last = 1;
4844		}
4845		r = r600_bytecode_add_alu(ctx->bc, &alu);
4846		if (r)
4847			return r;
4848	}
4849	return 0;
4850
4851}
4852
/* DFRACEXP: per TGSI semantics, split a double into its fractional
 * significand (Dst[0]) and integer exponent (Dst[1]).
 *
 * The hardware op is run over all four channels into a temp register;
 * the significand (temp channels 2/3) is then replicated across the
 * Dst[0] writemask, and the exponent (temp channel 1) is moved into the
 * first enabled channel of Dst[1].
 */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* Run the op on every channel, sources remapped via fp64_switch. */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* Read from temp channel 2 or 3 depending on dest parity. */
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Write the exponent (temp channel 1) to the first enabled Dst[1]
	 * channel only, then stop. */
	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
4915
4916
/* I2D/U2D on Evergreen/Cayman: convert a 32-bit integer to a double.
 * A float's 24-bit mantissa cannot represent every 32-bit integer, so
 * each source channel is split into a high 24-bit part and a low 8-bit
 * part, each part is converted exactly (int->float->double), and the
 * two doubles are summed.  Up to two doubles (dst xy and zw pairs) are
 * handled per instruction. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* Split into a high 24-bit part (temp[dchan]) and a
			 * low 8-bit part (temp[dchan+1]). */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Convert both parts to float in place.  The high part uses the
	 * opcode's signed/unsigned conversion; the low 8 bits are always
	 * non-negative, so plain UINT_TO_FLT suffices there. */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->chip_class == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* Widen each 32-bit float to a double in ctx->temp_reg;
			 * the odd slots feed a literal 0 as the second dword of
			 * each FLT32_TO_FLT64 pair. */
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			/* Sum the two exact doubles: dst = high24 + low8. */
			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return 0;
}
5028
5029static int egcm_double_to_int(struct r600_shader_ctx *ctx)
5030{
5031	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5032	struct r600_bytecode_alu alu;
5033	int i, r;
5034	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5035	int treg = r600_get_temp(ctx);
5036	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
5037		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
5038
5039	/* do a 64->32 into a temp register */
5040	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
5041	if (r)
5042		return r;
5043
5044	for (i = 0; i <= lasti; i++) {
5045		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5046			continue;
5047		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5048		alu.op = ctx->inst_info->op;
5049
5050		alu.src[0].chan = i;
5051		alu.src[0].sel = treg;
5052		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5053		alu.last = (i == lasti);
5054
5055		r = r600_bytecode_add_alu(ctx->bc, &alu);
5056		if (r)
5057			return r;
5058	}
5059
5060	return 0;
5061}
5062
/* Emit a unary transcendental double op (e.g. RECIP_64, RECIPSQRT_64)
 * into dst_reg.  The op is replicated over the first three slots and
 * writes its 64-bit result to channels x/y only.  src[0] takes the high
 * dword (chan 1) and src[1] the low dword (chan 0) of the source double;
 * when 'abs' is set the absolute value is applied to src[1] (callers
 * pass abs=true for DRSQ/DSQRT, which are defined on |x|). */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		/* Non-Cayman chips take these as trans-only ops: every
		 * instruction must close its instruction group. */
		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5097
/* Emit a unary double instruction (DRSQ/DSQRT/DRCP...) and move the
 * x/y result of the raw op into the instruction's destination pair. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* there should be only one source register */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* DRSQ and DSQRT need |src| applied by the raw emitter. */
	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* Copy t1.xy into the written destination channels (xy or zw). */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5138
5139static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5140{
5141	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5142	int i, j, r;
5143	struct r600_bytecode_alu alu;
5144	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5145
5146	for (i = 0 ; i < last_slot; i++) {
5147		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5148		alu.op = ctx->inst_info->op;
5149		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5150			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5151
5152			/* RSQ should take the absolute value of src */
5153			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5154				r600_bytecode_src_set_abs(&alu.src[j]);
5155			}
5156		}
5157		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5158		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5159
5160		if (i == last_slot - 1)
5161			alu.last = 1;
5162		r = r600_bytecode_add_alu(ctx->bc, &alu);
5163		if (r)
5164			return r;
5165	}
5166	return 0;
5167}
5168
/* Cayman integer multiply (MULLO/MULHI): the op executes in all four
 * ALU slots, so for each enabled destination channel k it is emitted on
 * every slot with only slot k writing its result into t1.k; the results
 * are then moved to the real destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		/* Fill all four slots; only the slot matching channel k
		 * actually writes. */
		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Copy the accumulated results from t1 into the destination. */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5216
5217
/* Cayman DMUL: multiply one pair of doubles.  The 64-bit multiply is
 * emitted across all four slots into t1, then t1 is copied out to the
 * written destination channels. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects which source double pair is used: 0 = xy, 1 = zw. */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* Operand channel selection follows the hardware's
			 * 64-bit slot layout: high dword in slots 0-2, low
			 * dword in slot 3 — confirm against the ISA docs. */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Copy the result channels out of t1. */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5267
5268/*
5269 * Emit RECIP_64 + MUL_64 to implement division.
5270 */
5271static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5272{
5273	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5274	int r;
5275	struct r600_bytecode_alu alu;
5276	int t1 = ctx->temp_reg;
5277	int k;
5278
5279	/* Only support one double at a time. This is the same constraint as
5280	 * in DMUL lowering. */
5281	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5282	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5283
5284	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5285
5286	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5287	if (r)
5288		return r;
5289
5290	for (int i = 0; i < 4; i++) {
5291		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5292		alu.op = ALU_OP2_MUL_64;
5293
5294		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5295
5296		alu.src[1].sel = t1;
5297		alu.src[1].chan = (i == 3) ? 0 : 1;
5298
5299		alu.dst.sel = t1;
5300		alu.dst.chan = i;
5301		alu.dst.write = 1;
5302		if (i == 3)
5303			alu.last = 1;
5304		r = r600_bytecode_add_alu(ctx->bc, &alu);
5305		if (r)
5306			return r;
5307	}
5308
5309	for (int i = 0; i < 2; i++) {
5310		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5311		alu.op = ALU_OP1_MOV;
5312		alu.src[0].sel = t1;
5313		alu.src[0].chan = i;
5314		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5315		alu.dst.write = 1;
5316		if (i == 1)
5317			alu.last = 1;
5318		r = r600_bytecode_add_alu(ctx->bc, &alu);
5319		if (r)
5320			return r;
5321	}
5322	return 0;
5323}
5324
5325/*
5326 * r600 - trunc to -PI..PI range
5327 * r700 - normalize by dividing by 2PI
5328 * see fdo bug 27901
5329 */
5330static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5331{
5332	int r;
5333	struct r600_bytecode_alu alu;
5334
5335	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5336	alu.op = ALU_OP3_MULADD;
5337	alu.is_op3 = 1;
5338
5339	alu.dst.chan = 0;
5340	alu.dst.sel = ctx->temp_reg;
5341	alu.dst.write = 1;
5342
5343	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5344
5345	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5346	alu.src[1].chan = 0;
5347	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5348	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5349	alu.src[2].chan = 0;
5350	alu.last = 1;
5351	r = r600_bytecode_add_alu(ctx->bc, &alu);
5352	if (r)
5353		return r;
5354
5355	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5356	alu.op = ALU_OP1_FRACT;
5357
5358	alu.dst.chan = 0;
5359	alu.dst.sel = ctx->temp_reg;
5360	alu.dst.write = 1;
5361
5362	alu.src[0].sel = ctx->temp_reg;
5363	alu.src[0].chan = 0;
5364	alu.last = 1;
5365	r = r600_bytecode_add_alu(ctx->bc, &alu);
5366	if (r)
5367		return r;
5368
5369	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5370	alu.op = ALU_OP3_MULADD;
5371	alu.is_op3 = 1;
5372
5373	alu.dst.chan = 0;
5374	alu.dst.sel = ctx->temp_reg;
5375	alu.dst.write = 1;
5376
5377	alu.src[0].sel = ctx->temp_reg;
5378	alu.src[0].chan = 0;
5379
5380	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5381	alu.src[1].chan = 0;
5382	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5383	alu.src[2].chan = 0;
5384
5385	if (ctx->bc->chip_class == R600) {
5386		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5387		alu.src[2].value = u_bitcast_f2u(-M_PI);
5388	} else {
5389		alu.src[1].sel = V_SQ_ALU_SRC_1;
5390		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5391		alu.src[2].neg = 1;
5392	}
5393
5394	alu.last = 1;
5395	r = r600_bytecode_add_alu(ctx->bc, &alu);
5396	if (r)
5397		return r;
5398	return 0;
5399}
5400
5401static int cayman_trig(struct r600_shader_ctx *ctx)
5402{
5403	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5404	struct r600_bytecode_alu alu;
5405	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5406	int i, r;
5407
5408	r = tgsi_setup_trig(ctx);
5409	if (r)
5410		return r;
5411
5412
5413	for (i = 0; i < last_slot; i++) {
5414		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5415		alu.op = ctx->inst_info->op;
5416		alu.dst.chan = i;
5417
5418		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5419		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5420
5421		alu.src[0].sel = ctx->temp_reg;
5422		alu.src[0].chan = 0;
5423		if (i == last_slot - 1)
5424			alu.last = 1;
5425		r = r600_bytecode_add_alu(ctx->bc, &alu);
5426		if (r)
5427			return r;
5428	}
5429	return 0;
5430}
5431
5432static int tgsi_trig(struct r600_shader_ctx *ctx)
5433{
5434	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5435	struct r600_bytecode_alu alu;
5436	int i, r;
5437	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5438
5439	r = tgsi_setup_trig(ctx);
5440	if (r)
5441		return r;
5442
5443	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5444	alu.op = ctx->inst_info->op;
5445	alu.dst.chan = 0;
5446	alu.dst.sel = ctx->temp_reg;
5447	alu.dst.write = 1;
5448
5449	alu.src[0].sel = ctx->temp_reg;
5450	alu.src[0].chan = 0;
5451	alu.last = 1;
5452	r = r600_bytecode_add_alu(ctx->bc, &alu);
5453	if (r)
5454		return r;
5455
5456	/* replicate result */
5457	for (i = 0; i < lasti + 1; i++) {
5458		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5459			continue;
5460
5461		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5462		alu.op = ALU_OP1_MOV;
5463
5464		alu.src[0].sel = ctx->temp_reg;
5465		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5466		if (i == lasti)
5467			alu.last = 1;
5468		r = r600_bytecode_add_alu(ctx->bc, &alu);
5469		if (r)
5470			return r;
5471	}
5472	return 0;
5473}
5474
5475static int tgsi_kill(struct r600_shader_ctx *ctx)
5476{
5477	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5478	struct r600_bytecode_alu alu;
5479	int i, r;
5480
5481	for (i = 0; i < 4; i++) {
5482		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5483		alu.op = ctx->inst_info->op;
5484
5485		alu.dst.chan = i;
5486
5487		alu.src[0].sel = V_SQ_ALU_SRC_0;
5488
5489		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5490			alu.src[1].sel = V_SQ_ALU_SRC_1;
5491			alu.src[1].neg = 1;
5492		} else {
5493			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5494		}
5495		if (i == 3) {
5496			alu.last = 1;
5497		}
5498		r = r600_bytecode_add_alu(ctx->bc, &alu);
5499		if (r)
5500			return r;
5501	}
5502
5503	/* kill must be last in ALU */
5504	ctx->bc->force_add_cf = 1;
5505	ctx->shader->uses_kill = TRUE;
5506	return 0;
5507}
5508
/* LIT: dst = (1.0, max(src.x, 0), spec, 1.0), where spec is the
 * specular term computed via MUL_LIT(log_clamped(max(src.y, 0)), src.w,
 * src.x) followed by EXP_IEEE; dst.z is only computed when its
 * writemask bit is set.  On Cayman the trans-only LOG/EXP ops are
 * replicated over three slots. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) — replicated over 3 slots,
				 * only slot 2 writes */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* capture where the log result landed (chan differs between
		 * the two paths above only in principle; both end at .z) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) — replicated over 3 slots */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
5658
5659static int tgsi_rsq(struct r600_shader_ctx *ctx)
5660{
5661	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5662	struct r600_bytecode_alu alu;
5663	int i, r;
5664
5665	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5666
5667	alu.op = ALU_OP1_RECIPSQRT_IEEE;
5668
5669	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5670		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5671		r600_bytecode_src_set_abs(&alu.src[i]);
5672	}
5673	alu.dst.sel = ctx->temp_reg;
5674	alu.dst.write = 1;
5675	alu.last = 1;
5676	r = r600_bytecode_add_alu(ctx->bc, &alu);
5677	if (r)
5678		return r;
5679	/* replicate result */
5680	return tgsi_helper_tempx_replicate(ctx);
5681}
5682
5683static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5684{
5685	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5686	struct r600_bytecode_alu alu;
5687	int i, r;
5688
5689	for (i = 0; i < 4; i++) {
5690		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5691		alu.src[0].sel = ctx->temp_reg;
5692		alu.op = ALU_OP1_MOV;
5693		alu.dst.chan = i;
5694		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5695		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5696		if (i == 3)
5697			alu.last = 1;
5698		r = r600_bytecode_add_alu(ctx->bc, &alu);
5699		if (r)
5700			return r;
5701	}
5702	return 0;
5703}
5704
5705static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5706{
5707	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5708	struct r600_bytecode_alu alu;
5709	int i, r;
5710
5711	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5712	alu.op = ctx->inst_info->op;
5713	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5714		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5715	}
5716	alu.dst.sel = ctx->temp_reg;
5717	alu.dst.write = 1;
5718	alu.last = 1;
5719	r = r600_bytecode_add_alu(ctx->bc, &alu);
5720	if (r)
5721		return r;
5722	/* replicate result */
5723	return tgsi_helper_tempx_replicate(ctx);
5724}
5725
/* POW on Cayman: POW(a, b) = EXP2(b * LOG2(a)).  LOG and EXP are
 * replicated over the required slots; the MUL runs in a single slot. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp.xyz = LOG2(a.x), replicated over three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a)), written straight to the
		 * channels enabled in the destination writemask */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5775
5776static int tgsi_pow(struct r600_shader_ctx *ctx)
5777{
5778	struct r600_bytecode_alu alu;
5779	int r;
5780
5781	/* LOG2(a) */
5782	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5783	alu.op = ALU_OP1_LOG_IEEE;
5784	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5785	alu.dst.sel = ctx->temp_reg;
5786	alu.dst.write = 1;
5787	alu.last = 1;
5788	r = r600_bytecode_add_alu(ctx->bc, &alu);
5789	if (r)
5790		return r;
5791	/* b * LOG2(a) */
5792	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5793	alu.op = ALU_OP2_MUL;
5794	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5795	alu.src[1].sel = ctx->temp_reg;
5796	alu.dst.sel = ctx->temp_reg;
5797	alu.dst.write = 1;
5798	alu.last = 1;
5799	r = r600_bytecode_add_alu(ctx->bc, &alu);
5800	if (r)
5801		return r;
5802	/* POW(a,b) = EXP2(b * LOG2(a))*/
5803	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5804	alu.op = ALU_OP1_EXP_IEEE;
5805	alu.src[0].sel = ctx->temp_reg;
5806	alu.dst.sel = ctx->temp_reg;
5807	alu.dst.write = 1;
5808	alu.last = 1;
5809	r = r600_bytecode_add_alu(ctx->bc, &alu);
5810	if (r)
5811		return r;
5812	return tgsi_helper_tempx_replicate(ctx);
5813}
5814
5815static int emit_mul_int_op(struct r600_bytecode *bc,
5816			   struct r600_bytecode_alu *alu_src)
5817{
5818	struct r600_bytecode_alu alu;
5819	int i, r;
5820	alu = *alu_src;
5821	if (bc->chip_class == CAYMAN) {
5822		for (i = 0; i < 4; i++) {
5823			alu.dst.chan = i;
5824			alu.dst.write = (i == alu_src->dst.chan);
5825			alu.last = (i == 3);
5826
5827			r = r600_bytecode_add_alu(bc, &alu);
5828			if (r)
5829				return r;
5830		}
5831	} else {
5832		alu.last = 1;
5833		r = r600_bytecode_add_alu(bc, &alu);
5834		if (r)
5835			return r;
5836	}
5837	return 0;
5838}
5839
5840static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5841{
5842	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5843	struct r600_bytecode_alu alu;
5844	int i, r, j;
5845	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5846	int lasti = tgsi_last_instruction(write_mask);
5847	int tmp0 = ctx->temp_reg;
5848	int tmp1 = r600_get_temp(ctx);
5849	int tmp2 = r600_get_temp(ctx);
5850	int tmp3 = r600_get_temp(ctx);
5851	int tmp4 = 0;
5852
5853	/* Use additional temp if dst register and src register are the same */
5854	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5855	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5856		tmp4 = r600_get_temp(ctx);
5857	}
5858
5859	/* Unsigned path:
5860	 *
5861	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5862	 *
5863	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
5864	 * 2. tmp0.z = lo (tmp0.x * src2)
5865	 * 3. tmp0.w = -tmp0.z
5866	 * 4. tmp0.y = hi (tmp0.x * src2)
5867	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
5868	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
5869	 * 7. tmp1.x = tmp0.x - tmp0.w
5870	 * 8. tmp1.y = tmp0.x + tmp0.w
5871	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5872	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
5873	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
5874	 *
5875	 * 12. tmp0.w = src1 - tmp0.y       = r
5876	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
5877	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
5878	 *
5879	 * if DIV
5880	 *
5881	 *   15. tmp1.z = tmp0.z + 1			= q + 1
5882	 *   16. tmp1.w = tmp0.z - 1			= q - 1
5883	 *
5884	 * else MOD
5885	 *
5886	 *   15. tmp1.z = tmp0.w - src2			= r - src2
5887	 *   16. tmp1.w = tmp0.w + src2			= r + src2
5888	 *
5889	 * endif
5890	 *
5891	 * 17. tmp1.x = tmp1.x & tmp1.y
5892	 *
5893	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5894	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5895	 *
5896	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5897	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5898	 *
5899	 * Signed path:
5900	 *
5901	 * Same as unsigned, using abs values of the operands,
5902	 * and fixing the sign of the result in the end.
5903	 */
5904
5905	for (i = 0; i < 4; i++) {
5906		if (!(write_mask & (1<<i)))
5907			continue;
5908
5909		if (signed_op) {
5910
5911			/* tmp2.x = -src0 */
5912			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5913			alu.op = ALU_OP2_SUB_INT;
5914
5915			alu.dst.sel = tmp2;
5916			alu.dst.chan = 0;
5917			alu.dst.write = 1;
5918
5919			alu.src[0].sel = V_SQ_ALU_SRC_0;
5920
5921			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5922
5923			alu.last = 1;
5924			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5925				return r;
5926
5927			/* tmp2.y = -src1 */
5928			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5929			alu.op = ALU_OP2_SUB_INT;
5930
5931			alu.dst.sel = tmp2;
5932			alu.dst.chan = 1;
5933			alu.dst.write = 1;
5934
5935			alu.src[0].sel = V_SQ_ALU_SRC_0;
5936
5937			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5938
5939			alu.last = 1;
5940			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5941				return r;
5942
5943			/* tmp2.z sign bit is set if src0 and src2 signs are different */
5944			/* it will be a sign of the quotient */
5945			if (!mod) {
5946
5947				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5948				alu.op = ALU_OP2_XOR_INT;
5949
5950				alu.dst.sel = tmp2;
5951				alu.dst.chan = 2;
5952				alu.dst.write = 1;
5953
5954				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5955				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5956
5957				alu.last = 1;
5958				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5959					return r;
5960			}
5961
5962			/* tmp2.x = |src0| */
5963			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5964			alu.op = ALU_OP3_CNDGE_INT;
5965			alu.is_op3 = 1;
5966
5967			alu.dst.sel = tmp2;
5968			alu.dst.chan = 0;
5969			alu.dst.write = 1;
5970
5971			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5972			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5973			alu.src[2].sel = tmp2;
5974			alu.src[2].chan = 0;
5975
5976			alu.last = 1;
5977			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5978				return r;
5979
5980			/* tmp2.y = |src1| */
5981			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5982			alu.op = ALU_OP3_CNDGE_INT;
5983			alu.is_op3 = 1;
5984
5985			alu.dst.sel = tmp2;
5986			alu.dst.chan = 1;
5987			alu.dst.write = 1;
5988
5989			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5990			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5991			alu.src[2].sel = tmp2;
5992			alu.src[2].chan = 1;
5993
5994			alu.last = 1;
5995			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5996				return r;
5997
5998		}
5999
6000		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
6001		if (ctx->bc->chip_class == CAYMAN) {
6002			/* tmp3.x = u2f(src2) */
6003			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6004			alu.op = ALU_OP1_UINT_TO_FLT;
6005
6006			alu.dst.sel = tmp3;
6007			alu.dst.chan = 0;
6008			alu.dst.write = 1;
6009
6010			if (signed_op) {
6011				alu.src[0].sel = tmp2;
6012				alu.src[0].chan = 1;
6013			} else {
6014				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6015			}
6016
6017			alu.last = 1;
6018			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6019				return r;
6020
6021			/* tmp0.x = recip(tmp3.x) */
6022			for (j = 0 ; j < 3; j++) {
6023				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6024				alu.op = ALU_OP1_RECIP_IEEE;
6025
6026				alu.dst.sel = tmp0;
6027				alu.dst.chan = j;
6028				alu.dst.write = (j == 0);
6029
6030				alu.src[0].sel = tmp3;
6031				alu.src[0].chan = 0;
6032
6033				if (j == 2)
6034					alu.last = 1;
6035				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6036					return r;
6037			}
6038
6039			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6040			alu.op = ALU_OP2_MUL;
6041
6042			alu.src[0].sel = tmp0;
6043			alu.src[0].chan = 0;
6044
6045			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6046			alu.src[1].value = 0x4f800000;
6047
6048			alu.dst.sel = tmp3;
6049			alu.dst.write = 1;
6050			alu.last = 1;
6051			r = r600_bytecode_add_alu(ctx->bc, &alu);
6052			if (r)
6053				return r;
6054
6055			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6056			alu.op = ALU_OP1_FLT_TO_UINT;
6057
6058			alu.dst.sel = tmp0;
6059			alu.dst.chan = 0;
6060			alu.dst.write = 1;
6061
6062			alu.src[0].sel = tmp3;
6063			alu.src[0].chan = 0;
6064
6065			alu.last = 1;
6066			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6067				return r;
6068
6069		} else {
6070			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6071			alu.op = ALU_OP1_RECIP_UINT;
6072
6073			alu.dst.sel = tmp0;
6074			alu.dst.chan = 0;
6075			alu.dst.write = 1;
6076
6077			if (signed_op) {
6078				alu.src[0].sel = tmp2;
6079				alu.src[0].chan = 1;
6080			} else {
6081				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6082			}
6083
6084			alu.last = 1;
6085			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6086				return r;
6087		}
6088
6089		/* 2. tmp0.z = lo (tmp0.x * src2) */
6090		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6091		alu.op = ALU_OP2_MULLO_UINT;
6092
6093		alu.dst.sel = tmp0;
6094		alu.dst.chan = 2;
6095		alu.dst.write = 1;
6096
6097		alu.src[0].sel = tmp0;
6098		alu.src[0].chan = 0;
6099		if (signed_op) {
6100			alu.src[1].sel = tmp2;
6101			alu.src[1].chan = 1;
6102		} else {
6103			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6104		}
6105
6106		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6107			return r;
6108
6109		/* 3. tmp0.w = -tmp0.z */
6110		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6111		alu.op = ALU_OP2_SUB_INT;
6112
6113		alu.dst.sel = tmp0;
6114		alu.dst.chan = 3;
6115		alu.dst.write = 1;
6116
6117		alu.src[0].sel = V_SQ_ALU_SRC_0;
6118		alu.src[1].sel = tmp0;
6119		alu.src[1].chan = 2;
6120
6121		alu.last = 1;
6122		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6123			return r;
6124
6125		/* 4. tmp0.y = hi (tmp0.x * src2) */
6126		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6127		alu.op = ALU_OP2_MULHI_UINT;
6128
6129		alu.dst.sel = tmp0;
6130		alu.dst.chan = 1;
6131		alu.dst.write = 1;
6132
6133		alu.src[0].sel = tmp0;
6134		alu.src[0].chan = 0;
6135
6136		if (signed_op) {
6137			alu.src[1].sel = tmp2;
6138			alu.src[1].chan = 1;
6139		} else {
6140			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6141		}
6142
6143		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6144			return r;
6145
6146		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
6147		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6148		alu.op = ALU_OP3_CNDE_INT;
6149		alu.is_op3 = 1;
6150
6151		alu.dst.sel = tmp0;
6152		alu.dst.chan = 2;
6153		alu.dst.write = 1;
6154
6155		alu.src[0].sel = tmp0;
6156		alu.src[0].chan = 1;
6157		alu.src[1].sel = tmp0;
6158		alu.src[1].chan = 3;
6159		alu.src[2].sel = tmp0;
6160		alu.src[2].chan = 2;
6161
6162		alu.last = 1;
6163		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6164			return r;
6165
6166		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
6167		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6168		alu.op = ALU_OP2_MULHI_UINT;
6169
6170		alu.dst.sel = tmp0;
6171		alu.dst.chan = 3;
6172		alu.dst.write = 1;
6173
6174		alu.src[0].sel = tmp0;
6175		alu.src[0].chan = 2;
6176
6177		alu.src[1].sel = tmp0;
6178		alu.src[1].chan = 0;
6179
6180		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6181				return r;
6182
6183		/* 7. tmp1.x = tmp0.x - tmp0.w */
6184		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6185		alu.op = ALU_OP2_SUB_INT;
6186
6187		alu.dst.sel = tmp1;
6188		alu.dst.chan = 0;
6189		alu.dst.write = 1;
6190
6191		alu.src[0].sel = tmp0;
6192		alu.src[0].chan = 0;
6193		alu.src[1].sel = tmp0;
6194		alu.src[1].chan = 3;
6195
6196		alu.last = 1;
6197		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6198			return r;
6199
6200		/* 8. tmp1.y = tmp0.x + tmp0.w */
6201		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6202		alu.op = ALU_OP2_ADD_INT;
6203
6204		alu.dst.sel = tmp1;
6205		alu.dst.chan = 1;
6206		alu.dst.write = 1;
6207
6208		alu.src[0].sel = tmp0;
6209		alu.src[0].chan = 0;
6210		alu.src[1].sel = tmp0;
6211		alu.src[1].chan = 3;
6212
6213		alu.last = 1;
6214		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6215			return r;
6216
6217		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6218		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6219		alu.op = ALU_OP3_CNDE_INT;
6220		alu.is_op3 = 1;
6221
6222		alu.dst.sel = tmp0;
6223		alu.dst.chan = 0;
6224		alu.dst.write = 1;
6225
6226		alu.src[0].sel = tmp0;
6227		alu.src[0].chan = 1;
6228		alu.src[1].sel = tmp1;
6229		alu.src[1].chan = 1;
6230		alu.src[2].sel = tmp1;
6231		alu.src[2].chan = 0;
6232
6233		alu.last = 1;
6234		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6235			return r;
6236
6237		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
6238		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6239		alu.op = ALU_OP2_MULHI_UINT;
6240
6241		alu.dst.sel = tmp0;
6242		alu.dst.chan = 2;
6243		alu.dst.write = 1;
6244
6245		alu.src[0].sel = tmp0;
6246		alu.src[0].chan = 0;
6247
6248		if (signed_op) {
6249			alu.src[1].sel = tmp2;
6250			alu.src[1].chan = 0;
6251		} else {
6252			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6253		}
6254
6255		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6256			return r;
6257
6258		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
6259		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6260		alu.op = ALU_OP2_MULLO_UINT;
6261
6262		alu.dst.sel = tmp0;
6263		alu.dst.chan = 1;
6264		alu.dst.write = 1;
6265
6266		if (signed_op) {
6267			alu.src[0].sel = tmp2;
6268			alu.src[0].chan = 1;
6269		} else {
6270			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6271		}
6272
6273		alu.src[1].sel = tmp0;
6274		alu.src[1].chan = 2;
6275
6276		if ((r = emit_mul_int_op(ctx->bc, &alu)))
6277			return r;
6278
6279		/* 12. tmp0.w = src1 - tmp0.y       = r */
6280		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6281		alu.op = ALU_OP2_SUB_INT;
6282
6283		alu.dst.sel = tmp0;
6284		alu.dst.chan = 3;
6285		alu.dst.write = 1;
6286
6287		if (signed_op) {
6288			alu.src[0].sel = tmp2;
6289			alu.src[0].chan = 0;
6290		} else {
6291			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6292		}
6293
6294		alu.src[1].sel = tmp0;
6295		alu.src[1].chan = 1;
6296
6297		alu.last = 1;
6298		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6299			return r;
6300
6301		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
6302		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6303		alu.op = ALU_OP2_SETGE_UINT;
6304
6305		alu.dst.sel = tmp1;
6306		alu.dst.chan = 0;
6307		alu.dst.write = 1;
6308
6309		alu.src[0].sel = tmp0;
6310		alu.src[0].chan = 3;
6311		if (signed_op) {
6312			alu.src[1].sel = tmp2;
6313			alu.src[1].chan = 1;
6314		} else {
6315			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6316		}
6317
6318		alu.last = 1;
6319		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6320			return r;
6321
6322		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
6323		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6324		alu.op = ALU_OP2_SETGE_UINT;
6325
6326		alu.dst.sel = tmp1;
6327		alu.dst.chan = 1;
6328		alu.dst.write = 1;
6329
6330		if (signed_op) {
6331			alu.src[0].sel = tmp2;
6332			alu.src[0].chan = 0;
6333		} else {
6334			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6335		}
6336
6337		alu.src[1].sel = tmp0;
6338		alu.src[1].chan = 1;
6339
6340		alu.last = 1;
6341		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6342			return r;
6343
6344		if (mod) { /* UMOD */
6345
6346			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
6347			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6348			alu.op = ALU_OP2_SUB_INT;
6349
6350			alu.dst.sel = tmp1;
6351			alu.dst.chan = 2;
6352			alu.dst.write = 1;
6353
6354			alu.src[0].sel = tmp0;
6355			alu.src[0].chan = 3;
6356
6357			if (signed_op) {
6358				alu.src[1].sel = tmp2;
6359				alu.src[1].chan = 1;
6360			} else {
6361				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6362			}
6363
6364			alu.last = 1;
6365			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6366				return r;
6367
6368			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
6369			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6370			alu.op = ALU_OP2_ADD_INT;
6371
6372			alu.dst.sel = tmp1;
6373			alu.dst.chan = 3;
6374			alu.dst.write = 1;
6375
6376			alu.src[0].sel = tmp0;
6377			alu.src[0].chan = 3;
6378			if (signed_op) {
6379				alu.src[1].sel = tmp2;
6380				alu.src[1].chan = 1;
6381			} else {
6382				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6383			}
6384
6385			alu.last = 1;
6386			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6387				return r;
6388
6389		} else { /* UDIV */
6390
6391			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
6392			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6393			alu.op = ALU_OP2_ADD_INT;
6394
6395			alu.dst.sel = tmp1;
6396			alu.dst.chan = 2;
6397			alu.dst.write = 1;
6398
6399			alu.src[0].sel = tmp0;
6400			alu.src[0].chan = 2;
6401			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6402
6403			alu.last = 1;
6404			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6405				return r;
6406
6407			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
6408			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6409			alu.op = ALU_OP2_ADD_INT;
6410
6411			alu.dst.sel = tmp1;
6412			alu.dst.chan = 3;
6413			alu.dst.write = 1;
6414
6415			alu.src[0].sel = tmp0;
6416			alu.src[0].chan = 2;
6417			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6418
6419			alu.last = 1;
6420			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6421				return r;
6422
6423		}
6424
6425		/* 17. tmp1.x = tmp1.x & tmp1.y */
6426		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6427		alu.op = ALU_OP2_AND_INT;
6428
6429		alu.dst.sel = tmp1;
6430		alu.dst.chan = 0;
6431		alu.dst.write = 1;
6432
6433		alu.src[0].sel = tmp1;
6434		alu.src[0].chan = 0;
6435		alu.src[1].sel = tmp1;
6436		alu.src[1].chan = 1;
6437
6438		alu.last = 1;
6439		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6440			return r;
6441
6442		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
6443		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
6444		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6445		alu.op = ALU_OP3_CNDE_INT;
6446		alu.is_op3 = 1;
6447
6448		alu.dst.sel = tmp0;
6449		alu.dst.chan = 2;
6450		alu.dst.write = 1;
6451
6452		alu.src[0].sel = tmp1;
6453		alu.src[0].chan = 0;
6454		alu.src[1].sel = tmp0;
6455		alu.src[1].chan = mod ? 3 : 2;
6456		alu.src[2].sel = tmp1;
6457		alu.src[2].chan = 2;
6458
6459		alu.last = 1;
6460		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6461			return r;
6462
6463		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6464		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6465		alu.op = ALU_OP3_CNDE_INT;
6466		alu.is_op3 = 1;
6467
6468		if (signed_op) {
6469			alu.dst.sel = tmp0;
6470			alu.dst.chan = 2;
6471			alu.dst.write = 1;
6472		} else {
6473			if (tmp4 > 0) {
6474				alu.dst.sel = tmp4;
6475				alu.dst.chan = i;
6476				alu.dst.write = 1;
6477			} else {
6478				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6479			}
6480		}
6481
6482		alu.src[0].sel = tmp1;
6483		alu.src[0].chan = 1;
6484		alu.src[1].sel = tmp1;
6485		alu.src[1].chan = 3;
6486		alu.src[2].sel = tmp0;
6487		alu.src[2].chan = 2;
6488
6489		alu.last = 1;
6490		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6491			return r;
6492
6493		if (signed_op) {
6494
6495			/* fix the sign of the result */
6496
6497			if (mod) {
6498
6499				/* tmp0.x = -tmp0.z */
6500				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6501				alu.op = ALU_OP2_SUB_INT;
6502
6503				alu.dst.sel = tmp0;
6504				alu.dst.chan = 0;
6505				alu.dst.write = 1;
6506
6507				alu.src[0].sel = V_SQ_ALU_SRC_0;
6508				alu.src[1].sel = tmp0;
6509				alu.src[1].chan = 2;
6510
6511				alu.last = 1;
6512				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6513					return r;
6514
6515				/* sign of the remainder is the same as the sign of src0 */
6516				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6517				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6518				alu.op = ALU_OP3_CNDGE_INT;
6519				alu.is_op3 = 1;
6520
6521				if (tmp4 > 0) {
6522					alu.dst.sel = tmp4;
6523					alu.dst.chan = i;
6524					alu.dst.write = 1;
6525				} else {
6526					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6527				}
6528
6529				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6530				alu.src[1].sel = tmp0;
6531				alu.src[1].chan = 2;
6532				alu.src[2].sel = tmp0;
6533				alu.src[2].chan = 0;
6534
6535				alu.last = 1;
6536				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6537					return r;
6538
6539			} else {
6540
6541				/* tmp0.x = -tmp0.z */
6542				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6543				alu.op = ALU_OP2_SUB_INT;
6544
6545				alu.dst.sel = tmp0;
6546				alu.dst.chan = 0;
6547				alu.dst.write = 1;
6548
6549				alu.src[0].sel = V_SQ_ALU_SRC_0;
6550				alu.src[1].sel = tmp0;
6551				alu.src[1].chan = 2;
6552
6553				alu.last = 1;
6554				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6555					return r;
6556
6557				/* fix the quotient sign (same as the sign of src0*src1) */
6558				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6559				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6560				alu.op = ALU_OP3_CNDGE_INT;
6561				alu.is_op3 = 1;
6562
6563				if (tmp4 > 0) {
6564					alu.dst.sel = tmp4;
6565					alu.dst.chan = i;
6566					alu.dst.write = 1;
6567				} else {
6568					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6569				}
6570
6571				alu.src[0].sel = tmp2;
6572				alu.src[0].chan = 2;
6573				alu.src[1].sel = tmp0;
6574				alu.src[1].chan = 2;
6575				alu.src[2].sel = tmp0;
6576				alu.src[2].chan = 0;
6577
6578				alu.last = 1;
6579				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6580					return r;
6581			}
6582		}
6583	}
6584
6585	if (tmp4 > 0) {
6586		for (i = 0; i <= lasti; ++i) {
6587			if (!(write_mask & (1<<i)))
6588				continue;
6589
6590			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6591			alu.op = ALU_OP1_MOV;
6592			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6593			alu.src[0].sel = tmp4;
6594			alu.src[0].chan = i;
6595
6596			if (i == lasti)
6597				alu.last = 1;
6598			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6599				return r;
6600		}
6601	}
6602
6603	return 0;
6604}
6605
/* TGSI UDIV: per-channel unsigned integer division (tgsi_divmod with
 * mod=0 → quotient, signed_op=0 → unsigned operands). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
6610
/* TGSI UMOD: per-channel unsigned integer remainder (tgsi_divmod with
 * mod=1 → remainder, signed_op=0 → unsigned operands). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
6615
/* TGSI IDIV: per-channel signed integer division (tgsi_divmod with
 * mod=0 → quotient, signed_op=1 → operands made absolute first, sign
 * of the result fixed up afterwards). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
6620
/* TGSI IMOD: per-channel signed integer remainder (tgsi_divmod with
 * mod=1 → remainder, signed_op=1 → unsigned core with sign fix-up;
 * the remainder takes the sign of src0). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
6625
6626
6627static int tgsi_f2i(struct r600_shader_ctx *ctx)
6628{
6629	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6630	struct r600_bytecode_alu alu;
6631	int i, r;
6632	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6633	int last_inst = tgsi_last_instruction(write_mask);
6634
6635	for (i = 0; i < 4; i++) {
6636		if (!(write_mask & (1<<i)))
6637			continue;
6638
6639		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6640		alu.op = ALU_OP1_TRUNC;
6641
6642		alu.dst.sel = ctx->temp_reg;
6643		alu.dst.chan = i;
6644		alu.dst.write = 1;
6645
6646		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6647		if (i == last_inst)
6648			alu.last = 1;
6649		r = r600_bytecode_add_alu(ctx->bc, &alu);
6650		if (r)
6651			return r;
6652	}
6653
6654	for (i = 0; i < 4; i++) {
6655		if (!(write_mask & (1<<i)))
6656			continue;
6657
6658		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6659		alu.op = ctx->inst_info->op;
6660
6661		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6662
6663		alu.src[0].sel = ctx->temp_reg;
6664		alu.src[0].chan = i;
6665
6666		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6667			alu.last = 1;
6668		r = r600_bytecode_add_alu(ctx->bc, &alu);
6669		if (r)
6670			return r;
6671	}
6672
6673	return 0;
6674}
6675
6676static int tgsi_iabs(struct r600_shader_ctx *ctx)
6677{
6678	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6679	struct r600_bytecode_alu alu;
6680	int i, r;
6681	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6682	int last_inst = tgsi_last_instruction(write_mask);
6683
6684	/* tmp = -src */
6685	for (i = 0; i < 4; i++) {
6686		if (!(write_mask & (1<<i)))
6687			continue;
6688
6689		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6690		alu.op = ALU_OP2_SUB_INT;
6691
6692		alu.dst.sel = ctx->temp_reg;
6693		alu.dst.chan = i;
6694		alu.dst.write = 1;
6695
6696		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6697		alu.src[0].sel = V_SQ_ALU_SRC_0;
6698
6699		if (i == last_inst)
6700			alu.last = 1;
6701		r = r600_bytecode_add_alu(ctx->bc, &alu);
6702		if (r)
6703			return r;
6704	}
6705
6706	/* dst = (src >= 0 ? src : tmp) */
6707	for (i = 0; i < 4; i++) {
6708		if (!(write_mask & (1<<i)))
6709			continue;
6710
6711		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6712		alu.op = ALU_OP3_CNDGE_INT;
6713		alu.is_op3 = 1;
6714		alu.dst.write = 1;
6715
6716		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6717
6718		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6719		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6720		alu.src[2].sel = ctx->temp_reg;
6721		alu.src[2].chan = i;
6722
6723		if (i == last_inst)
6724			alu.last = 1;
6725		r = r600_bytecode_add_alu(ctx->bc, &alu);
6726		if (r)
6727			return r;
6728	}
6729	return 0;
6730}
6731
6732static int tgsi_issg(struct r600_shader_ctx *ctx)
6733{
6734	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6735	struct r600_bytecode_alu alu;
6736	int i, r;
6737	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6738	int last_inst = tgsi_last_instruction(write_mask);
6739
6740	/* tmp = (src >= 0 ? src : -1) */
6741	for (i = 0; i < 4; i++) {
6742		if (!(write_mask & (1<<i)))
6743			continue;
6744
6745		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6746		alu.op = ALU_OP3_CNDGE_INT;
6747		alu.is_op3 = 1;
6748
6749		alu.dst.sel = ctx->temp_reg;
6750		alu.dst.chan = i;
6751		alu.dst.write = 1;
6752
6753		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6754		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6755		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6756
6757		if (i == last_inst)
6758			alu.last = 1;
6759		r = r600_bytecode_add_alu(ctx->bc, &alu);
6760		if (r)
6761			return r;
6762	}
6763
6764	/* dst = (tmp > 0 ? 1 : tmp) */
6765	for (i = 0; i < 4; i++) {
6766		if (!(write_mask & (1<<i)))
6767			continue;
6768
6769		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6770		alu.op = ALU_OP3_CNDGT_INT;
6771		alu.is_op3 = 1;
6772		alu.dst.write = 1;
6773
6774		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6775
6776		alu.src[0].sel = ctx->temp_reg;
6777		alu.src[0].chan = i;
6778
6779		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6780
6781		alu.src[2].sel = ctx->temp_reg;
6782		alu.src[2].chan = i;
6783
6784		if (i == last_inst)
6785			alu.last = 1;
6786		r = r600_bytecode_add_alu(ctx->bc, &alu);
6787		if (r)
6788			return r;
6789	}
6790	return 0;
6791}
6792
6793
6794
6795static int tgsi_ssg(struct r600_shader_ctx *ctx)
6796{
6797	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6798	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6799	int last_inst = tgsi_last_instruction(write_mask);
6800	struct r600_bytecode_alu alu;
6801	int i, r;
6802
6803	/* tmp = (src > 0 ? 1 : src) */
6804	for (i = 0; i <= last_inst; i++) {
6805		if (!(write_mask & (1 << i)))
6806			continue;
6807		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6808		alu.op = ALU_OP3_CNDGT;
6809		alu.is_op3 = 1;
6810
6811		alu.dst.sel = ctx->temp_reg;
6812		alu.dst.chan = i;
6813
6814		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6815		alu.src[1].sel = V_SQ_ALU_SRC_1;
6816		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6817
6818		if (i == last_inst)
6819			alu.last = 1;
6820		r = r600_bytecode_add_alu(ctx->bc, &alu);
6821		if (r)
6822			return r;
6823	}
6824
6825	/* dst = (-tmp > 0 ? -1 : tmp) */
6826	for (i = 0; i <= last_inst; i++) {
6827		if (!(write_mask & (1 << i)))
6828			continue;
6829		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6830		alu.op = ALU_OP3_CNDGT;
6831		alu.is_op3 = 1;
6832		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6833
6834		alu.src[0].sel = ctx->temp_reg;
6835		alu.src[0].chan = i;
6836		alu.src[0].neg = 1;
6837
6838		alu.src[1].sel = V_SQ_ALU_SRC_1;
6839		alu.src[1].neg = 1;
6840
6841		alu.src[2].sel = ctx->temp_reg;
6842		alu.src[2].chan = i;
6843
6844		if (i == last_inst)
6845			alu.last = 1;
6846		r = r600_bytecode_add_alu(ctx->bc, &alu);
6847		if (r)
6848			return r;
6849	}
6850	return 0;
6851}
6852
/* TGSI BFI (bitfield insert): insert the low `bits` (src3) bits of the
 * insert value (src1) into the base (src0) at `offset` (src2).
 * Implemented as BFM (build mask) + LSHL (position insert) + BFI, with
 * a final CNDE select to handle bits >= 32, where the hardware mask ops
 * can't express a full-width insert and the raw insert value is used. */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (bits >= 32) per channel — flag for the final select */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1 = bfm(bits, offset) = ((1 << bits) - 1) << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2 = insert << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst = (t2 & t1) | (base & ~t1) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* bits >= 32 fix-up: dst = flag == 0 ? dst (BFI result) : raw insert.
	 * Note src[1] deliberately aliases the destination register just
	 * written by the BFI above. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6970
6971static int tgsi_msb(struct r600_shader_ctx *ctx)
6972{
6973	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6974	struct r600_bytecode_alu alu;
6975	int i, r, t1, t2;
6976
6977	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6978	int last_inst = tgsi_last_instruction(write_mask);
6979
6980	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6981		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6982
6983	t1 = ctx->temp_reg;
6984
6985	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6986	for (i = 0; i < 4; i++) {
6987		if (!(write_mask & (1<<i)))
6988			continue;
6989
6990		/* t1 = FFBH_INT / FFBH_UINT */
6991		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6992		alu.op = ctx->inst_info->op;
6993		alu.dst.sel = t1;
6994		alu.dst.chan = i;
6995		alu.dst.write = 1;
6996		alu.last = i == last_inst;
6997
6998		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6999
7000		r = r600_bytecode_add_alu(ctx->bc, &alu);
7001		if (r)
7002			return r;
7003	}
7004
7005	t2 = r600_get_temp(ctx);
7006
7007	for (i = 0; i < 4; i++) {
7008		if (!(write_mask & (1<<i)))
7009			continue;
7010
7011		/* t2 = 31 - t1 */
7012		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7013		alu.op = ALU_OP2_SUB_INT;
7014		alu.dst.sel = t2;
7015		alu.dst.chan = i;
7016		alu.dst.write = 1;
7017		alu.last = i == last_inst;
7018
7019		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7020		alu.src[0].value = 31;
7021		alu.src[1].sel = t1;
7022		alu.src[1].chan = i;
7023
7024		r = r600_bytecode_add_alu(ctx->bc, &alu);
7025		if (r)
7026			return r;
7027	}
7028
7029	for (i = 0; i < 4; i++) {
7030		if (!(write_mask & (1<<i)))
7031			continue;
7032
7033		/* result = t1 >= 0 ? t2 : t1 */
7034		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7035		alu.op = ALU_OP3_CNDGE_INT;
7036		alu.is_op3 = 1;
7037		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7038		alu.dst.chan = i;
7039		alu.dst.write = 1;
7040		alu.last = i == last_inst;
7041
7042		alu.src[0].sel = t1;
7043		alu.src[0].chan = i;
7044		alu.src[1].sel = t2;
7045		alu.src[1].chan = i;
7046		alu.src[2].sel = t1;
7047		alu.src[2].chan = i;
7048
7049		r = r600_bytecode_add_alu(ctx->bc, &alu);
7050		if (r)
7051			return r;
7052	}
7053
7054	return 0;
7055}
7056
/* Emit TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE for
 * Evergreen/Cayman.  Picks the pre-allocated i/j interpolator pair for
 * the input's interpolation mode, optionally adjusts the barycentrics
 * with screen-space gradients for the offset/sample variants, runs the
 * INTERP_ZW/INTERP_XY pairs, and finally copies the result out with
 * MOVs because INTERP cannot swizzle its destination. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
		ctx->shader->input[input].uses_interpolate_at_centroid = 1;
	}

	/* Two i/j pairs are packed per GPR: ij_index/2 selects the GPR,
	 * ij_index%2 selects which channel pair (0-1 or 2-3). */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* Fetch the (x, y) position of the requested sample. */
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch d(i,j)/dx into gradientsH and d(i,j)/dy into gradientsV
		 * via the texture unit's GET_GRADIENTS ops (dst z/w masked). */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp = gradH * offset_x + (i, j) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp = gradV * offset_y + temp */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Interpolate all four components: two INTERP_ZW/INTERP_XY groups
	 * of four slots each; only the slots that produce a needed result
	 * (i in 2..5) actually write. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
7219
7220
7221static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7222{
7223	struct r600_bytecode_alu alu;
7224	int i, r;
7225
7226	for (i = 0; i < 4; i++) {
7227		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7228		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7229			alu.op = ALU_OP0_NOP;
7230			alu.dst.chan = i;
7231		} else {
7232			alu.op = ALU_OP1_MOV;
7233			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7234			alu.src[0].sel = ctx->temp_reg;
7235			alu.src[0].chan = i;
7236		}
7237		if (i == 3) {
7238			alu.last = 1;
7239		}
7240		r = r600_bytecode_add_alu(ctx->bc, &alu);
7241		if (r)
7242			return r;
7243	}
7244	return 0;
7245}
7246
7247static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7248                                 unsigned writemask,
7249                                 struct r600_bytecode_alu_src *bc_src,
7250                                 const struct r600_shader_src *shader_src)
7251{
7252	struct r600_bytecode_alu alu;
7253	int i, r;
7254	int lasti = tgsi_last_instruction(writemask);
7255	int temp_reg = 0;
7256
7257	r600_bytecode_src(&bc_src[0], shader_src, 0);
7258	r600_bytecode_src(&bc_src[1], shader_src, 1);
7259	r600_bytecode_src(&bc_src[2], shader_src, 2);
7260	r600_bytecode_src(&bc_src[3], shader_src, 3);
7261
7262	if (bc_src->abs) {
7263		temp_reg = r600_get_temp(ctx);
7264
7265		for (i = 0; i < lasti + 1; i++) {
7266			if (!(writemask & (1 << i)))
7267				continue;
7268			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7269			alu.op = ALU_OP1_MOV;
7270			alu.dst.sel = temp_reg;
7271			alu.dst.chan = i;
7272			alu.dst.write = 1;
7273			alu.src[0] = bc_src[i];
7274			if (i == lasti) {
7275				alu.last = 1;
7276			}
7277			r = r600_bytecode_add_alu(ctx->bc, &alu);
7278			if (r)
7279				return r;
7280			memset(&bc_src[i], 0, sizeof(*bc_src));
7281			bc_src[i].sel = temp_reg;
7282			bc_src[i].chan = i;
7283		}
7284	}
7285	return 0;
7286}
7287
7288static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7289{
7290	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7291	struct r600_bytecode_alu alu;
7292	struct r600_bytecode_alu_src srcs[4][4];
7293	int i, j, r;
7294	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7295	unsigned op = ctx->inst_info->op;
7296
7297	if (op == ALU_OP3_MULADD_IEEE &&
7298	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7299		op = ALU_OP3_MULADD;
7300
7301	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7302		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7303					  srcs[j], &ctx->src[j]);
7304		if (r)
7305			return r;
7306	}
7307
7308	for (i = 0; i < lasti + 1; i++) {
7309		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7310			continue;
7311
7312		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7313		alu.op = op;
7314		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7315			alu.src[j] = srcs[j][i];
7316		}
7317
7318		if (dst == -1) {
7319			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7320		} else {
7321			alu.dst.sel = dst;
7322		}
7323		alu.dst.chan = i;
7324		alu.dst.write = 1;
7325		alu.is_op3 = 1;
7326		if (i == lasti) {
7327			alu.last = 1;
7328		}
7329		r = r600_bytecode_add_alu(ctx->bc, &alu);
7330		if (r)
7331			return r;
7332	}
7333	return 0;
7334}
7335
/* Standard OP3 entry point: emit to the instruction's own TGSI destination
 * (dst == -1 selects inst->Dst[0] inside tgsi_op3_dst). */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
7340
7341static int tgsi_dp(struct r600_shader_ctx *ctx)
7342{
7343	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7344	struct r600_bytecode_alu alu;
7345	int i, j, r;
7346	unsigned op = ctx->inst_info->op;
7347	if (op == ALU_OP2_DOT4_IEEE &&
7348	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7349		op = ALU_OP2_DOT4;
7350
7351	for (i = 0; i < 4; i++) {
7352		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7353		alu.op = op;
7354		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7355			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7356		}
7357
7358		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7359		alu.dst.chan = i;
7360		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7361		/* handle some special cases */
7362		switch (inst->Instruction.Opcode) {
7363		case TGSI_OPCODE_DP2:
7364			if (i > 1) {
7365				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7366				alu.src[0].chan = alu.src[1].chan = 0;
7367			}
7368			break;
7369		case TGSI_OPCODE_DP3:
7370			if (i > 2) {
7371				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7372				alu.src[0].chan = alu.src[1].chan = 0;
7373			}
7374			break;
7375		default:
7376			break;
7377		}
7378		if (i == 3) {
7379			alu.last = 1;
7380		}
7381		r = r600_bytecode_add_alu(ctx->bc, &alu);
7382		if (r)
7383			return r;
7384	}
7385	return 0;
7386}
7387
7388static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7389						    unsigned index)
7390{
7391	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7392	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7393		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7394		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7395		ctx->src[index].neg || ctx->src[index].abs ||
7396		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7397}
7398
/* Map a TGSI source register to its flat GPR number: the base offset of
 * its register file plus the index within that file. */
static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
					unsigned index)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
}
7405
/* Emit a buffer-texture TXF as a vertex-fetch (VFETCH) clause instruction.
 * src register 0 holds the element index, src register 1 selects the
 * buffer resource.  On pre-Evergreen chips, additional ALU ops are emitted
 * afterwards to post-process the fetched components using per-buffer
 * constants from the buffer-info constant buffer. */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	int id = tgsi_tex_get_src_gpr(ctx, 1);
	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		/* VFETCH can only read a GPR: copy the index source into the
		 * shared temp first (all four channels, MOV per channel). */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	/* Build the fetch itself: destination swizzle routes each fetched
	 * component only into write-enabled channels (7 = masked). */
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.use_const_fields = 1;
	vtx.buffer_index_mode = sampler_index_mode;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* Evergreen and newer need no fixup pass. */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* Pre-EG fixup 1: AND every written channel with a per-buffer mask
	 * word taken from the buffer-info constant buffer (two dwords per
	 * buffer, hence id * 2). */
	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pre-EG fixup 2: OR a per-buffer value from the second info dword
	 * into channel W.
	 * NOTE(review): the guard tests WriteMask & 3 (X/Y) while the OR
	 * writes channel 3 (W) — verify this mask is intentional. */
	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7502
/* Emit a TXQ (size query) for a buffer texture/image.  The resource id is
 * taken from src register 'reg_idx' plus 'offset'.  Pre-Evergreen chips
 * read the size from the driver-maintained buffer-info constant buffer;
 * Evergreen and newer use the GET_BUFFER_RESINFO fetch instruction. */
static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->bc->chip_class < EVERGREEN) {
		/* MOV the size constant straight into the destination. */
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* r600 we have them at channel 2 of the second dword */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		return 0;
	} else {
		/* Query the resource descriptor directly via a TC fetch.
		 * dst_sel 4 (zero) fills unqueried channels; 7 masks them. */
		struct r600_bytecode_vtx vtx;
		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
		vtx.buffer_id = id + eg_buffer_base;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.src_gpr = 0;
		vtx.mega_fetch_count = 16; /* no idea here really... */
		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
		vtx.data_format = FMT_32_32_32_32;
		vtx.buffer_index_mode = sampler_index_mode;

		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
			return r;
		return 0;
	}
}
7546
7547
7548static int tgsi_tex(struct r600_shader_ctx *ctx)
7549{
7550	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7551	struct r600_bytecode_tex tex;
7552	struct r600_bytecode_tex grad_offs[3];
7553	struct r600_bytecode_alu alu;
7554	unsigned src_gpr;
7555	int r, i, j, n_grad_offs = 0;
7556	int opcode;
7557	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7558				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7559				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7560				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7561
7562	bool txf_add_offsets = inst->Texture.NumOffsets &&
7563			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7564			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7565
7566	/* Texture fetch instructions can only use gprs as source.
7567	 * Also they cannot negate the source or take the absolute value */
7568	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7569                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
7570					     read_compressed_msaa || txf_add_offsets;
7571
7572	boolean src_loaded = FALSE;
7573	unsigned sampler_src_reg = 1;
7574	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7575	boolean has_txq_cube_array_z = false;
7576	unsigned sampler_index_mode;
7577	int array_index_offset_channel = -1;
7578
7579	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7580	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7581	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7582		if (inst->Dst[0].Register.WriteMask & 4) {
7583			ctx->shader->has_txq_cube_array_z_comp = true;
7584			has_txq_cube_array_z = true;
7585		}
7586
7587	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7588	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7589	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7590	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7591		sampler_src_reg = 2;
7592
7593	/* TGSI moves the sampler to src reg 3 for TXD */
7594	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7595		sampler_src_reg = 3;
7596
7597	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7598
7599	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7600
7601	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7602		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7603			if (ctx->bc->chip_class < EVERGREEN)
7604				ctx->shader->uses_tex_buffers = true;
7605			return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7606		}
7607		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7608			if (ctx->bc->chip_class < EVERGREEN)
7609				ctx->shader->uses_tex_buffers = true;
7610			return do_vtx_fetch_inst(ctx, src_requires_loading);
7611		}
7612	}
7613
7614	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7615		int out_chan;
7616		/* Add perspective divide */
7617		if (ctx->bc->chip_class == CAYMAN) {
7618			out_chan = 2;
7619			for (i = 0; i < 3; i++) {
7620				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7621				alu.op = ALU_OP1_RECIP_IEEE;
7622				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7623
7624				alu.dst.sel = ctx->temp_reg;
7625				alu.dst.chan = i;
7626				if (i == 2)
7627					alu.last = 1;
7628				if (out_chan == i)
7629					alu.dst.write = 1;
7630				r = r600_bytecode_add_alu(ctx->bc, &alu);
7631				if (r)
7632					return r;
7633			}
7634
7635		} else {
7636			out_chan = 3;
7637			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7638			alu.op = ALU_OP1_RECIP_IEEE;
7639			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7640
7641			alu.dst.sel = ctx->temp_reg;
7642			alu.dst.chan = out_chan;
7643			alu.last = 1;
7644			alu.dst.write = 1;
7645			r = r600_bytecode_add_alu(ctx->bc, &alu);
7646			if (r)
7647				return r;
7648		}
7649
7650		for (i = 0; i < 3; i++) {
7651			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7652			alu.op = ALU_OP2_MUL;
7653			alu.src[0].sel = ctx->temp_reg;
7654			alu.src[0].chan = out_chan;
7655			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7656			alu.dst.sel = ctx->temp_reg;
7657			alu.dst.chan = i;
7658			alu.dst.write = 1;
7659			r = r600_bytecode_add_alu(ctx->bc, &alu);
7660			if (r)
7661				return r;
7662		}
7663		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7664		alu.op = ALU_OP1_MOV;
7665		alu.src[0].sel = V_SQ_ALU_SRC_1;
7666		alu.src[0].chan = 0;
7667		alu.dst.sel = ctx->temp_reg;
7668		alu.dst.chan = 3;
7669		alu.last = 1;
7670		alu.dst.write = 1;
7671		r = r600_bytecode_add_alu(ctx->bc, &alu);
7672		if (r)
7673			return r;
7674		src_loaded = TRUE;
7675		src_gpr = ctx->temp_reg;
7676	}
7677
7678
7679	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7680	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7681	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7682	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7683	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7684
7685		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7686		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7687
7688		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7689		for (i = 0; i < 4; i++) {
7690			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7691			alu.op = ALU_OP2_CUBE;
7692			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7693			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7694			alu.dst.sel = ctx->temp_reg;
7695			alu.dst.chan = i;
7696			if (i == 3)
7697				alu.last = 1;
7698			alu.dst.write = 1;
7699			r = r600_bytecode_add_alu(ctx->bc, &alu);
7700			if (r)
7701				return r;
7702		}
7703
7704		/* tmp1.z = RCP_e(|tmp1.z|) */
7705		if (ctx->bc->chip_class == CAYMAN) {
7706			for (i = 0; i < 3; i++) {
7707				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7708				alu.op = ALU_OP1_RECIP_IEEE;
7709				alu.src[0].sel = ctx->temp_reg;
7710				alu.src[0].chan = 2;
7711				alu.src[0].abs = 1;
7712				alu.dst.sel = ctx->temp_reg;
7713				alu.dst.chan = i;
7714				if (i == 2)
7715					alu.dst.write = 1;
7716				if (i == 2)
7717					alu.last = 1;
7718				r = r600_bytecode_add_alu(ctx->bc, &alu);
7719				if (r)
7720					return r;
7721			}
7722		} else {
7723			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7724			alu.op = ALU_OP1_RECIP_IEEE;
7725			alu.src[0].sel = ctx->temp_reg;
7726			alu.src[0].chan = 2;
7727			alu.src[0].abs = 1;
7728			alu.dst.sel = ctx->temp_reg;
7729			alu.dst.chan = 2;
7730			alu.dst.write = 1;
7731			alu.last = 1;
7732			r = r600_bytecode_add_alu(ctx->bc, &alu);
7733			if (r)
7734				return r;
7735		}
7736
7737		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
7738		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
7739		 * muladd has no writemask, have to use another temp
7740		 */
7741		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7742		alu.op = ALU_OP3_MULADD;
7743		alu.is_op3 = 1;
7744
7745		alu.src[0].sel = ctx->temp_reg;
7746		alu.src[0].chan = 0;
7747		alu.src[1].sel = ctx->temp_reg;
7748		alu.src[1].chan = 2;
7749
7750		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7751		alu.src[2].chan = 0;
7752		alu.src[2].value = u_bitcast_f2u(1.5f);
7753
7754		alu.dst.sel = ctx->temp_reg;
7755		alu.dst.chan = 0;
7756		alu.dst.write = 1;
7757
7758		r = r600_bytecode_add_alu(ctx->bc, &alu);
7759		if (r)
7760			return r;
7761
7762		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7763		alu.op = ALU_OP3_MULADD;
7764		alu.is_op3 = 1;
7765
7766		alu.src[0].sel = ctx->temp_reg;
7767		alu.src[0].chan = 1;
7768		alu.src[1].sel = ctx->temp_reg;
7769		alu.src[1].chan = 2;
7770
7771		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7772		alu.src[2].chan = 0;
7773		alu.src[2].value = u_bitcast_f2u(1.5f);
7774
7775		alu.dst.sel = ctx->temp_reg;
7776		alu.dst.chan = 1;
7777		alu.dst.write = 1;
7778
7779		alu.last = 1;
7780		r = r600_bytecode_add_alu(ctx->bc, &alu);
7781		if (r)
7782			return r;
7783		/* write initial compare value into Z component
7784		  - W src 0 for shadow cube
7785		  - X src 1 for shadow cube array */
7786		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7787		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7788			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7789			alu.op = ALU_OP1_MOV;
7790			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7791				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7792			else
7793				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7794			alu.dst.sel = ctx->temp_reg;
7795			alu.dst.chan = 2;
7796			alu.dst.write = 1;
7797			alu.last = 1;
7798			r = r600_bytecode_add_alu(ctx->bc, &alu);
7799			if (r)
7800				return r;
7801		}
7802
7803		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7804		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7805			if (ctx->bc->chip_class >= EVERGREEN) {
7806				int mytmp = r600_get_temp(ctx);
7807				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7808				alu.op = ALU_OP1_MOV;
7809				alu.src[0].sel = ctx->temp_reg;
7810				alu.src[0].chan = 3;
7811				alu.dst.sel = mytmp;
7812				alu.dst.chan = 0;
7813				alu.dst.write = 1;
7814				alu.last = 1;
7815				r = r600_bytecode_add_alu(ctx->bc, &alu);
7816				if (r)
7817					return r;
7818
7819				/* Evaluate the array index according to floor(idx + 0.5). This
7820				 * needs to be done before merging the face select value, because
7821				 * otherwise the fractional part of the array index will interfere
7822				 * with the face select value */
7823				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7824				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7825				alu.op = ALU_OP1_RNDNE;
7826				alu.dst.sel = ctx->temp_reg;
7827				alu.dst.chan = 3;
7828				alu.dst.write = 1;
7829				alu.last = 1;
7830				r = r600_bytecode_add_alu(ctx->bc, &alu);
7831				if (r)
7832					return r;
7833
7834				/* Because the array slice index and the cube face index are merged
7835				 * into one value we have to make sure the array slice index is >= 0,
7836				 * otherwise the face selection will fail */
7837				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7838				alu.op = ALU_OP2_MAX;
7839				alu.src[0].sel = ctx->temp_reg;
7840				alu.src[0].chan = 3;
7841				alu.src[1].sel = V_SQ_ALU_SRC_0;
7842				alu.dst.sel = ctx->temp_reg;
7843				alu.dst.chan = 3;
7844				alu.dst.write = 1;
7845				alu.last = 1;
7846				r = r600_bytecode_add_alu(ctx->bc, &alu);
7847				if (r)
7848					return r;
7849
7850				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7851				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7852				alu.op = ALU_OP3_MULADD;
7853				alu.is_op3 = 1;
7854				alu.src[0].sel = ctx->temp_reg;
7855				alu.src[0].chan = 3;
7856				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7857				alu.src[1].chan = 0;
7858				alu.src[1].value = u_bitcast_f2u(8.0f);
7859				alu.src[2].sel = mytmp;
7860				alu.src[2].chan = 0;
7861				alu.dst.sel = ctx->temp_reg;
7862				alu.dst.chan = 3;
7863				alu.dst.write = 1;
7864				alu.last = 1;
7865				r = r600_bytecode_add_alu(ctx->bc, &alu);
7866				if (r)
7867					return r;
7868			} else if (ctx->bc->chip_class < EVERGREEN) {
7869				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7870				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7871				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7872				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7873				tex.src_gpr = r600_get_temp(ctx);
7874				tex.src_sel_x = 0;
7875				tex.src_sel_y = 0;
7876				tex.src_sel_z = 0;
7877				tex.src_sel_w = 0;
7878				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7879				tex.coord_type_x = 1;
7880				tex.coord_type_y = 1;
7881				tex.coord_type_z = 1;
7882				tex.coord_type_w = 1;
7883				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7884				alu.op = ALU_OP1_MOV;
7885				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7886				alu.dst.sel = tex.src_gpr;
7887				alu.dst.chan = 0;
7888				alu.last = 1;
7889				alu.dst.write = 1;
7890				r = r600_bytecode_add_alu(ctx->bc, &alu);
7891				if (r)
7892					return r;
7893
7894				r = r600_bytecode_add_tex(ctx->bc, &tex);
7895				if (r)
7896					return r;
7897			}
7898
7899		}
7900
7901		/* for cube forms of lod and bias we need to route things */
7902		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7903		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7904		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7905		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7906			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7907			alu.op = ALU_OP1_MOV;
7908			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7909			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7910				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7911			else
7912				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7913			alu.dst.sel = ctx->temp_reg;
7914			alu.dst.chan = 2;
7915			alu.last = 1;
7916			alu.dst.write = 1;
7917			r = r600_bytecode_add_alu(ctx->bc, &alu);
7918			if (r)
7919				return r;
7920		}
7921
7922		src_loaded = TRUE;
7923		src_gpr = ctx->temp_reg;
7924	}
7925
7926	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7927		int temp_h = 0, temp_v = 0;
7928		int start_val = 0;
7929
7930		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7931		if (src_loaded == TRUE)
7932			start_val = 1;
7933		else
7934			src_loaded = TRUE;
7935		for (i = start_val; i < 3; i++) {
7936			int treg = r600_get_temp(ctx);
7937
7938			if (i == 0)
7939				src_gpr = treg;
7940			else if (i == 1)
7941				temp_h = treg;
7942			else
7943				temp_v = treg;
7944
7945			for (j = 0; j < 4; j++) {
7946				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7947				alu.op = ALU_OP1_MOV;
7948                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7949                                alu.dst.sel = treg;
7950                                alu.dst.chan = j;
7951                                if (j == 3)
7952                                   alu.last = 1;
7953                                alu.dst.write = 1;
7954                                r = r600_bytecode_add_alu(ctx->bc, &alu);
7955                                if (r)
7956                                    return r;
7957			}
7958		}
7959		for (i = 1; i < 3; i++) {
7960			/* set gradients h/v */
7961			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7962			memset(t, 0, sizeof(struct r600_bytecode_tex));
7963			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7964				FETCH_OP_SET_GRADIENTS_V;
7965			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7966			t->sampler_index_mode = sampler_index_mode;
7967			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7968			t->resource_index_mode = sampler_index_mode;
7969
7970			t->src_gpr = (i == 1) ? temp_h : temp_v;
7971			t->src_sel_x = 0;
7972			t->src_sel_y = 1;
7973			t->src_sel_z = 2;
7974			t->src_sel_w = 3;
7975
7976			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7977			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7978			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7979				t->coord_type_x = 1;
7980				t->coord_type_y = 1;
7981				t->coord_type_z = 1;
7982				t->coord_type_w = 1;
7983			}
7984		}
7985	}
7986
7987	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7988		/* Gather4 should follow the same rules as bilinear filtering, but the hardware
7989		 * incorrectly forces nearest filtering if the texture format is integer.
7990		 * The only effect it has on Gather4, which always returns 4 texels for
7991		 * bilinear filtering, is that the final coordinates are off by 0.5 of
7992		 * the texel size.
7993		 *
7994		 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7995		 * or (0.5 / size) from the normalized coordinates.
7996		 */
7997		if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7998		    inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7999			int treg = r600_get_temp(ctx);
8000
8001			/* mov array and comparison oordinate to temp_reg if needed */
8002			if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8003			     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8004			     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
8005				int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
8006				for (i = 2; i <= end; i++) {
8007					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8008					alu.op = ALU_OP1_MOV;
8009					alu.dst.sel = ctx->temp_reg;
8010					alu.dst.chan = i;
8011					alu.dst.write = 1;
8012					alu.last = (i == end);
8013					r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8014					r = r600_bytecode_add_alu(ctx->bc, &alu);
8015					if (r)
8016						return r;
8017				}
8018			}
8019
8020			if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
8021			    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
8022				for (i = 0; i < 2; i++) {
8023					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8024					alu.op = ALU_OP2_ADD;
8025					alu.dst.sel = ctx->temp_reg;
8026					alu.dst.chan = i;
8027					alu.dst.write = 1;
8028					alu.last = i == 1;
8029					if (src_loaded) {
8030						alu.src[0].sel = ctx->temp_reg;
8031						alu.src[0].chan = i;
8032					} else
8033						r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8034					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8035					alu.src[1].neg = 1;
8036					r = r600_bytecode_add_alu(ctx->bc, &alu);
8037					if (r)
8038						return r;
8039				}
8040			} else {
8041				/* execute a TXQ */
8042				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8043				tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
8044				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8045				tex.sampler_index_mode = sampler_index_mode;
8046				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8047				tex.resource_index_mode = sampler_index_mode;
8048				tex.dst_gpr = treg;
8049				tex.src_sel_x = 4;
8050				tex.src_sel_y = 4;
8051				tex.src_sel_z = 4;
8052				tex.src_sel_w = 4;
8053				tex.dst_sel_x = 0;
8054				tex.dst_sel_y = 1;
8055				tex.dst_sel_z = 7;
8056				tex.dst_sel_w = 7;
8057				r = r600_bytecode_add_tex(ctx->bc, &tex);
8058				if (r)
8059					return r;
8060
8061				/* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
8062				if (ctx->bc->chip_class == CAYMAN) {
8063					/* */
8064					for (i = 0; i < 2; i++) {
8065						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8066						alu.op = ALU_OP1_INT_TO_FLT;
8067						alu.dst.sel = treg;
8068						alu.dst.chan = i;
8069						alu.dst.write = 1;
8070						alu.src[0].sel = treg;
8071						alu.src[0].chan = i;
8072						alu.last = (i == 1) ? 1 : 0;
8073						r = r600_bytecode_add_alu(ctx->bc, &alu);
8074						if (r)
8075							return r;
8076					}
8077					for (j = 0; j < 2; j++) {
8078						for (i = 0; i < 3; i++) {
8079							memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8080							alu.op = ALU_OP1_RECIP_IEEE;
8081							alu.src[0].sel = treg;
8082							alu.src[0].chan = j;
8083							alu.dst.sel = treg;
8084							alu.dst.chan = i;
8085							if (i == 2)
8086								alu.last = 1;
8087							if (i == j)
8088								alu.dst.write = 1;
8089							r = r600_bytecode_add_alu(ctx->bc, &alu);
8090							if (r)
8091								return r;
8092						}
8093					}
8094				} else {
8095					for (i = 0; i < 2; i++) {
8096						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8097						alu.op = ALU_OP1_INT_TO_FLT;
8098						alu.dst.sel = treg;
8099						alu.dst.chan = i;
8100						alu.dst.write = 1;
8101						alu.src[0].sel = treg;
8102						alu.src[0].chan = i;
8103						alu.last = 1;
8104						r = r600_bytecode_add_alu(ctx->bc, &alu);
8105						if (r)
8106							return r;
8107					}
8108					for (i = 0; i < 2; i++) {
8109						memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8110						alu.op = ALU_OP1_RECIP_IEEE;
8111						alu.src[0].sel = treg;
8112						alu.src[0].chan = i;
8113						alu.dst.sel = treg;
8114						alu.dst.chan = i;
8115						alu.last = 1;
8116						alu.dst.write = 1;
8117						r = r600_bytecode_add_alu(ctx->bc, &alu);
8118						if (r)
8119							return r;
8120					}
8121				}
8122				for (i = 0; i < 2; i++) {
8123					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8124					alu.op = ALU_OP3_MULADD;
8125					alu.is_op3 = 1;
8126					alu.dst.sel = ctx->temp_reg;
8127					alu.dst.chan = i;
8128					alu.dst.write = 1;
8129					alu.last = i == 1;
8130					alu.src[0].sel = treg;
8131					alu.src[0].chan = i;
8132					alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8133					alu.src[1].neg = 1;
8134					if (src_loaded) {
8135						alu.src[2].sel = ctx->temp_reg;
8136						alu.src[2].chan = i;
8137					} else
8138						r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8139					r = r600_bytecode_add_alu(ctx->bc, &alu);
8140					if (r)
8141						return r;
8142				}
8143			}
8144			src_loaded = TRUE;
8145			src_gpr = ctx->temp_reg;
8146		}
8147	}
8148
8149	if (src_requires_loading && !src_loaded) {
8150		for (i = 0; i < 4; i++) {
8151			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8152			alu.op = ALU_OP1_MOV;
8153			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8154			alu.dst.sel = ctx->temp_reg;
8155			alu.dst.chan = i;
8156			if (i == 3)
8157				alu.last = 1;
8158			alu.dst.write = 1;
8159			r = r600_bytecode_add_alu(ctx->bc, &alu);
8160			if (r)
8161				return r;
8162		}
8163		src_loaded = TRUE;
8164		src_gpr = ctx->temp_reg;
8165	}
8166
8167	/* get offset values */
8168	if (inst->Texture.NumOffsets) {
8169		assert(inst->Texture.NumOffsets == 1);
8170
8171		/* The texture offset feature doesn't work with the TXF instruction
8172		 * and must be emulated by adding the offset to the texture coordinates. */
8173		if (txf_add_offsets) {
8174			const struct tgsi_texture_offset *off = inst->TexOffsets;
8175
8176			switch (inst->Texture.Texture) {
8177			case TGSI_TEXTURE_3D:
8178				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8179				alu.op = ALU_OP2_ADD_INT;
8180				alu.src[0].sel = src_gpr;
8181				alu.src[0].chan = 2;
8182				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8183				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8184				alu.dst.sel = src_gpr;
8185				alu.dst.chan = 2;
8186				alu.dst.write = 1;
8187				alu.last = 1;
8188				r = r600_bytecode_add_alu(ctx->bc, &alu);
8189				if (r)
8190					return r;
8191				FALLTHROUGH;
8192
8193			case TGSI_TEXTURE_2D:
8194			case TGSI_TEXTURE_SHADOW2D:
8195			case TGSI_TEXTURE_RECT:
8196			case TGSI_TEXTURE_SHADOWRECT:
8197			case TGSI_TEXTURE_2D_ARRAY:
8198			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8199				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8200				alu.op = ALU_OP2_ADD_INT;
8201				alu.src[0].sel = src_gpr;
8202				alu.src[0].chan = 1;
8203				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8204				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8205				alu.dst.sel = src_gpr;
8206				alu.dst.chan = 1;
8207				alu.dst.write = 1;
8208				alu.last = 1;
8209				r = r600_bytecode_add_alu(ctx->bc, &alu);
8210				if (r)
8211					return r;
8212				FALLTHROUGH;
8213
8214			case TGSI_TEXTURE_1D:
8215			case TGSI_TEXTURE_SHADOW1D:
8216			case TGSI_TEXTURE_1D_ARRAY:
8217			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8218				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8219				alu.op = ALU_OP2_ADD_INT;
8220				alu.src[0].sel = src_gpr;
8221				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8222				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8223				alu.dst.sel = src_gpr;
8224				alu.dst.write = 1;
8225				alu.last = 1;
8226				r = r600_bytecode_add_alu(ctx->bc, &alu);
8227				if (r)
8228					return r;
8229				break;
8230				/* texture offsets do not apply to other texture targets */
8231			}
8232		} else {
8233			switch (inst->Texture.Texture) {
8234			case TGSI_TEXTURE_3D:
8235				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8236				FALLTHROUGH;
8237			case TGSI_TEXTURE_2D:
8238			case TGSI_TEXTURE_SHADOW2D:
8239			case TGSI_TEXTURE_RECT:
8240			case TGSI_TEXTURE_SHADOWRECT:
8241			case TGSI_TEXTURE_2D_ARRAY:
8242			case TGSI_TEXTURE_SHADOW2D_ARRAY:
8243				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8244				FALLTHROUGH;
8245			case TGSI_TEXTURE_1D:
8246			case TGSI_TEXTURE_SHADOW1D:
8247			case TGSI_TEXTURE_1D_ARRAY:
8248			case TGSI_TEXTURE_SHADOW1D_ARRAY:
8249				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8250			}
8251		}
8252	}
8253
8254	/* Obtain the sample index for reading a compressed MSAA color texture.
8255	 * To read the FMASK, we use the ldfptr instruction, which tells us
8256	 * where the samples are stored.
8257	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8258	 * which is the identity mapping. Each nibble says which physical sample
8259	 * should be fetched to get that sample.
8260	 *
8261	 * Assume src.z contains the sample index. It should be modified like this:
8262	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8263	 * Then fetch the texel with src.
8264	 */
8265	if (read_compressed_msaa) {
8266		unsigned sample_chan = 3;
8267		unsigned temp = r600_get_temp(ctx);
8268		assert(src_loaded);
8269
8270		/* temp.w = ldfptr() */
8271		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8272		tex.op = FETCH_OP_LD;
8273		tex.inst_mod = 1; /* to indicate this is ldfptr */
8274		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8275		tex.sampler_index_mode = sampler_index_mode;
8276		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8277		tex.resource_index_mode = sampler_index_mode;
8278		tex.src_gpr = src_gpr;
8279		tex.dst_gpr = temp;
8280		tex.dst_sel_x = 7; /* mask out these components */
8281		tex.dst_sel_y = 7;
8282		tex.dst_sel_z = 7;
8283		tex.dst_sel_w = 0; /* store X */
8284		tex.src_sel_x = 0;
8285		tex.src_sel_y = 1;
8286		tex.src_sel_z = 2;
8287		tex.src_sel_w = 3;
8288		tex.offset_x = offset_x;
8289		tex.offset_y = offset_y;
8290		tex.offset_z = offset_z;
8291		r = r600_bytecode_add_tex(ctx->bc, &tex);
8292		if (r)
8293			return r;
8294
8295		/* temp.x = sample_index*4 */
8296		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8297		alu.op = ALU_OP2_MULLO_INT;
8298		alu.src[0].sel = src_gpr;
8299		alu.src[0].chan = sample_chan;
8300		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8301		alu.src[1].value = 4;
8302		alu.dst.sel = temp;
8303		alu.dst.chan = 0;
8304		alu.dst.write = 1;
8305		r = emit_mul_int_op(ctx->bc, &alu);
8306		if (r)
8307			return r;
8308
8309		/* sample_index = temp.w >> temp.x */
8310		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8311		alu.op = ALU_OP2_LSHR_INT;
8312		alu.src[0].sel = temp;
8313		alu.src[0].chan = 3;
8314		alu.src[1].sel = temp;
8315		alu.src[1].chan = 0;
8316		alu.dst.sel = src_gpr;
8317		alu.dst.chan = sample_chan;
8318		alu.dst.write = 1;
8319		alu.last = 1;
8320		r = r600_bytecode_add_alu(ctx->bc, &alu);
8321		if (r)
8322			return r;
8323
8324		/* sample_index & 0xF */
8325		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8326		alu.op = ALU_OP2_AND_INT;
8327		alu.src[0].sel = src_gpr;
8328		alu.src[0].chan = sample_chan;
8329		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8330		alu.src[1].value = 0xF;
8331		alu.dst.sel = src_gpr;
8332		alu.dst.chan = sample_chan;
8333		alu.dst.write = 1;
8334		alu.last = 1;
8335		r = r600_bytecode_add_alu(ctx->bc, &alu);
8336		if (r)
8337			return r;
8338#if 0
8339		/* visualize the FMASK */
8340		for (i = 0; i < 4; i++) {
8341			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8342			alu.op = ALU_OP1_INT_TO_FLT;
8343			alu.src[0].sel = src_gpr;
8344			alu.src[0].chan = sample_chan;
8345			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8346			alu.dst.chan = i;
8347			alu.dst.write = 1;
8348			alu.last = 1;
8349			r = r600_bytecode_add_alu(ctx->bc, &alu);
8350			if (r)
8351				return r;
8352		}
8353		return 0;
8354#endif
8355	}
8356
8357	/* does this shader want a num layers from TXQ for a cube array? */
8358	if (has_txq_cube_array_z) {
8359		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8360
8361		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8362		alu.op = ALU_OP1_MOV;
8363
8364		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8365		if (ctx->bc->chip_class >= EVERGREEN) {
8366			/* with eg each dword is number of cubes */
8367			alu.src[0].sel += id / 4;
8368			alu.src[0].chan = id % 4;
8369		} else {
8370			/* r600 we have them at channel 2 of the second dword */
8371			alu.src[0].sel += (id * 2) + 1;
8372			alu.src[0].chan = 2;
8373		}
8374		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8375		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8376		alu.last = 1;
8377		r = r600_bytecode_add_alu(ctx->bc, &alu);
8378		if (r)
8379			return r;
8380		/* disable writemask from texture instruction */
8381		inst->Dst[0].Register.WriteMask &= ~4;
8382	}
8383
8384	opcode = ctx->inst_info->op;
8385	if (opcode == FETCH_OP_GATHER4 &&
8386		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8387		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8388		struct r600_bytecode_tex *t;
8389		opcode = FETCH_OP_GATHER4_O;
8390
8391		/* GATHER4_O/GATHER4_C_O use offset values loaded by
8392		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
8393		   encoded in the instruction are ignored. */
8394		t = &grad_offs[n_grad_offs++];
8395		memset(t, 0, sizeof(struct r600_bytecode_tex));
8396		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8397		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8398		t->sampler_index_mode = sampler_index_mode;
8399		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8400		t->resource_index_mode = sampler_index_mode;
8401
8402		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8403		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8404		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8405		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8406			 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
			/* make sure array index selector is 0, this is just a safety
			 * precaution because TGSI seems to emit something strange here */
8409			t->src_sel_z = 4;
8410		else
8411			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8412
8413		t->src_sel_w = 4;
8414
8415		t->dst_sel_x = 7;
8416		t->dst_sel_y = 7;
8417		t->dst_sel_z = 7;
8418		t->dst_sel_w = 7;
8419	}
8420
8421	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8422	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8423	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8424	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8425	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8426	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8427	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8428		switch (opcode) {
8429		case FETCH_OP_SAMPLE:
8430			opcode = FETCH_OP_SAMPLE_C;
8431			break;
8432		case FETCH_OP_SAMPLE_L:
8433			opcode = FETCH_OP_SAMPLE_C_L;
8434			break;
8435		case FETCH_OP_SAMPLE_LB:
8436			opcode = FETCH_OP_SAMPLE_C_LB;
8437			break;
8438		case FETCH_OP_SAMPLE_G:
8439			opcode = FETCH_OP_SAMPLE_C_G;
8440			break;
8441		/* Texture gather variants */
8442		case FETCH_OP_GATHER4:
8443			opcode = FETCH_OP_GATHER4_C;
8444			break;
8445		case FETCH_OP_GATHER4_O:
8446			opcode = FETCH_OP_GATHER4_C_O;
8447			break;
8448		}
8449	}
8450
8451	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8452	tex.op = opcode;
8453
8454	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8455	tex.sampler_index_mode = sampler_index_mode;
8456	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8457	tex.resource_index_mode = sampler_index_mode;
8458	tex.src_gpr = src_gpr;
8459	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8460
8461	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8462		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8463		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8464	}
8465
8466	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8467		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8468		tex.inst_mod = texture_component_select;
8469
8470		if (ctx->bc->chip_class == CAYMAN) {
8471			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8472			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8473			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8474			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8475		} else {
8476			/* GATHER4 result order is different from TGSI TG4 */
8477			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8478			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8479			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8480			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8481		}
8482	}
8483	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8484		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8485		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8486		tex.dst_sel_z = 7;
8487		tex.dst_sel_w = 7;
8488	}
8489	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8490		tex.dst_sel_x = 3;
8491		tex.dst_sel_y = 7;
8492		tex.dst_sel_z = 7;
8493		tex.dst_sel_w = 7;
8494	}
8495	else {
8496		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8497		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8498		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8499		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8500	}
8501
8502
8503	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8504		tex.src_sel_x = 4;
8505		tex.src_sel_y = 4;
8506		tex.src_sel_z = 4;
8507		tex.src_sel_w = 4;
8508	} else if (src_loaded) {
8509		tex.src_sel_x = 0;
8510		tex.src_sel_y = 1;
8511		tex.src_sel_z = 2;
8512		tex.src_sel_w = 3;
8513	} else {
8514		tex.src_sel_x = ctx->src[0].swizzle[0];
8515		tex.src_sel_y = ctx->src[0].swizzle[1];
8516		tex.src_sel_z = ctx->src[0].swizzle[2];
8517		tex.src_sel_w = ctx->src[0].swizzle[3];
8518		tex.src_rel = ctx->src[0].rel;
8519	}
8520
8521	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8522	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8523	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8524	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8525		tex.src_sel_x = 1;
8526		tex.src_sel_y = 0;
8527		tex.src_sel_z = 3;
8528		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8529	}
8530
8531	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8532	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8533		tex.coord_type_x = 1;
8534		tex.coord_type_y = 1;
8535	}
8536	tex.coord_type_z = 1;
8537	tex.coord_type_w = 1;
8538
8539	tex.offset_x = offset_x;
8540	tex.offset_y = offset_y;
8541	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8542		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8543		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8544		tex.offset_z = 0;
8545	}
8546	else {
8547		tex.offset_z = offset_z;
8548	}
8549
8550	/* Put the depth for comparison in W.
8551	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8552	 * Some instructions expect the depth in Z. */
8553	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8554	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8555	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8556	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8557	    opcode != FETCH_OP_SAMPLE_C_L &&
8558	    opcode != FETCH_OP_SAMPLE_C_LB) {
8559		tex.src_sel_w = tex.src_sel_z;
8560	}
8561
8562	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8563	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8564		if (opcode == FETCH_OP_SAMPLE_C_L ||
8565		    opcode == FETCH_OP_SAMPLE_C_LB) {
8566			/* the array index is read from Y */
8567			tex.coord_type_y = 0;
8568			array_index_offset_channel = tex.src_sel_y;
8569		} else {
8570			/* the array index is read from Z */
8571			tex.coord_type_z = 0;
8572			tex.src_sel_z = tex.src_sel_y;
8573			array_index_offset_channel = tex.src_sel_z;
8574		}
8575	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8576		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8577		tex.coord_type_z = 0;
8578		array_index_offset_channel = tex.src_sel_z;
8579	} else if  ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8580		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8581		    (ctx->bc->chip_class >= EVERGREEN))
8582		/* the array index is read from Z, coordinate will be corrected elsewhere  */
8583		tex.coord_type_z = 0;
8584
8585	/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8586	 * evaluate the array index  */
8587	if (array_index_offset_channel >= 0 &&
8588		 opcode != FETCH_OP_LD &&
8589		 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8590		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8591		alu.src[0].sel =  tex.src_gpr;
8592		alu.src[0].chan =  array_index_offset_channel;
8593		alu.src[0].rel = tex.src_rel;
8594		alu.op = ALU_OP1_RNDNE;
8595		alu.dst.sel = tex.src_gpr;
8596		alu.dst.chan = array_index_offset_channel;
8597		alu.dst.rel = tex.src_rel;
8598		alu.dst.write = 1;
8599		alu.last = 1;
8600		r = r600_bytecode_add_alu(ctx->bc, &alu);
8601		if (r)
8602			return r;
8603	}
8604
8605	/* mask unused source components */
8606	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8607		switch (inst->Texture.Texture) {
8608		case TGSI_TEXTURE_2D:
8609		case TGSI_TEXTURE_RECT:
8610			tex.src_sel_z = 7;
8611			tex.src_sel_w = 7;
8612			break;
8613		case TGSI_TEXTURE_1D_ARRAY:
8614			tex.src_sel_y = 7;
8615			tex.src_sel_w = 7;
8616			break;
8617		case TGSI_TEXTURE_1D:
8618			tex.src_sel_y = 7;
8619			tex.src_sel_z = 7;
8620			tex.src_sel_w = 7;
8621			break;
8622		}
8623	}
8624
8625	/* Emit set gradient and offset instructions. */
8626	for (i = 0; i < n_grad_offs; ++i) {
8627		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8628		if (r)
8629			return r;
8630	}
8631
8632	r = r600_bytecode_add_tex(ctx->bc, &tex);
8633	if (r)
8634		return r;
8635
8636	/* add shadow ambient support  - gallium doesn't do it yet */
8637	return 0;
8638}
8639
8640static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8641				  struct tgsi_full_src_register *src)
8642{
8643	unsigned i;
8644
8645	if (src->Register.Indirect) {
8646		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8647			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8648				return ctx->shader->atomics[i].hw_idx;
8649		}
8650	} else {
8651		uint32_t index = src->Register.Index;
8652		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8653			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8654				continue;
8655			if (index > ctx->shader->atomics[i].end)
8656				continue;
8657			if (index < ctx->shader->atomics[i].start)
8658				continue;
8659			uint32_t offset = (index - ctx->shader->atomics[i].start);
8660			return ctx->shader->atomics[i].hw_idx + offset;
8661		}
8662	}
8663	assert(0);
8664	return -1;
8665}
8666
/* Set up ctx->temp_reg and the UAV addressing for a GDS atomic-counter
 * access.
 *
 * Returns the counter's hardware UAV id in *uav_id_p and the CF index
 * mode in *uav_index_mode_p.  On Cayman the counter offset
 * (indirect index * 4 + uav_id * 4, or just uav_id * 4 when direct) is
 * computed into temp_reg.x; on other chips an indirect access instead
 * selects UAV index mode 2.
 * Returns 0 on success or a bytecode emission error. */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp_reg.x = addr_reg << 2 (index scaled by 4) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp_reg.x += uav_id * 4 */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* Direct access on Cayman: temp_reg.x = uav_id * 4 */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}
8714
8715static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8716{
8717	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8718	int r;
8719	struct r600_bytecode_gds gds;
8720	int uav_id = 0;
8721	int uav_index_mode = 0;
8722	bool is_cm = (ctx->bc->chip_class == CAYMAN);
8723
8724	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8725	if (r)
8726		return r;
8727
8728	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8729	gds.op = FETCH_OP_GDS_READ_RET;
8730	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8731	gds.uav_id = is_cm ? 0 : uav_id;
8732	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8733	gds.src_gpr = ctx->temp_reg;
8734	gds.src_sel_x = (is_cm) ? 0 : 4;
8735	gds.src_sel_y = 4;
8736	gds.src_sel_z = 4;
8737	gds.dst_sel_x = 0;
8738	gds.dst_sel_y = 7;
8739	gds.dst_sel_z = 7;
8740	gds.dst_sel_w = 7;
8741	gds.src_gpr2 = 0;
8742	gds.alloc_consume = !is_cm;
8743	r = r600_bytecode_add_gds(ctx->bc, &gds);
8744	if (r)
8745		return r;
8746
8747	ctx->bc->cf_last->vpm = 1;
8748	return 0;
8749}
8750
8751/* this fixes up 1D arrays properly */
8752static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8753{
8754	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8755	int r, i;
8756	struct r600_bytecode_alu alu;
8757	int temp_reg = r600_get_temp(ctx);
8758
8759	for (i = 0; i < 4; i++) {
8760		bool def_val = true, write_zero = false;
8761		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8762		alu.op = ALU_OP1_MOV;
8763		alu.dst.sel = temp_reg;
8764		alu.dst.chan = i;
8765
8766		switch (inst->Memory.Texture) {
8767		case TGSI_TEXTURE_BUFFER:
8768		case TGSI_TEXTURE_1D:
8769			if (i == 1 || i == 2 || i == 3) {
8770				write_zero = true;
8771			}
8772			break;
8773		case TGSI_TEXTURE_1D_ARRAY:
8774			if (i == 1 || i == 3)
8775				write_zero = true;
8776			else if (i == 2) {
8777				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8778				def_val = false;
8779			}
8780			break;
8781		case TGSI_TEXTURE_2D:
8782			if (i == 2 || i == 3)
8783				write_zero = true;
8784			break;
8785		default:
8786			if (i == 3)
8787				write_zero = true;
8788			break;
8789		}
8790
8791		if (write_zero) {
8792			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8793			alu.src[0].value = 0;
8794		} else if (def_val) {
8795			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8796		}
8797
8798		if (i == 3)
8799			alu.last = 1;
8800		alu.dst.write = 1;
8801		r = r600_bytecode_add_alu(ctx->bc, &alu);
8802		if (r)
8803			return r;
8804	}
8805	*idx_gpr = temp_reg;
8806	return 0;
8807}
8808
8809static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8810			     int temp_reg)
8811{
8812	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8813	int r;
8814	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8815		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8816		r = single_alu_op2(ctx, ALU_OP1_MOV,
8817				   temp_reg, 0,
8818				   V_SQ_ALU_SRC_LITERAL, value >> 2,
8819				   0, 0);
8820		if (r)
8821			return r;
8822	} else {
8823		struct r600_bytecode_alu alu;
8824		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8825		alu.op = ALU_OP2_LSHR_INT;
8826		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8827		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8828		alu.src[1].value = 2;
8829		alu.dst.sel = temp_reg;
8830		alu.dst.write = 1;
8831		alu.last = 1;
8832		r = r600_bytecode_add_alu(ctx->bc, &alu);
8833		if (r)
8834			return r;
8835	}
8836	return 0;
8837}
8838
8839static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8840{
8841	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8842	/* have to work out the offset into the RAT immediate return buffer */
8843	struct r600_bytecode_vtx vtx;
8844	struct r600_bytecode_cf *cf;
8845	int r;
8846	int temp_reg = r600_get_temp(ctx);
8847	unsigned rat_index_mode;
8848	unsigned base;
8849
8850	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8851	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8852
8853	r = load_buffer_coord(ctx, 1, temp_reg);
8854	if (r)
8855		return r;
8856	ctx->bc->cf_last->barrier = 1;
8857	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8858	vtx.op = FETCH_OP_VFETCH;
8859	vtx.buffer_id = inst->Src[0].Register.Index + base;
8860	vtx.buffer_index_mode = rat_index_mode;
8861	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8862	vtx.src_gpr = temp_reg;
8863	vtx.src_sel_x = 0;
8864	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8865	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
8866	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
8867	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
8868	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
8869	vtx.num_format_all = 1;
8870	vtx.format_comp_all = 1;
8871	vtx.srf_mode_all = 0;
8872
8873	if (inst->Dst[0].Register.WriteMask & 8) {
8874		vtx.data_format = FMT_32_32_32_32;
8875		vtx.use_const_fields = 0;
8876	} else if (inst->Dst[0].Register.WriteMask & 4) {
8877		vtx.data_format = FMT_32_32_32;
8878		vtx.use_const_fields = 0;
8879	} else if (inst->Dst[0].Register.WriteMask & 2) {
8880		vtx.data_format = FMT_32_32;
8881		vtx.use_const_fields = 0;
8882	} else {
8883		vtx.data_format = FMT_32;
8884		vtx.use_const_fields = 0;
8885	}
8886
8887	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8888	if (r)
8889		return r;
8890	cf = ctx->bc->cf_last;
8891	cf->barrier = 1;
8892	return 0;
8893}
8894
/* Lower a TGSI LOAD from an IMAGE resource (RAT).
 *
 * The read is done in two steps: a MEM_RAT NOP_RTN export that makes the
 * RAT return the element addressed by idx_gpr into the immediate return
 * buffer, then — after a WAIT_ACK — a vertex fetch from the immediate
 * RAT resource that moves the data into the destination GPR, swizzled
 * and converted according to the declared image format. */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* Build the element index (1D-array fixups, unused channels zeroed). */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Step 1: ask the RAT to return the addressed element. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* Wait for the return data to land before fetching it. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	/* Step 2: fetch the returned element from the immediate buffer,
	 * applying the image format's swizzle and conversion. */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
8967
8968static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8969{
8970	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8971	struct r600_bytecode_alu alu;
8972	int r;
8973	int temp_reg = r600_get_temp(ctx);
8974
8975	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8976	alu.op = ALU_OP1_MOV;
8977	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8978	alu.dst.sel = temp_reg;
8979	alu.dst.write = 1;
8980	alu.last = 1;
8981	r = r600_bytecode_add_alu(ctx->bc, &alu);
8982	if (r)
8983		return r;
8984
8985	r = do_lds_fetch_values(ctx, temp_reg,
8986				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8987	if (r)
8988		return r;
8989	return 0;
8990}
8991
8992static int tgsi_load(struct r600_shader_ctx *ctx)
8993{
8994	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8995	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8996		return tgsi_load_rat(ctx);
8997	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8998		return tgsi_load_gds(ctx);
8999	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9000		return tgsi_load_buffer(ctx);
9001	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9002		return tgsi_load_lds(ctx);
9003	return 0;
9004}
9005
/* Lower a TGSI STORE to a BUFFER resource.
 *
 * The value is written one enabled component at a time: for each set
 * bit in the writemask, the element index (buffer dword index + i) is
 * computed into temp_reg.x, the component value is moved into
 * ctx->temp_reg.x, and a MEM_RAT STORE_TYPED with comp_mask 1 is
 * emitted. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* treg2.x = dword element index of the store (byte offset / 4) */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Zero-initialize all four channels of the index temp. */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* temp_reg.x = element index of component i */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* ctx->temp_reg.x = value of component i */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* Emit the single-component typed store. */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* buffer RATs are placed after the image RATs */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}
9079
/* Emit a typed RAT (Random Access Target) store for a TGSI STORE whose
 * destination is an image resource.
 *
 * The texel coordinate is resolved by load_index_src(); the value being
 * stored must live in a GPR, so if Src[1] is not already a temporary it is
 * first copied channel-by-channel into ctx->temp_reg.  A CF_OP_MEM_RAT /
 * STORE_TYPED clause word is then emitted that writes all four components
 * (comp_mask = 0xf) of val_gpr at the address held in idx_gpr.
 */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* The RAT store reads its data from a GPR; anything that is not a
	 * temporary (constants, immediates, inputs) is staged through
	 * ctx->temp_reg first. */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	/* Indirect image indexing: load the index into the CF index register. */
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}
9136
/* Emit a TGSI STORE to local (shared) memory as LDS write instructions.
 *
 * temp_reg.x receives the base LDS byte address (Src[0].x); each further
 * enabled channel i gets address + 4*i in temp_reg chan i.  Enabled
 * channels of Src[1] are then written with LDS_WRITE, or — when two
 * consecutive channels (xy or zw) are both enabled — with a single
 * LDS_OP3_LDS_WRITE_REL that stores both values in one op.
 */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	/* Copy the base address into temp_reg.x (dst.chan is 0 from memset). */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* Per-channel byte addresses: temp_reg.i = base + 4*i. */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* Adjacent enabled pair (xy or zw): emit one WRITE_REL storing
		 * src.i and src.i+1; lds_idx = 1 is the relative offset for the
		 * second dword — then skip the partner channel. */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* Single-channel write: *(temp_reg.i) = src.i */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9205
9206static int tgsi_store(struct r600_shader_ctx *ctx)
9207{
9208	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9209	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9210		return tgsi_store_buffer_rat(ctx);
9211	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9212		return tgsi_store_lds(ctx);
9213	else
9214		return tgsi_store_rat(ctx);
9215}
9216
/* Emit an atomic operation on a RAT (image or SSBO) and fetch back the
 * pre-op value.
 *
 * The operand(s) are staged in ctx->thread_id_gpr: the value in .x and,
 * for CMPXCHG, the second operand in .w (.z on CAYMAN).  A CF_OP_MEM_RAT
 * with the atomic instruction is emitted as EXPORT_READ_IND (marked, so
 * it returns data), followed by WAIT_ACK, and finally a VFETCH from the
 * immediate-return buffer reads the old value into the destination GPR.
 */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	/* SSBOs live after the images in both the RAT and the
	 * immediate-return resource ranges. */
        if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG: Src[3] (new value) goes to thread_id_gpr.x,
		 * Src[2] (compare value) to .z on CAYMAN / .w otherwise. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* All other atomics take a single operand in thread_id_gpr.x. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	/* mark + WAIT_ACK: the atomic returns data, so we must wait for the
	 * ack before fetching the old value back. */
	cf->mark = 1;
	cf->output.elem_size = 0;
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		/* Fetch the returned value in the image's own format. */
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		/* SSBO atomics always return a raw 32-bit value. */
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	/* NOTE(review): the fetch address comes from thread_id_gpr.y —
	 * presumably the per-thread slot in the immediate return buffer. */
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
9350
9351static int get_gds_op(int opcode)
9352{
9353	switch (opcode) {
9354	case TGSI_OPCODE_ATOMUADD:
9355		return FETCH_OP_GDS_ADD_RET;
9356	case TGSI_OPCODE_ATOMAND:
9357		return FETCH_OP_GDS_AND_RET;
9358	case TGSI_OPCODE_ATOMOR:
9359		return FETCH_OP_GDS_OR_RET;
9360	case TGSI_OPCODE_ATOMXOR:
9361		return FETCH_OP_GDS_XOR_RET;
9362	case TGSI_OPCODE_ATOMUMIN:
9363		return FETCH_OP_GDS_MIN_UINT_RET;
9364	case TGSI_OPCODE_ATOMUMAX:
9365		return FETCH_OP_GDS_MAX_UINT_RET;
9366	case TGSI_OPCODE_ATOMXCHG:
9367		return FETCH_OP_GDS_XCHG_RET;
9368	case TGSI_OPCODE_ATOMCAS:
9369		return FETCH_OP_GDS_CMP_XCHG_RET;
9370	default:
9371		return -1;
9372	}
9373}
9374
/* Emit an atomic operation on a HW atomic counter via GDS.
 *
 * The operand channel layout in ctx->temp_reg differs between CAYMAN and
 * the other Evergreen parts: the value goes to chan 1 (CM) / chan 0 (EG),
 * and for CMP_XCHG the compare value to chan 2 (CM) / chan 1 (EG).  An
 * immediate negative add is rewritten as a GDS subtract of the absolute
 * value.  The old counter value is returned into the destination GPR.
 */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	/* Resolve the UAV id / index mode for the counter resource. */
	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		/* Stage the compare value (Src[3]) in temp chan 2 (CM) / 1 (EG). */
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* Adding a negative immediate is emitted as a GDS subtract of
		 * its absolute value. */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* CAYMAN addresses GDS directly; EG goes through a UAV id and uses
	 * alloc_consume. */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	gds.src_sel_x = is_cm ? 0 : 4;
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	/* Only .x of the returned value is written; 7 masks a channel off. */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
9477
9478static int get_lds_op(int opcode)
9479{
9480	switch (opcode) {
9481	case TGSI_OPCODE_ATOMUADD:
9482		return LDS_OP2_LDS_ADD_RET;
9483	case TGSI_OPCODE_ATOMAND:
9484		return LDS_OP2_LDS_AND_RET;
9485	case TGSI_OPCODE_ATOMOR:
9486		return LDS_OP2_LDS_OR_RET;
9487	case TGSI_OPCODE_ATOMXOR:
9488		return LDS_OP2_LDS_XOR_RET;
9489	case TGSI_OPCODE_ATOMUMIN:
9490		return LDS_OP2_LDS_MIN_UINT_RET;
9491	case TGSI_OPCODE_ATOMUMAX:
9492		return LDS_OP2_LDS_MAX_UINT_RET;
9493	case TGSI_OPCODE_ATOMIMIN:
9494		return LDS_OP2_LDS_MIN_INT_RET;
9495	case TGSI_OPCODE_ATOMIMAX:
9496		return LDS_OP2_LDS_MAX_INT_RET;
9497	case TGSI_OPCODE_ATOMXCHG:
9498		return LDS_OP2_LDS_XCHG_RET;
9499	case TGSI_OPCODE_ATOMCAS:
9500		return LDS_OP3_LDS_CMP_XCHG_RET;
9501	default:
9502		return -1;
9503	}
9504}
9505
9506static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9507{
9508	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9509	int lds_op = get_lds_op(inst->Instruction.Opcode);
9510	int r;
9511
9512	struct r600_bytecode_alu alu;
9513	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9514	alu.op = lds_op;
9515	alu.is_lds_idx_op = true;
9516	alu.last = 1;
9517	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9518	r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9519	if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9520		r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9521	else
9522		alu.src[2].sel = V_SQ_ALU_SRC_0;
9523	r = r600_bytecode_add_alu(ctx->bc, &alu);
9524	if (r)
9525		return r;
9526
9527	/* then read from LDS_OQ_A_POP */
9528	memset(&alu, 0, sizeof(alu));
9529
9530	alu.op = ALU_OP1_MOV;
9531	alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9532	alu.src[0].chan = 0;
9533	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9534	alu.dst.write = 1;
9535	alu.last = 1;
9536	r = r600_bytecode_add_alu(ctx->bc, &alu);
9537	if (r)
9538		return r;
9539
9540	return 0;
9541}
9542
9543static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9544{
9545	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9546	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9547		return tgsi_atomic_op_rat(ctx);
9548	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9549		return tgsi_atomic_op_gds(ctx);
9550	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9551		return tgsi_atomic_op_rat(ctx);
9552	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9553		return tgsi_atomic_op_lds(ctx);
9554	return 0;
9555}
9556
/* Emit a TGSI RESQ (resource size query).
 *
 * Buffers (and buffer images) are answered from the driver-maintained
 * buffer-info constants via r600_do_buffer_txq().  Cube arrays that ask
 * for .z (number of layers) also read from the buffer-info constant
 * buffer, since the TXQ instruction does not return it.  Everything else
 * is a regular texture-query instruction on the real resource.
 */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->chip_class < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		/* SSBO resources come after the images in the resource range. */
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	/* Route each enabled component to its channel; 7 masks it off. */
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
9629
/* TGSI_OPCODE_LRP: dst = src0 * src1 + (1 - src0) * src2, per channel.
 *
 * Fast path: when src0 is the literal 0.5 this reduces to
 * 0.5 * (src1 + src2), emitted as a single ADD with the divide-by-two
 * output modifier (omod = 3).  The general path computes (1 - src0) and
 * (1 - src0) * src2 into ctx->temp_reg, then finishes with
 * MULADD(src0, src1, temp).
 */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[2][4];
	unsigned i;
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		/* ADD(1, -src0) */
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */

	/* op3 instructions can't take all source forms directly; rewrite the
	 * operands into op3-compatible form first. */
	for (i = 0; i < 2; i++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[i], &ctx->src[i]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[1][i];
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9737
/* TGSI_OPCODE_CMP: dst = (src0 < 0) ? src1 : src2, per channel.
 *
 * Implemented as CNDGE(src0, src2, src1): pick src2 when src0 >= 0,
 * src1 otherwise.  Special case: if src0 carries both abs and neg
 * modifiers its value is -|x|, which is negative for every x != 0, so
 * the select collapses to "src1 unless x == 0"; the modifiers are
 * dropped and CNDE (equal-to-zero select) is used instead.
 */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	struct r600_bytecode_alu_src srcs[3][4];

	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* Rewrite the operands into op3-compatible source form. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
					  srcs[j], &ctx->src[j]);
		if (r)
			return r;
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		/* CNDGE/CNDE operand order: condition, "true" value (TGSI
		 * src2), "false" value (TGSI src1) — see function comment. */
		alu.src[0] = srcs[0][i];
		alu.src[1] = srcs[2][i];
		alu.src[2] = srcs[1][i];

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9785
9786static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9787{
9788	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9789	struct r600_bytecode_alu alu;
9790	int i, r;
9791	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9792
9793	for (i = 0; i < lasti + 1; i++) {
9794		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9795			continue;
9796
9797		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9798		alu.op = ALU_OP3_CNDE_INT;
9799		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9800		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9801		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9802		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9803		alu.dst.chan = i;
9804		alu.dst.write = 1;
9805		alu.is_op3 = 1;
9806		if (i == lasti)
9807			alu.last = 1;
9808		r = r600_bytecode_add_alu(ctx->bc, &alu);
9809		if (r)
9810			return r;
9811	}
9812	return 0;
9813}
9814
9815static int tgsi_exp(struct r600_shader_ctx *ctx)
9816{
9817	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9818	struct r600_bytecode_alu alu;
9819	int r;
9820	unsigned i;
9821
9822	/* result.x = 2^floor(src); */
9823	if (inst->Dst[0].Register.WriteMask & 1) {
9824		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9825
9826		alu.op = ALU_OP1_FLOOR;
9827		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9828
9829		alu.dst.sel = ctx->temp_reg;
9830		alu.dst.chan = 0;
9831		alu.dst.write = 1;
9832		alu.last = 1;
9833		r = r600_bytecode_add_alu(ctx->bc, &alu);
9834		if (r)
9835			return r;
9836
9837		if (ctx->bc->chip_class == CAYMAN) {
9838			for (i = 0; i < 3; i++) {
9839				alu.op = ALU_OP1_EXP_IEEE;
9840				alu.src[0].sel = ctx->temp_reg;
9841				alu.src[0].chan = 0;
9842
9843				alu.dst.sel = ctx->temp_reg;
9844				alu.dst.chan = i;
9845				alu.dst.write = i == 0;
9846				alu.last = i == 2;
9847				r = r600_bytecode_add_alu(ctx->bc, &alu);
9848				if (r)
9849					return r;
9850			}
9851		} else {
9852			alu.op = ALU_OP1_EXP_IEEE;
9853			alu.src[0].sel = ctx->temp_reg;
9854			alu.src[0].chan = 0;
9855
9856			alu.dst.sel = ctx->temp_reg;
9857			alu.dst.chan = 0;
9858			alu.dst.write = 1;
9859			alu.last = 1;
9860			r = r600_bytecode_add_alu(ctx->bc, &alu);
9861			if (r)
9862				return r;
9863		}
9864	}
9865
9866	/* result.y = tmp - floor(tmp); */
9867	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9868		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9869
9870		alu.op = ALU_OP1_FRACT;
9871		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9872
9873		alu.dst.sel = ctx->temp_reg;
9874#if 0
9875		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9876		if (r)
9877			return r;
9878#endif
9879		alu.dst.write = 1;
9880		alu.dst.chan = 1;
9881
9882		alu.last = 1;
9883
9884		r = r600_bytecode_add_alu(ctx->bc, &alu);
9885		if (r)
9886			return r;
9887	}
9888
9889	/* result.z = RoughApprox2ToX(tmp);*/
9890	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9891		if (ctx->bc->chip_class == CAYMAN) {
9892			for (i = 0; i < 3; i++) {
9893				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9894				alu.op = ALU_OP1_EXP_IEEE;
9895				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9896
9897				alu.dst.sel = ctx->temp_reg;
9898				alu.dst.chan = i;
9899				if (i == 2) {
9900					alu.dst.write = 1;
9901					alu.last = 1;
9902				}
9903
9904				r = r600_bytecode_add_alu(ctx->bc, &alu);
9905				if (r)
9906					return r;
9907			}
9908		} else {
9909			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9910			alu.op = ALU_OP1_EXP_IEEE;
9911			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9912
9913			alu.dst.sel = ctx->temp_reg;
9914			alu.dst.write = 1;
9915			alu.dst.chan = 2;
9916
9917			alu.last = 1;
9918
9919			r = r600_bytecode_add_alu(ctx->bc, &alu);
9920			if (r)
9921				return r;
9922		}
9923	}
9924
9925	/* result.w = 1.0;*/
9926	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9927		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9928
9929		alu.op = ALU_OP1_MOV;
9930		alu.src[0].sel = V_SQ_ALU_SRC_1;
9931		alu.src[0].chan = 0;
9932
9933		alu.dst.sel = ctx->temp_reg;
9934		alu.dst.chan = 3;
9935		alu.dst.write = 1;
9936		alu.last = 1;
9937		r = r600_bytecode_add_alu(ctx->bc, &alu);
9938		if (r)
9939			return r;
9940	}
9941	return tgsi_helper_copy(ctx, inst);
9942}
9943
9944static int tgsi_log(struct r600_shader_ctx *ctx)
9945{
9946	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9947	struct r600_bytecode_alu alu;
9948	int r;
9949	unsigned i;
9950
9951	/* result.x = floor(log2(|src|)); */
9952	if (inst->Dst[0].Register.WriteMask & 1) {
9953		if (ctx->bc->chip_class == CAYMAN) {
9954			for (i = 0; i < 3; i++) {
9955				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9956
9957				alu.op = ALU_OP1_LOG_IEEE;
9958				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9959				r600_bytecode_src_set_abs(&alu.src[0]);
9960
9961				alu.dst.sel = ctx->temp_reg;
9962				alu.dst.chan = i;
9963				if (i == 0)
9964					alu.dst.write = 1;
9965				if (i == 2)
9966					alu.last = 1;
9967				r = r600_bytecode_add_alu(ctx->bc, &alu);
9968				if (r)
9969					return r;
9970			}
9971
9972		} else {
9973			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9974
9975			alu.op = ALU_OP1_LOG_IEEE;
9976			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9977			r600_bytecode_src_set_abs(&alu.src[0]);
9978
9979			alu.dst.sel = ctx->temp_reg;
9980			alu.dst.chan = 0;
9981			alu.dst.write = 1;
9982			alu.last = 1;
9983			r = r600_bytecode_add_alu(ctx->bc, &alu);
9984			if (r)
9985				return r;
9986		}
9987
9988		alu.op = ALU_OP1_FLOOR;
9989		alu.src[0].sel = ctx->temp_reg;
9990		alu.src[0].chan = 0;
9991
9992		alu.dst.sel = ctx->temp_reg;
9993		alu.dst.chan = 0;
9994		alu.dst.write = 1;
9995		alu.last = 1;
9996
9997		r = r600_bytecode_add_alu(ctx->bc, &alu);
9998		if (r)
9999			return r;
10000	}
10001
10002	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
10003	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
10004
10005		if (ctx->bc->chip_class == CAYMAN) {
10006			for (i = 0; i < 3; i++) {
10007				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10008
10009				alu.op = ALU_OP1_LOG_IEEE;
10010				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10011				r600_bytecode_src_set_abs(&alu.src[0]);
10012
10013				alu.dst.sel = ctx->temp_reg;
10014				alu.dst.chan = i;
10015				if (i == 1)
10016					alu.dst.write = 1;
10017				if (i == 2)
10018					alu.last = 1;
10019
10020				r = r600_bytecode_add_alu(ctx->bc, &alu);
10021				if (r)
10022					return r;
10023			}
10024		} else {
10025			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10026
10027			alu.op = ALU_OP1_LOG_IEEE;
10028			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10029			r600_bytecode_src_set_abs(&alu.src[0]);
10030
10031			alu.dst.sel = ctx->temp_reg;
10032			alu.dst.chan = 1;
10033			alu.dst.write = 1;
10034			alu.last = 1;
10035
10036			r = r600_bytecode_add_alu(ctx->bc, &alu);
10037			if (r)
10038				return r;
10039		}
10040
10041		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10042
10043		alu.op = ALU_OP1_FLOOR;
10044		alu.src[0].sel = ctx->temp_reg;
10045		alu.src[0].chan = 1;
10046
10047		alu.dst.sel = ctx->temp_reg;
10048		alu.dst.chan = 1;
10049		alu.dst.write = 1;
10050		alu.last = 1;
10051
10052		r = r600_bytecode_add_alu(ctx->bc, &alu);
10053		if (r)
10054			return r;
10055
10056		if (ctx->bc->chip_class == CAYMAN) {
10057			for (i = 0; i < 3; i++) {
10058				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10059				alu.op = ALU_OP1_EXP_IEEE;
10060				alu.src[0].sel = ctx->temp_reg;
10061				alu.src[0].chan = 1;
10062
10063				alu.dst.sel = ctx->temp_reg;
10064				alu.dst.chan = i;
10065				if (i == 1)
10066					alu.dst.write = 1;
10067				if (i == 2)
10068					alu.last = 1;
10069
10070				r = r600_bytecode_add_alu(ctx->bc, &alu);
10071				if (r)
10072					return r;
10073			}
10074		} else {
10075			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10076			alu.op = ALU_OP1_EXP_IEEE;
10077			alu.src[0].sel = ctx->temp_reg;
10078			alu.src[0].chan = 1;
10079
10080			alu.dst.sel = ctx->temp_reg;
10081			alu.dst.chan = 1;
10082			alu.dst.write = 1;
10083			alu.last = 1;
10084
10085			r = r600_bytecode_add_alu(ctx->bc, &alu);
10086			if (r)
10087				return r;
10088		}
10089
10090		if (ctx->bc->chip_class == CAYMAN) {
10091			for (i = 0; i < 3; i++) {
10092				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10093				alu.op = ALU_OP1_RECIP_IEEE;
10094				alu.src[0].sel = ctx->temp_reg;
10095				alu.src[0].chan = 1;
10096
10097				alu.dst.sel = ctx->temp_reg;
10098				alu.dst.chan = i;
10099				if (i == 1)
10100					alu.dst.write = 1;
10101				if (i == 2)
10102					alu.last = 1;
10103
10104				r = r600_bytecode_add_alu(ctx->bc, &alu);
10105				if (r)
10106					return r;
10107			}
10108		} else {
10109			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10110			alu.op = ALU_OP1_RECIP_IEEE;
10111			alu.src[0].sel = ctx->temp_reg;
10112			alu.src[0].chan = 1;
10113
10114			alu.dst.sel = ctx->temp_reg;
10115			alu.dst.chan = 1;
10116			alu.dst.write = 1;
10117			alu.last = 1;
10118
10119			r = r600_bytecode_add_alu(ctx->bc, &alu);
10120			if (r)
10121				return r;
10122		}
10123
10124		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10125
10126		alu.op = ALU_OP2_MUL;
10127
10128		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10129		r600_bytecode_src_set_abs(&alu.src[0]);
10130
10131		alu.src[1].sel = ctx->temp_reg;
10132		alu.src[1].chan = 1;
10133
10134		alu.dst.sel = ctx->temp_reg;
10135		alu.dst.chan = 1;
10136		alu.dst.write = 1;
10137		alu.last = 1;
10138
10139		r = r600_bytecode_add_alu(ctx->bc, &alu);
10140		if (r)
10141			return r;
10142	}
10143
10144	/* result.z = log2(|src|);*/
10145	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10146		if (ctx->bc->chip_class == CAYMAN) {
10147			for (i = 0; i < 3; i++) {
10148				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10149
10150				alu.op = ALU_OP1_LOG_IEEE;
10151				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10152				r600_bytecode_src_set_abs(&alu.src[0]);
10153
10154				alu.dst.sel = ctx->temp_reg;
10155				if (i == 2)
10156					alu.dst.write = 1;
10157				alu.dst.chan = i;
10158				if (i == 2)
10159					alu.last = 1;
10160
10161				r = r600_bytecode_add_alu(ctx->bc, &alu);
10162				if (r)
10163					return r;
10164			}
10165		} else {
10166			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10167
10168			alu.op = ALU_OP1_LOG_IEEE;
10169			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10170			r600_bytecode_src_set_abs(&alu.src[0]);
10171
10172			alu.dst.sel = ctx->temp_reg;
10173			alu.dst.write = 1;
10174			alu.dst.chan = 2;
10175			alu.last = 1;
10176
10177			r = r600_bytecode_add_alu(ctx->bc, &alu);
10178			if (r)
10179				return r;
10180		}
10181	}
10182
10183	/* result.w = 1.0; */
10184	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10185		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10186
10187		alu.op = ALU_OP1_MOV;
10188		alu.src[0].sel = V_SQ_ALU_SRC_1;
10189		alu.src[0].chan = 0;
10190
10191		alu.dst.sel = ctx->temp_reg;
10192		alu.dst.chan = 3;
10193		alu.dst.write = 1;
10194		alu.last = 1;
10195
10196		r = r600_bytecode_add_alu(ctx->bc, &alu);
10197		if (r)
10198			return r;
10199	}
10200
10201	return tgsi_helper_copy(ctx, inst);
10202}
10203
10204static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
10205{
10206	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10207	struct r600_bytecode_alu alu;
10208	int r;
10209	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10210	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
10211
10212	assert(inst->Dst[0].Register.Index < 3);
10213	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10214
10215	switch (inst->Instruction.Opcode) {
10216	case TGSI_OPCODE_ARL:
10217		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
10218		break;
10219	case TGSI_OPCODE_ARR:
10220		alu.op = ALU_OP1_FLT_TO_INT;
10221		break;
10222	case TGSI_OPCODE_UARL:
10223		alu.op = ALU_OP1_MOV;
10224		break;
10225	default:
10226		assert(0);
10227		return -1;
10228	}
10229
10230	for (i = 0; i <= lasti; ++i) {
10231		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10232			continue;
10233		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10234		alu.last = i == lasti;
10235		alu.dst.sel = reg;
10236	        alu.dst.chan = i;
10237		alu.dst.write = 1;
10238		r = r600_bytecode_add_alu(ctx->bc, &alu);
10239		if (r)
10240			return r;
10241	}
10242
10243	if (inst->Dst[0].Register.Index > 0)
10244		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10245	else
10246		ctx->bc->ar_loaded = 0;
10247
10248	return 0;
10249}
/* Load the address register on r600/r700. Unlike tgsi_eg_arl() above, the
 * result always lands in bc->ar_reg. FLT_TO_INT executes only in the trans
 * unit on these chips, so each such instruction must terminate its ALU
 * group (alu.last = TRUE on every iteration).
 * Returns 0 on success or the error from r600_bytecode_add_alu().
 */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* Two steps: ar_reg = floor(src), then ar_reg = int(ar_reg). */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		/* NOTE(review): this second loop converts every channel up to
		 * lasti, ignoring the writemask — channels the first loop did
		 * not write get converted too; presumably harmless, verify. */
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* Round-to-int directly into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* Integer source: just move it into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Force a reload of AR before the next indirect access. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
10326
10327static int tgsi_opdst(struct r600_shader_ctx *ctx)
10328{
10329	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10330	struct r600_bytecode_alu alu;
10331	int i, r = 0;
10332
10333	for (i = 0; i < 4; i++) {
10334		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10335
10336		alu.op = ALU_OP2_MUL;
10337		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10338
10339		if (i == 0 || i == 3) {
10340			alu.src[0].sel = V_SQ_ALU_SRC_1;
10341		} else {
10342			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10343		}
10344
10345		if (i == 0 || i == 2) {
10346			alu.src[1].sel = V_SQ_ALU_SRC_1;
10347		} else {
10348			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10349		}
10350		if (i == 3)
10351			alu.last = 1;
10352		r = r600_bytecode_add_alu(ctx->bc, &alu);
10353		if (r)
10354			return r;
10355	}
10356	return 0;
10357}
10358
10359static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10360			   struct r600_bytecode_alu_src *src)
10361{
10362	struct r600_bytecode_alu alu;
10363	int r;
10364
10365	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10366	alu.op = opcode;
10367	alu.execute_mask = 1;
10368	alu.update_pred = 1;
10369
10370	alu.dst.sel = ctx->temp_reg;
10371	alu.dst.write = 1;
10372	alu.dst.chan = 0;
10373
10374	alu.src[0] = *src;
10375	alu.src[1].sel = V_SQ_ALU_SRC_0;
10376	alu.src[1].chan = 0;
10377
10378	alu.last = 1;
10379
10380	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10381	if (r)
10382		return r;
10383	return 0;
10384}
10385
/* Emit 'pops' branch-stack pops. When possible, fold them into the
 * preceding ALU clause by rewriting its op to ALU_POP_AFTER (one pop) or
 * ALU_POP2_AFTER (two pops); otherwise emit an explicit POP CF instruction
 * whose jump address points just past itself.
 * Always returns 0.
 */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts pops already attached to the last clause;
		 * 3 means "cannot fold into cf_last at all". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* The clause now pops; further instructions must not
			 * be appended to it. */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* More pops than the clause can absorb. */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
10418
/* Recompute the branch-stack usage implied by the current loop/push/wqm
 * counters (plus chip-specific reserved elements) and raise
 * stack->max_entries if needed. 'reason' is the FC_* kind of the operation
 * that triggered the update.
 * Returns the raw element count, before rounding up to whole entries.
 */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* Loops and WQM pushes consume a full entry each; plain pushes one
	 * element each. */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		FALLTHROUGH;
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}
10484
10485static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10486{
10487	switch(reason) {
10488	case FC_PUSH_VPM:
10489		--ctx->bc->stack.push;
10490		assert(ctx->bc->stack.push >= 0);
10491		break;
10492	case FC_PUSH_WQM:
10493		--ctx->bc->stack.push_wqm;
10494		assert(ctx->bc->stack.push_wqm >= 0);
10495		break;
10496	case FC_LOOP:
10497		--ctx->bc->stack.loop;
10498		assert(ctx->bc->stack.loop >= 0);
10499		break;
10500	default:
10501		assert(0);
10502		break;
10503	}
10504}
10505
10506static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10507{
10508	switch (reason) {
10509	case FC_PUSH_VPM:
10510		++ctx->bc->stack.push;
10511		break;
10512	case FC_PUSH_WQM:
10513		++ctx->bc->stack.push_wqm;
10514		break;
10515	case FC_LOOP:
10516		++ctx->bc->stack.loop;
10517		break;
10518	default:
10519		assert(0);
10520	}
10521
10522	return callstack_update_max_depth(ctx, reason);
10523}
10524
10525static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10526{
10527	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10528
10529	sp->mid = realloc((void *)sp->mid,
10530						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10531	sp->mid[sp->num_mid] = ctx->bc->cf_last;
10532	sp->num_mid++;
10533}
10534
10535static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10536{
10537	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10538	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10539	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10540	ctx->bc->fc_sp++;
10541}
10542
10543static void fc_poplevel(struct r600_shader_ctx *ctx)
10544{
10545	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10546	free(sp->mid);
10547	sp->mid = NULL;
10548	sp->num_mid = 0;
10549	sp->start = NULL;
10550	sp->type = 0;
10551	ctx->bc->fc_sp--;
10552}
10553
/* Disabled scaffolding for subroutine-return support (RETURN / LOOPRET);
 * never compiled. NOTE(review): the r600_bytecode_add_cfinst() calls below
 * contain stray ')' characters, so this block would not build as-is if
 * re-enabled. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
10601
/* Open an IF block: push the branch stack, emit the predicate ALU clause
 * (via emit_logic_pred) and the JUMP whose target is patched later by
 * tgsi_else()/tgsi_endif(), and push a frame on the software control-flow
 * stack. Applies chip-specific workarounds for ALU_PUSH_BEFORE.
 * Always returns 0.
 */
static int emit_if(struct r600_shader_ctx *ctx, int opcode,
		   struct r600_bytecode_alu_src *src)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		/* Workaround triggers when the push lands on (or right after)
		 * a stack-entry boundary. */
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type, src);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}
10638
10639static int tgsi_if(struct r600_shader_ctx *ctx)
10640{
10641	struct r600_bytecode_alu_src alu_src;
10642	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10643
10644	return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10645}
10646
10647static int tgsi_uif(struct r600_shader_ctx *ctx)
10648{
10649	struct r600_bytecode_alu_src alu_src;
10650	r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10651	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10652}
10653
10654static int tgsi_else(struct r600_shader_ctx *ctx)
10655{
10656	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10657	ctx->bc->cf_last->pop_count = 1;
10658
10659	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10660	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10661	return 0;
10662}
10663
10664static int tgsi_endif(struct r600_shader_ctx *ctx)
10665{
10666	int offset = 2;
10667	pops(ctx, 1);
10668	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10669		R600_ERR("if/endif unbalanced in shader\n");
10670		return -1;
10671	}
10672
10673	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
10674	if (ctx->bc->cf_last->eg_alu_extended)
10675			offset += 2;
10676
10677	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10678		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
10679		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10680	} else {
10681		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
10682	}
10683	fc_poplevel(ctx);
10684
10685	callstack_pop(ctx, FC_PUSH_VPM);
10686	return 0;
10687}
10688
10689static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10690{
10691	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10692	 * limited to 4096 iterations, like the other LOOP_* instructions. */
10693	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10694
10695	fc_pushlevel(ctx, FC_LOOP);
10696
10697	/* check stack depth */
10698	callstack_push(ctx, FC_LOOP);
10699	return 0;
10700}
10701
10702static int tgsi_endloop(struct r600_shader_ctx *ctx)
10703{
10704	int i;
10705
10706	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10707
10708	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10709		R600_ERR("loop/endloop in shader code are not paired.\n");
10710		return -EINVAL;
10711	}
10712
10713	/* fixup loop pointers - from r600isa
10714	   LOOP END points to CF after LOOP START,
10715	   LOOP START point to CF after LOOP END
10716	   BRK/CONT point to LOOP END CF
10717	*/
10718	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10719
10720	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10721
10722	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10723		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10724	}
10725	/* XXX add LOOPRET support */
10726	fc_poplevel(ctx);
10727	callstack_pop(ctx, FC_LOOP);
10728	return 0;
10729}
10730
10731static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10732{
10733	unsigned int fscp;
10734
10735	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10736	{
10737		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10738			break;
10739	}
10740
10741	if (fscp == 0) {
10742		R600_ERR("Break not inside loop/endloop pair\n");
10743		return -EINVAL;
10744	}
10745
10746	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10747
10748	fc_set_mid(ctx, fscp - 1);
10749
10750	return 0;
10751}
10752
/* Geometry-shader EMIT/CUT: emit the EMIT_VERTEX or CUT_VERTEX CF
 * instruction for the stream given by the literal first source operand.
 * For EMIT_VERTEX, the pending ring writes are flushed first and the ring
 * offset is advanced afterwards.
 * Returns 0 on success or an error from the bytecode emitters.
 */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
10770
10771static int tgsi_umad(struct r600_shader_ctx *ctx)
10772{
10773	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10774	struct r600_bytecode_alu alu;
10775	int i, j, r;
10776	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10777
10778	/* src0 * src1 */
10779	for (i = 0; i < lasti + 1; i++) {
10780		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10781			continue;
10782
10783		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10784
10785		alu.dst.chan = i;
10786		alu.dst.sel = ctx->temp_reg;
10787		alu.dst.write = 1;
10788
10789		alu.op = ALU_OP2_MULLO_UINT;
10790		for (j = 0; j < 2; j++) {
10791			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10792		}
10793
10794		alu.last = 1;
10795		r = emit_mul_int_op(ctx->bc, &alu);
10796		if (r)
10797			return r;
10798	}
10799
10800
10801	for (i = 0; i < lasti + 1; i++) {
10802		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10803			continue;
10804
10805		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10806		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10807
10808		alu.op = ALU_OP2_ADD_INT;
10809
10810		alu.src[0].sel = ctx->temp_reg;
10811		alu.src[0].chan = i;
10812
10813		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10814		if (i == lasti) {
10815			alu.last = 1;
10816		}
10817		r = r600_bytecode_add_alu(ctx->bc, &alu);
10818		if (r)
10819			return r;
10820	}
10821	return 0;
10822}
10823
10824static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10825{
10826	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10827	struct r600_bytecode_alu alu;
10828	int r, i;
10829	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10830
10831	/* temp.xy = f32_to_f16(src) */
10832	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10833	alu.op = ALU_OP1_FLT32_TO_FLT16;
10834	alu.dst.chan = 0;
10835	alu.dst.sel = ctx->temp_reg;
10836	alu.dst.write = 1;
10837	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10838	r = r600_bytecode_add_alu(ctx->bc, &alu);
10839	if (r)
10840		return r;
10841	alu.dst.chan = 1;
10842	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10843	alu.last = 1;
10844	r = r600_bytecode_add_alu(ctx->bc, &alu);
10845	if (r)
10846		return r;
10847
10848	/* dst.x = temp.y * 0x10000 + temp.x */
10849	for (i = 0; i < lasti + 1; i++) {
10850		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10851			continue;
10852
10853		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10854		alu.op = ALU_OP3_MULADD_UINT24;
10855		alu.is_op3 = 1;
10856		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10857		alu.last = i == lasti;
10858		alu.src[0].sel = ctx->temp_reg;
10859		alu.src[0].chan = 1;
10860		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10861		alu.src[1].value = 0x10000;
10862		alu.src[2].sel = ctx->temp_reg;
10863		alu.src[2].chan = 0;
10864		r = r600_bytecode_add_alu(ctx->bc, &alu);
10865		if (r)
10866			return r;
10867	}
10868
10869	return 0;
10870}
10871
10872static int tgsi_up2h(struct r600_shader_ctx *ctx)
10873{
10874	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10875	struct r600_bytecode_alu alu;
10876	int r, i;
10877	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10878
10879	/* temp.x = src.x */
10880	/* note: no need to mask out the high bits */
10881	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10882	alu.op = ALU_OP1_MOV;
10883	alu.dst.chan = 0;
10884	alu.dst.sel = ctx->temp_reg;
10885	alu.dst.write = 1;
10886	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10887	r = r600_bytecode_add_alu(ctx->bc, &alu);
10888	if (r)
10889		return r;
10890
10891	/* temp.y = src.x >> 16 */
10892	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10893	alu.op = ALU_OP2_LSHR_INT;
10894	alu.dst.chan = 1;
10895	alu.dst.sel = ctx->temp_reg;
10896	alu.dst.write = 1;
10897	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10898	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10899	alu.src[1].value = 16;
10900	alu.last = 1;
10901	r = r600_bytecode_add_alu(ctx->bc, &alu);
10902	if (r)
10903		return r;
10904
10905	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10906	for (i = 0; i < lasti + 1; i++) {
10907		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10908			continue;
10909		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10910		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10911		alu.op = ALU_OP1_FLT16_TO_FLT32;
10912		alu.src[0].sel = ctx->temp_reg;
10913		alu.src[0].chan = i % 2;
10914		alu.last = i == lasti;
10915		r = r600_bytecode_add_alu(ctx->bc, &alu);
10916		if (r)
10917			return r;
10918	}
10919
10920	return 0;
10921}
10922
/* TGSI BFE (bitfield extract): emit the 3-operand op via tgsi_op3_dst, then
 * fix up each channel so that a field width (src2) >= 32 yields src0
 * unchanged: temp = SETGE_INT(src2, 32), dst = temp ? src0 : bfe-result.
 * Returns 0 on success or an error from the emitters.
 */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	/* If the destination aliases src0 or src2, route the op3 result
	 * through a temp so the fixup below still reads the original
	 * operands. */
	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp.chan = (width >= 32) ? ~0 : 0 */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.chan = temp.chan ? src0.chan : bfe-result.chan */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* The op3 result lives either in the temp or already in dst. */
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10981
10982static int tgsi_clock(struct r600_shader_ctx *ctx)
10983{
10984	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10985	struct r600_bytecode_alu alu;
10986	int r;
10987
10988	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10989	alu.op = ALU_OP1_MOV;
10990	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10991	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10992	r = r600_bytecode_add_alu(ctx->bc, &alu);
10993	if (r)
10994		return r;
10995	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10996	alu.op = ALU_OP1_MOV;
10997	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10998	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10999	alu.last = 1;
11000	r = r600_bytecode_add_alu(ctx->bc, &alu);
11001	if (r)
11002		return r;
11003	return 0;
11004}
11005
11006static int emit_u64add(struct r600_shader_ctx *ctx, int op,
11007		       int treg,
11008		       int src0_sel, int src0_chan,
11009		       int src1_sel, int src1_chan)
11010{
11011	struct r600_bytecode_alu alu;
11012	int r;
11013	int opc;
11014
11015	if (op == ALU_OP2_ADD_INT)
11016		opc = ALU_OP2_ADDC_UINT;
11017	else
11018		opc = ALU_OP2_SUBB_UINT;
11019
11020	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11021	alu.op = op;            ;
11022	alu.dst.sel = treg;
11023	alu.dst.chan = 0;
11024	alu.dst.write = 1;
11025	alu.src[0].sel = src0_sel;
11026	alu.src[0].chan = src0_chan + 0;
11027	alu.src[1].sel = src1_sel;
11028	alu.src[1].chan = src1_chan + 0;
11029	alu.src[1].neg = 0;
11030	r = r600_bytecode_add_alu(ctx->bc, &alu);
11031	if (r)
11032		return r;
11033
11034	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11035	alu.op = op;
11036	alu.dst.sel = treg;
11037	alu.dst.chan = 1;
11038	alu.dst.write = 1;
11039	alu.src[0].sel = src0_sel;
11040	alu.src[0].chan = src0_chan + 1;
11041	alu.src[1].sel = src1_sel;
11042	alu.src[1].chan = src1_chan + 1;
11043	alu.src[1].neg = 0;
11044	r = r600_bytecode_add_alu(ctx->bc, &alu);
11045	if (r)
11046		return r;
11047
11048	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11049	alu.op = opc;
11050	alu.dst.sel = treg;
11051	alu.dst.chan = 2;
11052	alu.dst.write = 1;
11053	alu.last = 1;
11054	alu.src[0].sel = src0_sel;
11055	alu.src[0].chan = src0_chan + 0;
11056	alu.src[1].sel = src1_sel;
11057	alu.src[1].chan = src1_chan + 0;
11058	alu.src[1].neg = 0;
11059	r = r600_bytecode_add_alu(ctx->bc, &alu);
11060	if (r)
11061		return r;
11062
11063	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11064	alu.op = op;
11065	alu.dst.sel = treg;
11066	alu.dst.chan = 1;
11067	alu.dst.write = 1;
11068	alu.src[0].sel = treg;
11069	alu.src[0].chan = 1;
11070	alu.src[1].sel = treg;
11071	alu.src[1].chan = 2;
11072	alu.last = 1;
11073	r = r600_bytecode_add_alu(ctx->bc, &alu);
11074	if (r)
11075		return r;
11076	return 0;
11077}
11078
11079static int egcm_u64add(struct r600_shader_ctx *ctx)
11080{
11081	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11082	struct r600_bytecode_alu alu;
11083	int r;
11084	int treg = ctx->temp_reg;
11085	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
11086
11087	if (ctx->src[1].neg) {
11088		op = ALU_OP2_SUB_INT;
11089		opc = ALU_OP2_SUBB_UINT;
11090	}
11091	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11092	alu.op = op;            ;
11093	alu.dst.sel = treg;
11094	alu.dst.chan = 0;
11095	alu.dst.write = 1;
11096	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11097	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11098	alu.src[1].neg = 0;
11099	r = r600_bytecode_add_alu(ctx->bc, &alu);
11100	if (r)
11101		return r;
11102
11103	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11104	alu.op = op;
11105	alu.dst.sel = treg;
11106	alu.dst.chan = 1;
11107	alu.dst.write = 1;
11108	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11109	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11110	alu.src[1].neg = 0;
11111	r = r600_bytecode_add_alu(ctx->bc, &alu);
11112	if (r)
11113		return r;
11114
11115	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11116	alu.op = opc              ;
11117	alu.dst.sel = treg;
11118	alu.dst.chan = 2;
11119	alu.dst.write = 1;
11120	alu.last = 1;
11121	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11122	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11123	alu.src[1].neg = 0;
11124	r = r600_bytecode_add_alu(ctx->bc, &alu);
11125	if (r)
11126		return r;
11127
11128	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11129	alu.op = op;
11130	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11131	alu.src[0].sel = treg;
11132	alu.src[0].chan = 1;
11133	alu.src[1].sel = treg;
11134	alu.src[1].chan = 2;
11135	alu.last = 1;
11136	r = r600_bytecode_add_alu(ctx->bc, &alu);
11137	if (r)
11138		return r;
11139	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11140	alu.op = ALU_OP1_MOV;
11141	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11142	alu.src[0].sel = treg;
11143	alu.src[0].chan = 0;
11144	alu.last = 1;
11145	r = r600_bytecode_add_alu(ctx->bc, &alu);
11146	if (r)
11147		return r;
11148	return 0;
11149}
11150
11151
11152static int egcm_i64neg(struct r600_shader_ctx *ctx)
11153{
11154	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11155	struct r600_bytecode_alu alu;
11156	int r;
11157	int treg = ctx->temp_reg;
11158	const int op = ALU_OP2_SUB_INT;
11159	const int opc = ALU_OP2_SUBB_UINT;
11160
11161	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11162	alu.op = op;            ;
11163	alu.dst.sel = treg;
11164	alu.dst.chan = 0;
11165	alu.dst.write = 1;
11166	alu.src[0].sel = V_SQ_ALU_SRC_0;
11167	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
11168	alu.src[1].neg = 0;
11169	r = r600_bytecode_add_alu(ctx->bc, &alu);
11170	if (r)
11171		return r;
11172
11173	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11174	alu.op = op;
11175	alu.dst.sel = treg;
11176	alu.dst.chan = 1;
11177	alu.dst.write = 1;
11178	alu.src[0].sel = V_SQ_ALU_SRC_0;
11179	r600_bytecode_src(&alu.src[1], &ctx->src[0], 1);
11180	alu.src[1].neg = 0;
11181	r = r600_bytecode_add_alu(ctx->bc, &alu);
11182	if (r)
11183		return r;
11184
11185	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11186	alu.op = opc              ;
11187	alu.dst.sel = treg;
11188	alu.dst.chan = 2;
11189	alu.dst.write = 1;
11190	alu.last = 1;
11191	alu.src[0].sel = V_SQ_ALU_SRC_0;
11192	r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);
11193	alu.src[1].neg = 0;
11194	r = r600_bytecode_add_alu(ctx->bc, &alu);
11195	if (r)
11196		return r;
11197
11198	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11199	alu.op = op;
11200	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11201	alu.src[0].sel = treg;
11202	alu.src[0].chan = 1;
11203	alu.src[1].sel = treg;
11204	alu.src[1].chan = 2;
11205	alu.last = 1;
11206	r = r600_bytecode_add_alu(ctx->bc, &alu);
11207	if (r)
11208		return r;
11209	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11210	alu.op = ALU_OP1_MOV;
11211	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11212	alu.src[0].sel = treg;
11213	alu.src[0].chan = 0;
11214	alu.last = 1;
11215	r = r600_bytecode_add_alu(ctx->bc, &alu);
11216	if (r)
11217		return r;
11218	return 0;
11219}
11220
/* 64-bit unsigned multiply, keeping only the low 64 bits of the product:
   result.x = lo32(a.x * b.x)
   result.y = hi32(a.x * b.x) + lo32(a.x * b.y) + lo32(a.y * b.x)
   (the a.y * b.y term only contributes above bit 63 and is dropped)
*/
static int egcm_u64mul(struct r600_shader_ctx *ctx)
{
	/* Emit a 64-bit unsigned multiply using 32x32 partial products.
	 * temp.x/y hold the low/high words of a.lo * b.lo, temp.z/w the
	 * cross terms; the cross terms are folded into the high word.
	 * MUL ops go through emit_mul_int_op (chip-specific handling),
	 * plain adds/moves through r600_bytecode_add_alu. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;

	/* temp.x = mul_lo a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = mul_hi a.x, b.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULHI_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = mul a.x, b.y -- first cross term */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.w = mul a.y, b.x -- second cross term */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MULLO_UINT;
	alu.dst.sel = treg;
	alu.dst.chan = 3;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	r = emit_mul_int_op(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = temp.z + temp.w -- sum the cross terms */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 2;
	alu.src[1].sel = treg;
	alu.src[1].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = temp.y + temp.z -- fold cross terms into the high word */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_ADD_INT;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.x (both MOVs share one ALU group; last set below) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
11333
11334static int emit_u64sge(struct r600_shader_ctx *ctx,
11335		       int treg,
11336		       int src0_sel, int src0_base_chan,
11337		       int src1_sel, int src1_base_chan)
11338{
11339	int r;
11340	/* for 64-bit sge */
11341	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
11342	r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
11343			   treg, 1,
11344			   src0_sel, src0_base_chan + 1,
11345			   src1_sel, src1_base_chan + 1);
11346	if (r)
11347		return r;
11348
11349	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11350			   treg, 0,
11351			   src0_sel, src0_base_chan,
11352			   src1_sel, src1_base_chan);
11353	if (r)
11354		return r;
11355
11356	r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
11357			   treg, 2,
11358			   src0_sel, src0_base_chan + 1,
11359			   src1_sel, src1_base_chan + 1);
11360	if (r)
11361		return r;
11362
11363	r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11364			   treg, 0,
11365			   treg, 0,
11366			   treg, 2);
11367	if (r)
11368		return r;
11369
11370	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11371			   treg, 0,
11372			   treg, 0,
11373			   treg, 1);
11374	if (r)
11375		return r;
11376	return 0;
11377}
11378
/* This isn't a complete 64-bit divide -- it only handles a literal
 * denominator whose high 32 bits are zero, which is just enough for the
 * query-buffer-object (qbo) shaders to work. */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	/* Shift-subtract restoring division, fully unrolled at compile time
	 * since the denominator is a known literal.  The remainder lives in
	 * tmp_num.xy, the quotient is built up in tmp_num.zw.
	 * NOTE(review): each emit_if() below is paired with a tgsi_endif();
	 * keep the pairing intact when touching control flow here. */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division (writes only .xy) */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* zero the quotient accumulator tmp_num.zw */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg.x is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it already is */
	/* treg.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* if (num_hi >= denom_lo) -- 64-bit path needed */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb t0.x = msb(src[1].x) first -- computed on the CPU since
	   the denominator is a literal */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here: first pass reduces the numerator high word */
	for (i = 0; i < 31; i++) {
		/* treg.z = (i >= log2_denom) -- skip shifts below the MSB */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this shift on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* treg.y = tmp_num.y >= denom_lo_shl */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		/* if the shifted denominator fits, subtract it and set the
		   quotient bit */
		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* close the (num_hi >= denom_lo) branch */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll: full 64-bit shift-subtract on
	   the remaining remainder in tmp_num.xy */
	for (i = 0; i < 31; i++) {
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* 64-bit shifted denominator, split across treg.zw */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = (tmp_num.xy >= treg.zw) as a 64-bit compare */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* 64-bit subtract: sub_tmp.xy = tmp_num.xy - treg.zw */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set quotient bit in the low word */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* write quotient tmp_num.zw to dst.xy */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
11709
11710static int egcm_u64sne(struct r600_shader_ctx *ctx)
11711{
11712	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11713	struct r600_bytecode_alu alu;
11714	int r;
11715	int treg = ctx->temp_reg;
11716
11717	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11718	alu.op = ALU_OP2_SETNE_INT;
11719	alu.dst.sel = treg;
11720	alu.dst.chan = 0;
11721	alu.dst.write = 1;
11722	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11723	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11724	r = r600_bytecode_add_alu(ctx->bc, &alu);
11725	if (r)
11726		return r;
11727
11728	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11729	alu.op = ALU_OP2_SETNE_INT;
11730	alu.dst.sel = treg;
11731	alu.dst.chan = 1;
11732	alu.dst.write = 1;
11733	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11734	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11735	alu.last = 1;
11736	r = r600_bytecode_add_alu(ctx->bc, &alu);
11737	if (r)
11738		return r;
11739
11740	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11741	alu.op = ALU_OP2_OR_INT;
11742	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11743	alu.src[0].sel = treg;
11744	alu.src[0].chan = 0;
11745	alu.src[1].sel = treg;
11746	alu.src[1].chan = 1;
11747	alu.last = 1;
11748	r = r600_bytecode_add_alu(ctx->bc, &alu);
11749	if (r)
11750		return r;
11751	return 0;
11752}
11753
/* Opcode dispatch table for pre-evergreen (R600/R700) chips, indexed by
 * TGSI_OPCODE_*.  Each entry pairs the hardware ALU/fetch/CF opcode with
 * the emit callback that translates the TGSI instruction.  Bare numeric
 * indices are gaps left by retired TGSI opcodes; they map to
 * tgsi_unsupported so lookups stay in bounds. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
11954
11955static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
11956	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
11957	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
11958	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
11959	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11960	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
11961	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
11962	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
11963	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
11964	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
11965	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
11966	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
11967	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
11968	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
11969	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
11970	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
11971	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
11972	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
11973	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
11974	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
11975	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11976	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
11977	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
11978	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
11979	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
11980	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
11981	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
11982	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
11983	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11984	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11985	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
11986	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
11987	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
11988	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
11989	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
11990	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
11991	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
11992	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11993	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11994	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
11995	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
11996	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
11997	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
11998	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
11999	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
12000	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
12001	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
12002	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
12003	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
12004	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
12005	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
12006	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
12007	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
12008	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
12009	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
12010	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
12011	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
12012	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
12013	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
12014	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
12015	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
12016	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
12017	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
12018	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
12019	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
12020	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
12021	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
12022	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
12023	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12024	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
12025	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
12026	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
12027	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12028	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12029	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
12030	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
12031	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
12032	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
12033	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
12034	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12035	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12036	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
12037	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
12038	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
12039	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
12040	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
12041	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
12042	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
12043	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
12044	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
12045	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
12046	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
12047	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
12048	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
12049	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12050	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12051	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12052	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12053	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
12054	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12055	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
12056	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
12057	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12058	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12059	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12060	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
12061	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
12062	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
12063	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
12064	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12065	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12066	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12067	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
12068	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
12069	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
12070	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
12071	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
12072	/* Refer below for TGSI_OPCODE_DFMA */
12073	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
12074	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
12075	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
12076	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
12077	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
12078	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
12079	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
12080	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12081	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
12082	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
12083	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
12084	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
12085	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
12086	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
12087	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
12088	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
12089	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
12090	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
12091	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
12092	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
12093	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12094	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
12095	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12096	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
12097	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
12098	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
12099	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
12100	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
12101	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
12102	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
12103	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
12104	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
12105	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
12106	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
12107	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
12108	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
12109	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
12110	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
12111	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12112	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
12113	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
12114	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
12115	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
12116	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
12117	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
12118	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
12119	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
12120	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12121	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12122	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12123	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12124	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12125	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12126	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12127	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12128	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12129	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12130	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12131	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
12132	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
12133	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
12134	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
12135	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
12136	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
12137	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
12138	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
12139	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
12140	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
12141	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
12142	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
12143	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
12144	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
12145	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
12146	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12147	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12148	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
12149	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12150	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12151	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
12152	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
12153	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
12154	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
12155	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
12156	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
12157	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
12158	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12159	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12160	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12161	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12162	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12163	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12164	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12165	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
12166	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
12167	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
12168	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
12169	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12170	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12171	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12172	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12173	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12174	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
12175	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
12176	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
12177	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
12178	[TGSI_OPCODE_I64NEG]    = { ALU_OP0_NOP, egcm_i64neg },
12179	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
12180};
12181
/* TGSI opcode -> hardware instruction dispatch table for Cayman (CM).
 *
 * Each entry pairs a native opcode (ALU_OP* / FETCH_OP* / CF_OP* /
 * V_RAT_INST_*; 0 or ALU_OP0_NOP when the emit callback picks its own
 * opcodes) with the callback that translates the TGSI instruction.
 * Bare numeric indices (e.g. [21], [62], [103]) are holes in the TGSI
 * opcode numbering; most are routed to tgsi_unsupported.
 * Many transcendental/mul entries use cayman_* emitters — see the
 * "CAYMAN notes" comment at the top of this file for why.
 */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap in TGSI opcode space */
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap */
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},	/* gap, but still emitted as a resinfo query */
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]     	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]    = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap */
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},	/* gap */
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap */
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap */
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},		/* gap */
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE]    = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD]    = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL]    = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV]    = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_I64NEG]    = { ALU_OP0_NOP, egcm_i64neg },
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
12408