1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_opcodes.h"
25#include "r600_formats.h"
26#include "r600_shader.h"
27#include "r600d.h"
28
29#include <errno.h>
30#include "util/u_bitcast.h"
31#include "util/u_dump.h"
32#include "util/u_memory.h"
33#include "util/u_math.h"
34#include "pipe/p_shader_tokens.h"
35
36#include "sb/sb_public.h"
37
38#define NUM_OF_CYCLES 3
39#define NUM_OF_COMPONENTS 4
40
41static inline bool alu_writes(struct r600_bytecode_alu *alu)
42{
43	return alu->dst.write || alu->is_op3;
44}
45
46static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu)
47{
48	return r600_isa_alu(alu->op)->src_count;
49}
50
51static struct r600_bytecode_cf *r600_bytecode_cf(void)
52{
53	struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
54
55	if (!cf)
56		return NULL;
57	LIST_INITHEAD(&cf->list);
58	LIST_INITHEAD(&cf->alu);
59	LIST_INITHEAD(&cf->vtx);
60	LIST_INITHEAD(&cf->tex);
61	LIST_INITHEAD(&cf->gds);
62	return cf;
63}
64
65static struct r600_bytecode_alu *r600_bytecode_alu(void)
66{
67	struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
68
69	if (!alu)
70		return NULL;
71	LIST_INITHEAD(&alu->list);
72	return alu;
73}
74
75static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
76{
77	struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
78
79	if (!vtx)
80		return NULL;
81	LIST_INITHEAD(&vtx->list);
82	return vtx;
83}
84
85static struct r600_bytecode_tex *r600_bytecode_tex(void)
86{
87	struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
88
89	if (!tex)
90		return NULL;
91	LIST_INITHEAD(&tex->list);
92	return tex;
93}
94
95static struct r600_bytecode_gds *r600_bytecode_gds(void)
96{
97	struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds);
98
99	if (gds == NULL)
100		return NULL;
101	LIST_INITHEAD(&gds->list);
102	return gds;
103}
104
105static unsigned stack_entry_size(enum radeon_family chip) {
106	/* Wavefront size:
107	 *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
108	 *       Aruba/Sumo/Sumo2/redwood/juniper
109	 *   32: R630/R730/R710/Palm/Cedar
110	 *   16: R610/Rs780
111	 *
112	 * Stack row size:
113	 * 	Wavefront Size                        16  32  48  64
114	 * 	Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
115	 * 	Columns per Row (R9xx+)                8   4   4   4 */
116
117	switch (chip) {
118	/* FIXME: are some chips missing here? */
119	/* wavefront size 16 */
120	case CHIP_RV610:
121	case CHIP_RS780:
122	case CHIP_RV620:
123	case CHIP_RS880:
124	/* wavefront size 32 */
125	case CHIP_RV630:
126	case CHIP_RV635:
127	case CHIP_RV730:
128	case CHIP_RV710:
129	case CHIP_PALM:
130	case CHIP_CEDAR:
131		return 8;
132
133	/* wavefront size 64 */
134	default:
135		return 4;
136	}
137}
138
139void r600_bytecode_init(struct r600_bytecode *bc,
140			enum chip_class chip_class,
141			enum radeon_family family,
142			bool has_compressed_msaa_texturing)
143{
144	static unsigned next_shader_id = 0;
145
146	bc->debug_id = ++next_shader_id;
147
148	if ((chip_class == R600) &&
149	    (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) {
150		bc->ar_handling = AR_HANDLE_RV6XX;
151		bc->r6xx_nop_after_rel_dst = 1;
152	} else {
153		bc->ar_handling = AR_HANDLE_NORMAL;
154		bc->r6xx_nop_after_rel_dst = 0;
155	}
156
157	LIST_INITHEAD(&bc->cf);
158	bc->chip_class = chip_class;
159	bc->family = family;
160	bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing;
161	bc->stack.entry_size = stack_entry_size(family);
162}
163
164int r600_bytecode_add_cf(struct r600_bytecode *bc)
165{
166	struct r600_bytecode_cf *cf = r600_bytecode_cf();
167
168	if (!cf)
169		return -ENOMEM;
170	LIST_ADDTAIL(&cf->list, &bc->cf);
171	if (bc->cf_last) {
172		cf->id = bc->cf_last->id + 2;
173		if (bc->cf_last->eg_alu_extended) {
174			/* take into account extended alu size */
175			cf->id += 2;
176			bc->ndw += 2;
177		}
178	}
179	bc->cf_last = cf;
180	bc->ncf++;
181	bc->ndw += 2;
182	bc->force_add_cf = 0;
183	bc->ar_loaded = 0;
184	return 0;
185}
186
/* Add an export/output CF instruction. When the new output is compatible
 * with the last CF (same op modulo EXPORT/EXPORT_DONE, same type, sizes and
 * swizzles) and its gpr/array_base range is contiguous with it, the two are
 * merged into a single burst instead of emitting a new CF. */
int r600_bytecode_add_output(struct r600_bytecode *bc,
		const struct r600_bytecode_output *output)
{
	int r;

	/* grow the shader's GPR count to cover the exported register */
	if (output->gpr >= bc->ngpr)
		bc->ngpr = output->gpr + 1;

	/* EXPORT followed by EXPORT_DONE may still merge; the merged CF takes
	 * the new op (EXPORT_DONE) below. Bursts are capped at 16. */
	if (bc->cf_last && (bc->cf_last->op == output->op ||
		(bc->cf_last->op == CF_OP_EXPORT &&
		output->op == CF_OP_EXPORT_DONE)) &&
		output->type == bc->cf_last->output.type &&
		output->elem_size == bc->cf_last->output.elem_size &&
		output->swizzle_x == bc->cf_last->output.swizzle_x &&
		output->swizzle_y == bc->cf_last->output.swizzle_y &&
		output->swizzle_z == bc->cf_last->output.swizzle_z &&
		output->swizzle_w == bc->cf_last->output.swizzle_w &&
		output->comp_mask == bc->cf_last->output.comp_mask &&
		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {

		/* new output immediately precedes the existing burst:
		 * extend the burst downward and rebase it on the new output */
		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {

			bc->cf_last->op = bc->cf_last->output.op = output->op;
			bc->cf_last->output.gpr = output->gpr;
			bc->cf_last->output.array_base = output->array_base;
			bc->cf_last->output.burst_count += output->burst_count;
			return 0;

		/* new output immediately follows the existing burst:
		 * just extend the burst count */
		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {

			bc->cf_last->op = bc->cf_last->output.op = output->op;
			bc->cf_last->output.burst_count += output->burst_count;
			return 0;
		}
	}

	/* not mergeable: emit a new CF for this output */
	r = r600_bytecode_add_cf(bc);
	if (r)
		return r;
	bc->cf_last->op = output->op;
	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
	bc->cf_last->barrier = 1;
	return 0;
}
233
/* Queue an output to be emitted later; copies it into the pending array.
 * Always returns 0. */
int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
		const struct r600_bytecode_output *output)
{
	/* NOTE(review): `+ 1 <` leaves the last array slot unused; plain
	 * `<` would allow the full array — confirm whether intentional. */
	assert(bc->n_pending_outputs + 1 < ARRAY_SIZE(bc->pending_outputs));
	bc->pending_outputs[bc->n_pending_outputs++] = *output;

	return 0;
}
242
/* Set whether a WAIT_ACK is required before the next dependent operation. */
void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack)
{
	bc->need_wait_ack = need_wait_ack;
}
247
/* Query the WAIT_ACK flag previously set via r600_bytecode_need_wait_ack(). */
boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc)
{
	return bc->need_wait_ack;
}
252
/* alu instructions that can only exist once per group */
254static int is_alu_once_inst(struct r600_bytecode_alu *alu)
255{
256	return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER;
257}
258
259static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
260{
261	return (r600_isa_alu(alu->op)->flags & AF_REPL) &&
262			(r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V);
263}
264
265static int is_alu_mova_inst(struct r600_bytecode_alu *alu)
266{
267	return r600_isa_alu(alu->op)->flags & AF_MOVA;
268}
269
270static int alu_uses_rel(struct r600_bytecode_alu *alu)
271{
272	unsigned num_src = r600_bytecode_get_num_operands(alu);
273	unsigned src;
274
275	if (alu->dst.rel) {
276		return 1;
277	}
278
279	for (src = 0; src < num_src; ++src) {
280		if (alu->src[src].rel) {
281			return 1;
282		}
283	}
284	return 0;
285}
286
287static int is_lds_read(int sel)
288{
289  return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP;
290}
291
292static int alu_uses_lds(struct r600_bytecode_alu *alu)
293{
294	unsigned num_src = r600_bytecode_get_num_operands(alu);
295	unsigned src;
296
297	for (src = 0; src < num_src; ++src) {
298		if (is_lds_read(alu->src[src].sel)) {
299			return 1;
300		}
301	}
302	return 0;
303}
304
305static int is_alu_64bit_inst(struct r600_bytecode_alu *alu)
306{
307	const struct alu_op_info *op = r600_isa_alu(alu->op);
308	return (op->flags & AF_64);
309}
310
311static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
312{
313	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
314	return !(slots & AF_S);
315}
316
317static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
318{
319	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
320	return !(slots & AF_V);
321}
322
323/* alu instructions that can execute on any unit */
324static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
325{
326	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
327	return slots == AF_VS;
328}
329
330static int is_nop_inst(struct r600_bytecode_alu *alu)
331{
332	return alu->op == ALU_OP0_NOP;
333}
334
/* Walk one instruction group (up to the instruction marked `last`) and
 * assign each ALU instruction to a hardware slot: x/y/z/w vector units
 * (index = dst channel) plus, on non-Cayman, the trans unit in slot 4.
 * Returns 0 on success, -1 if a slot is claimed twice. */
static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
			    struct r600_bytecode_alu *assignment[5])
{
	struct r600_bytecode_alu *alu;
	unsigned i, chan, trans;
	/* Cayman has no trans unit, so only 4 slots exist there */
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	for (i = 0; i < max_slots; i++)
		assignment[i] = NULL;

	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
		chan = alu->dst.chan;
		if (max_slots == 4)
			trans = 0;
		else if (is_alu_trans_unit_inst(bc, alu))
			trans = 1;
		else if (is_alu_vec_unit_inst(bc, alu))
			trans = 0;
		else if (assignment[chan])
			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
		else
			trans = 0;

		if (trans) {
			if (assignment[4]) {
				assert(0); /* ALU.Trans has already been allocated. */
				return -1;
			}
			assignment[4] = alu;
		} else {
			if (assignment[chan]) {
				assert(0); /* ALU.chan has already been allocated. */
				return -1;
			}
			assignment[chan] = alu;
		}

		/* `last` marks the end of the instruction group */
		if (alu->last)
			break;
	}
	return 0;
}
377
/* Tracks hardware read-port usage while validating a candidate bank
 * swizzle: one GPR read port per cycle/component, plus the constant-file
 * read ports. -1 means the port is still free. */
struct alu_bank_swizzle {
	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];	/* reserved GPR sel per cycle/chan */
	int	hw_cfile_addr[4];	/* reserved cfile address per port */
	int	hw_cfile_elem[4];	/* element (chan) reserved on that port */
};
383
/* For each vector bank-swizzle value, the hardware cycle in which source
 * operand 0/1/2 is fetched. */
static const unsigned cycle_for_bank_swizzle_vec[][3] = {
	[SQ_ALU_VEC_012] = { 0, 1, 2 },
	[SQ_ALU_VEC_021] = { 0, 2, 1 },
	[SQ_ALU_VEC_120] = { 1, 2, 0 },
	[SQ_ALU_VEC_102] = { 1, 0, 2 },
	[SQ_ALU_VEC_201] = { 2, 0, 1 },
	[SQ_ALU_VEC_210] = { 2, 1, 0 }
};
392
/* Same as above, but for the scalar (trans) unit's bank-swizzle values. */
static const unsigned cycle_for_bank_swizzle_scl[][3] = {
	[SQ_ALU_SCL_210] = { 2, 1, 0 },
	[SQ_ALU_SCL_122] = { 1, 2, 2 },
	[SQ_ALU_SCL_212] = { 2, 1, 2 },
	[SQ_ALU_SCL_221] = { 2, 2, 1 }
};
399
400static void init_bank_swizzle(struct alu_bank_swizzle *bs)
401{
402	int i, cycle, component;
403	/* set up gpr use */
404	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
405		for (component = 0; component < NUM_OF_COMPONENTS; component++)
406			 bs->hw_gpr[cycle][component] = -1;
407	for (i = 0; i < 4; i++)
408		bs->hw_cfile_addr[i] = -1;
409	for (i = 0; i < 4; i++)
410		bs->hw_cfile_elem[i] = -1;
411}
412
413static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
414{
415	if (bs->hw_gpr[cycle][chan] == -1)
416		bs->hw_gpr[cycle][chan] = sel;
417	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
418		/* Another scalar operation has already used the GPR read port for the channel. */
419		return -1;
420	}
421	return 0;
422}
423
424static int reserve_cfile(const struct r600_bytecode *bc,
425			 struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
426{
427	int res, num_res = 4;
428	if (bc->chip_class >= R700) {
429		num_res = 2;
430		chan /= 2;
431	}
432	for (res = 0; res < num_res; ++res) {
433		if (bs->hw_cfile_addr[res] == -1) {
434			bs->hw_cfile_addr[res] = sel;
435			bs->hw_cfile_elem[res] = chan;
436			return 0;
437		} else if (bs->hw_cfile_addr[res] == sel &&
438			bs->hw_cfile_elem[res] == chan)
439			return 0; /* Read for this scalar element already reserved, nothing to do here. */
440	}
441	/* All cfile read ports are used, cannot reference vector element. */
442	return -1;
443}
444
/* GPR source selectors occupy the range 0..127. */
static int is_gpr(unsigned sel)
{
	return sel < 128;
}
449
/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants, is that really required? */
static int is_cfile(unsigned sel)
{
	if (sel >= 128 && sel < 192)
		return 1; /* Kcache after translation. */
	if (sel >= 256 && sel < 512)
		return 1;
	return sel >= 512 && sel < 4607; /* Kcache before translation. */
}
459
460static int is_const(int sel)
461{
462	return is_cfile(sel) ||
463		(sel >= V_SQ_ALU_SRC_0 &&
464		sel <= V_SQ_ALU_SRC_LITERAL);
465}
466
/* Validate one vector-unit instruction against a candidate bank swizzle:
 * reserve the GPR read port for each GPR source in the cycle dictated by
 * the swizzle, and a cfile port for each constant source. Returns 0 when
 * all reservations succeed, non-zero on a port conflict. */
static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
			struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, sel, elem, cycle;

	num_src = r600_bytecode_get_num_operands(alu);
	for (src = 0; src < num_src; src++) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
				/* Nothing to do; special-case optimization,
				 * second source uses first source’s reservation. */
				continue;
			else {
				r = reserve_gpr(bs, sel, elem, cycle);
				if (r)
					return r;
			}
		} else if (is_cfile(sel)) {
			/* bank goes into the high bits so that (bank, sel)
			 * pairs compare as a single key */
			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
			if (r)
				return r;
		}
		/* No restrictions on PV, PS, literal or special constants. */
	}
	return 0;
}
496
/* Validate one trans-unit (scalar) instruction against a candidate bank
 * swizzle. At most two constant reads are allowed, constants consume the
 * early fetch cycles, and GPR/PV/PS reads must land on a later cycle than
 * the constants. Returns 0 if the swizzle works, -1 otherwise. */
static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
			struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, const_count, sel, elem, cycle;

	num_src = r600_bytecode_get_num_operands(alu);
	/* first pass: count constants and reserve cfile ports */
	for (const_count = 0, src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
			if (const_count >= 2)
				/* More than two references to a constant in
				 * transcendental operation. */
				return -1;
			else
				const_count++;
		}
		if (is_cfile(sel)) {
			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
			if (r)
				return r;
		}
	}
	/* second pass: check GPR and PV/PS cycle placement */
	for (src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
			if (cycle < const_count)
				/* Cycle for GPR load conflicts with
				 * constant load in transcendental operation. */
				return -1;
			r = reserve_gpr(bs, sel, elem, cycle);
			if (r)
				return r;
		}
		/* PV PS restrictions */
		if (const_count && (sel == 254 || sel == 255)) {
			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
			if (cycle < const_count)
				return -1;
		}
	}
	return 0;
}
542
/* Find a bank-swizzle assignment for a full instruction group (slots[])
 * such that no two reads conflict on a hardware port. Tries combinations
 * exhaustively; honors per-instruction forced swizzles. Returns 0 on
 * success (swizzles written back into slots[]), -1 if none works. */
static int check_and_set_bank_swizzle(const struct r600_bytecode *bc,
				      struct r600_bytecode_alu *slots[5])
{
	struct alu_bank_swizzle bs;
	int bank_swizzle[5];
	int i, r = 0, forced = 1;
	/* Cayman has no trans unit, so there is never a scalar slot there */
	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	for (i = 0; i < max_slots; i++) {
		if (slots[i]) {
			if (slots[i]->bank_swizzle_force) {
				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
			} else {
				forced = 0;
			}
		}

		if (i < 4 && slots[i])
			scalar_only = false;
	}
	/* every used slot had a forced swizzle - nothing to search */
	if (forced)
		return 0;

	/* Just check every possible combination of bank swizzle.
	 * Not very efficient, but works on the first try in most of the cases. */
	for (i = 0; i < 4; i++)
		if (!slots[i] || !slots[i]->bank_swizzle_force)
			bank_swizzle[i] = SQ_ALU_VEC_012;
		else
			bank_swizzle[i] = slots[i]->bank_swizzle;

	bank_swizzle[4] = SQ_ALU_SCL_210;
	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {

		init_bank_swizzle(&bs);
		if (scalar_only == false) {
			for (i = 0; i < 4; i++) {
				if (slots[i]) {
					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
					if (r)
						break;
				}
			}
		} else
			r = 0;

		if (!r && max_slots == 5 && slots[4]) {
			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
		}
		if (!r) {
			/* found a conflict-free combination - commit it */
			for (i = 0; i < max_slots; i++) {
				if (slots[i])
					slots[i]->bank_swizzle = bank_swizzle[i];
			}
			return 0;
		}

		/* advance to the next combination (odometer-style) */
		if (scalar_only) {
			bank_swizzle[4]++;
		} else {
			for (i = 0; i < max_slots; i++) {
				if (!slots[i] || !slots[i]->bank_swizzle_force) {
					bank_swizzle[i]++;
					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
						break;
					else if (i < max_slots - 1)
						bank_swizzle[i] = SQ_ALU_VEC_012;
					else
						return -1;
				}
			}
		}
	}

	/* Couldn't find a working swizzle. */
	return -1;
}
621
/* Rewrite GPR reads that match the previous group's results to use the
 * PV (previous vector) / PS (previous scalar) forwarding registers,
 * freeing GPR read ports. 64-bit and relative accesses are skipped. */
static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
				  struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	int gpr[5], chan[5];
	int i, j, r, src, num_src;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	/* record which gpr/chan each slot of the previous group wrote */
	for (i = 0; i < max_slots; ++i) {
		if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) {

			if (is_alu_64bit_inst(prev[i])) {
				gpr[i] = -1;
				continue;
			}

			gpr[i] = prev[i]->dst.sel;
			/* cube writes more than PV.X */
			if (is_alu_reduction_inst(bc, prev[i]))
				chan[i] = 0;
			else
				chan[i] = prev[i]->dst.chan;
		} else
			gpr[i] = -1;
	}

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu = slots[i];
		if (!alu)
			continue;

		if (is_alu_64bit_inst(alu))
			continue;
		num_src = r600_bytecode_get_num_operands(alu);
		for (src = 0; src < num_src; ++src) {
			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
				continue;

			/* PS forwards the trans-unit result (pre-Cayman only;
			 * predication must match for the value to be valid) */
			if (bc->chip_class < CAYMAN) {
				if (alu->src[src].sel == gpr[4] &&
				    alu->src[src].chan == chan[4] &&
				    alu_prev->pred_sel == alu->pred_sel) {
					alu->src[src].sel = V_SQ_ALU_SRC_PS;
					alu->src[src].chan = 0;
					continue;
				}
			}

			/* PV.{x,y,z,w} forwards the vector-unit results */
			for (j = 0; j < 4; ++j) {
				if (alu->src[src].sel == gpr[j] &&
					alu->src[src].chan == j &&
				      alu_prev->pred_sel == alu->pred_sel) {
					alu->src[src].sel = V_SQ_ALU_SRC_PV;
					alu->src[src].chan = chan[j];
					break;
				}
			}
		}
	}

	return 0;
}
688
/* Map a 32-bit literal to a hardware inline-constant selector when one
 * exists (0, ±0.5, ±1.0, ±1 int); otherwise fall back to
 * V_SQ_ALU_SRC_LITERAL. Negative float constants reuse the positive
 * selector and toggle *neg instead (unless abs is set, which would
 * cancel the negation). */
void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg, unsigned abs)
{
	switch(value) {
	case 0:
		*sel = V_SQ_ALU_SRC_0;
		break;
	case 1:
		*sel = V_SQ_ALU_SRC_1_INT;
		break;
	case -1: /* matches 0xFFFFFFFF after conversion to uint32_t */
		*sel = V_SQ_ALU_SRC_M_1_INT;
		break;
	case 0x3F800000: /* 1.0f */
		*sel = V_SQ_ALU_SRC_1;
		break;
	case 0x3F000000: /* 0.5f */
		*sel = V_SQ_ALU_SRC_0_5;
		break;
	case 0xBF800000: /* -1.0f */
		*sel = V_SQ_ALU_SRC_1;
		*neg ^= !abs;
		break;
	case 0xBF000000: /* -0.5f */
		*sel = V_SQ_ALU_SRC_0_5;
		*neg ^= !abs;
		break;
	default:
		*sel = V_SQ_ALU_SRC_LITERAL;
		break;
	}
}
720
721/* compute how many literal are needed */
722static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu,
723				 uint32_t literal[4], unsigned *nliteral)
724{
725	unsigned num_src = r600_bytecode_get_num_operands(alu);
726	unsigned i, j;
727
728	for (i = 0; i < num_src; ++i) {
729		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
730			uint32_t value = alu->src[i].value;
731			unsigned found = 0;
732			for (j = 0; j < *nliteral; ++j) {
733				if (literal[j] == value) {
734					found = 1;
735					break;
736				}
737			}
738			if (!found) {
739				if (*nliteral >= 4)
740					return -EINVAL;
741				literal[(*nliteral)++] = value;
742			}
743		}
744	}
745	return 0;
746}
747
748static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu,
749					      uint32_t literal[4], unsigned nliteral)
750{
751	unsigned num_src = r600_bytecode_get_num_operands(alu);
752	unsigned i, j;
753
754	for (i = 0; i < num_src; ++i) {
755		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
756			uint32_t value = alu->src[i].value;
757			for (j = 0; j < nliteral; ++j) {
758				if (literal[j] == value) {
759					alu->src[i].chan = j;
760					break;
761				}
762			}
763		}
764	}
765}
766
/* Try to merge the current instruction group (slots[]) into the previous
 * one (starting at alu_prev), packing both into a single hardware group.
 * Returns 0 in all non-error cases; when merging is impossible the groups
 * are simply left as they were. On success the merged instructions are
 * re-linked at the tail of the current ALU clause. */
static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
			     struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	struct r600_bytecode_alu *result[5] = { NULL };

	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	/* predicated or once-per-group instructions block merging outright */
	for (i = 0; i < max_slots; ++i) {
		if (prev[i]) {
		      if (prev[i]->pred_sel)
			      return 0;
		      if (is_alu_once_inst(prev[i]))
			      return 0;
		}
		if (slots[i]) {
			if (slots[i]->pred_sel)
				return 0;
			if (is_alu_once_inst(slots[i]))
				return 0;
		}
	}

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu;

		if (num_once_inst > 0)
		   return 0;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral))
				return 0;
			if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral))
				return 0;
			if (is_alu_mova_inst(prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}

			/* MOVA and relative addressing cannot share a group */
			if (alu_uses_rel(prev[i])) {
				if (have_mova) {
					return 0;
				}
				have_rel = 1;
			}
			if (alu_uses_lds(prev[i]))
				return 0;

			num_once_inst += is_alu_once_inst(prev[i]);
		}
		if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral))
			return 0;

		/* Let's check used slots. */
		if (prev[i] && !slots[i]) {
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				/* Trans unit is still free try to use it. */
				if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					if (slots[i]->dst.sel == prev[i]->dst.sel &&
					    alu_writes(slots[i]) &&
					    alu_writes(prev[i]))
						return 0;

					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if(!slots[i]) {
			continue;
		} else {
			/* moving the trans result earlier must not clobber a
			 * register the new slot instruction also writes */
			if (max_slots == 5 && slots[i] && prev[4] &&
					slots[i]->dst.sel == prev[4]->dst.sel &&
					slots[i]->dst.chan == prev[4]->dst.chan &&
					alu_writes(slots[i]) &&
					alu_writes(prev[4]))
				return 0;

			result[i] = slots[i];
		}

		alu = slots[i];
		num_once_inst += is_alu_once_inst(alu);

		/* don't reschedule NOPs */
		if (is_nop_inst(alu))
			return 0;

		if (is_alu_mova_inst(alu)) {
			if (have_rel) {
				return 0;
			}
			have_mova = 1;
		}

		if (alu_uses_rel(alu)) {
			if (have_mova) {
				return 0;
			}
			have_rel = 1;
		}

		if (alu->op == ALU_OP0_SET_CF_IDX0 ||
			alu->op == ALU_OP0_SET_CF_IDX1)
			return 0; /* data hazard with MOVA */

		/* Let's check source gprs */
		num_src = r600_bytecode_get_num_operands(alu);
		for (src = 0; src < num_src; ++src) {

			/* Constants don't matter. */
			if (!is_gpr(alu->src[src].sel))
				continue;

			for (j = 0; j < max_slots; ++j) {
				if (!prev[j] || !alu_writes(prev[j]))
					continue;

				/* If it's relative then we can't determine which gpr is really used. */
				if (prev[j]->dst.chan == alu->src[src].chan &&
					(prev[j]->dst.sel == alu->src[src].sel ||
					prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions */
	for (i = 0; i < max_slots; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < max_slots; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}
953
/* we'll keep kcache sets sorted by bank & addr */
/* Reserve a kcache line (bank, line) in the per-clause kcache[] sets,
 * keeping them sorted. A set locks one line (LOCK_1) or two adjacent lines
 * (LOCK_2). Returns 0 on success, -ENOMEM when no set can cover the line. */
static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
		struct r600_bytecode_kcache *kcache,
		unsigned bank, unsigned line, unsigned index_mode)
{
	/* evergreen+ has four kcache sets per clause, r600/r700 have two */
	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;

	for (i = 0; i < kcache_banks; i++) {
		if (kcache[i].mode) {
			int d;

			if (kcache[i].bank < bank)
				continue;

			/* found the sorted insertion point before this set */
			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
					kcache[i].bank > bank) {
				/* try to insert new line */
				if (kcache[kcache_banks-1].mode) {
					/* all sets are in use */
					return -ENOMEM;
				}

				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
				kcache[i].bank = bank;
				kcache[i].addr = line;
				kcache[i].index_mode = index_mode;
				return 0;
			}

			/* distance of the requested line from this set's base */
			d = line - kcache[i].addr;

			if (d == -1) {
				kcache[i].addr--;
				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
					/* we are prepending the line to the current set,
					 * discarding the existing second line,
					 * so we'll have to insert line+2 after it */
					line += 2;
					continue;
				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
					return 0;
				} else {
					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
					return -ENOMEM;
				}
			} else if (d == 1) {
				/* line directly follows the set's base: widen to LOCK_2 */
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
				return 0;
			} else if (d == 0)
				return 0; /* line already covered by this set */
		} else { /* free kcache set - use it */
			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
			kcache[i].bank = bank;
			kcache[i].addr = line;
			kcache[i].index_mode = index_mode;
			return 0;
		}
	}
	return -ENOMEM;
}
1016
1017static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
1018		struct r600_bytecode_kcache *kcache,
1019		struct r600_bytecode_alu *alu)
1020{
1021	int i, r;
1022
1023	for (i = 0; i < 3; i++) {
1024		unsigned bank, line, sel = alu->src[i].sel, index_mode;
1025
1026		if (sel < 512)
1027			continue;
1028
1029		bank = alu->src[i].kc_bank;
1030		assert(bank < R600_MAX_HW_CONST_BUFFERS);
1031		line = (sel-512)>>4;
1032		index_mode = alu->src[i].kc_rel ? 1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE
1033
1034		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode)))
1035			return r;
1036	}
1037	return 0;
1038}
1039
/* Rewrite CB-constant selectors (>= 512) into kcache-relative selectors:
 * each locked kcache set j maps to a fixed selector window starting at
 * base[j]. Returns 0 on success, -ENOMEM on an unexpected set mode. */
static int r600_bytecode_assign_kcache_banks(
		struct r600_bytecode_alu *alu,
		struct r600_bytecode_kcache * kcache)
{
	int i, j;

	/* Alter the src operands to refer to the kcache. */
	for (i = 0; i < 3; ++i) {
		/* selector base of each kcache set's window */
		static const unsigned int base[] = {128, 160, 256, 288};
		unsigned int line, sel = alu->src[i].sel, found = 0;

		if (sel < 512)
			continue;

		sel -= 512;
		line = sel>>4;

		for (j = 0; j < 4 && !found; ++j) {
			switch (kcache[j].mode) {
			case V_SQ_CF_KCACHE_NOP:
			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
				R600_ERR("unexpected kcache line mode\n");
				return -ENOMEM;
			default:
				/* LOCK_1/LOCK_2: mode doubles as the number of
				 * lines covered by the set */
				if (kcache[j].bank == alu->src[i].kc_bank &&
						kcache[j].addr <= line &&
						line < kcache[j].addr + kcache[j].mode) {
					alu->src[i].sel = sel - (kcache[j].addr<<4);
					alu->src[i].sel += base[j];
					found=1;
			    }
			}
		}
	}
	return 0;
}
1076
/* Reserve kcache lines for an instruction within the current ALU clause.
 * If the clause's kcache sets cannot cover the instruction's constants,
 * a new clause of `type` is started and the allocation is retried there.
 * Marks the clause ALU_EXTENDED on eg+ when more than two sets or any
 * relative (indexed) kcache access is used. */
static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc,
		struct r600_bytecode_alu *alu,
		unsigned type)
{
	struct r600_bytecode_kcache kcache_sets[4];
	struct r600_bytecode_kcache *kcache = kcache_sets;
	int r;

	/* work on a scratch copy so a failed allocation leaves the
	 * current clause's sets untouched */
	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));

	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
		/* can't alloc, need to start new clause */
		if ((r = r600_bytecode_add_cf(bc))) {
			return r;
		}
		bc->cf_last->op = type;

		/* retry with the new clause */
		kcache = bc->cf_last->kcache;
		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
			/* can't alloc again- should never happen */
			return r;
		}
	} else {
		/* update kcache sets */
		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
	}

	/* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */
	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP ||
		kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) {
		if (bc->chip_class < EVERGREEN)
			return -ENOMEM;
		bc->cf_last->eg_alu_extended = 1;
	}

	return 0;
}
1115
1116static int insert_nop_r6xx(struct r600_bytecode *bc)
1117{
1118	struct r600_bytecode_alu alu;
1119	int r, i;
1120
1121	for (i = 0; i < 4; i++) {
1122		memset(&alu, 0, sizeof(alu));
1123		alu.op = ALU_OP0_NOP;
1124		alu.src[0].chan = i;
1125		alu.dst.chan = i;
1126		alu.last = (i == 3);
1127		r = r600_bytecode_add_alu(bc, &alu);
1128		if (r)
1129			return r;
1130	}
1131	return 0;
1132}
1133
1134/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1135static int load_ar_r6xx(struct r600_bytecode *bc)
1136{
1137	struct r600_bytecode_alu alu;
1138	int r;
1139
1140	if (bc->ar_loaded)
1141		return 0;
1142
1143	/* hack to avoid making MOVA the last instruction in the clause */
1144	if ((bc->cf_last->ndw>>1) >= 110)
1145		bc->force_add_cf = 1;
1146
1147	memset(&alu, 0, sizeof(alu));
1148	alu.op = ALU_OP1_MOVA_GPR_INT;
1149	alu.src[0].sel = bc->ar_reg;
1150	alu.src[0].chan = bc->ar_chan;
1151	alu.last = 1;
1152	alu.index_mode = INDEX_MODE_LOOP;
1153	r = r600_bytecode_add_alu(bc, &alu);
1154	if (r)
1155		return r;
1156
1157	/* no requirement to set uses waterfall on MOVA_GPR_INT */
1158	bc->ar_loaded = 1;
1159	return 0;
1160}
1161
1162/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1163static int load_ar(struct r600_bytecode *bc)
1164{
1165	struct r600_bytecode_alu alu;
1166	int r;
1167
1168	if (bc->ar_handling)
1169		return load_ar_r6xx(bc);
1170
1171	if (bc->ar_loaded)
1172		return 0;
1173
1174	/* hack to avoid making MOVA the last instruction in the clause */
1175	if ((bc->cf_last->ndw>>1) >= 110)
1176		bc->force_add_cf = 1;
1177
1178	memset(&alu, 0, sizeof(alu));
1179	alu.op = ALU_OP1_MOVA_INT;
1180	alu.src[0].sel = bc->ar_reg;
1181	alu.src[0].chan = bc->ar_chan;
1182	alu.last = 1;
1183	r = r600_bytecode_add_alu(bc, &alu);
1184	if (r)
1185		return r;
1186
1187	bc->cf_last->r6xx_uses_waterfall = 1;
1188	bc->ar_loaded = 1;
1189	return 0;
1190}
1191
1192int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
1193		const struct r600_bytecode_alu *alu, unsigned type)
1194{
1195	struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1196	struct r600_bytecode_alu *lalu;
1197	int i, r;
1198
1199	if (!nalu)
1200		return -ENOMEM;
1201	memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1202
1203	if (alu->is_op3) {
1204		/* will fail later since alu does not support it. */
1205		assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
1206	}
1207
1208	if (bc->cf_last != NULL && bc->cf_last->op != type) {
1209		/* check if we could add it anyway */
1210		if (bc->cf_last->op == CF_OP_ALU &&
1211			type == CF_OP_ALU_PUSH_BEFORE) {
1212			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1213				if (lalu->execute_mask) {
1214					bc->force_add_cf = 1;
1215					break;
1216				}
1217			}
1218		} else
1219			bc->force_add_cf = 1;
1220	}
1221
1222	/* cf can contains only alu or only vtx or only tex */
1223	if (bc->cf_last == NULL || bc->force_add_cf) {
1224		r = r600_bytecode_add_cf(bc);
1225		if (r) {
1226			free(nalu);
1227			return r;
1228		}
1229	}
1230	bc->cf_last->op = type;
1231
1232	/* Load index register if required */
1233	if (bc->chip_class >= EVERGREEN) {
1234		for (i = 0; i < 3; i++)
1235			if (nalu->src[i].kc_bank && nalu->src[i].kc_rel)
1236				egcm_load_index_reg(bc, 0, true);
1237	}
1238
1239	/* Check AR usage and load it if required */
1240	for (i = 0; i < 3; i++)
1241		if (nalu->src[i].rel && !bc->ar_loaded)
1242			load_ar(bc);
1243
1244	if (nalu->dst.rel && !bc->ar_loaded)
1245		load_ar(bc);
1246
1247	/* Setup the kcache for this ALU instruction. This will start a new
1248	 * ALU clause if needed. */
1249	if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1250		free(nalu);
1251		return r;
1252	}
1253
1254	if (!bc->cf_last->curr_bs_head) {
1255		bc->cf_last->curr_bs_head = nalu;
1256	}
1257	/* number of gpr == the last gpr used in any alu */
1258	for (i = 0; i < 3; i++) {
1259		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1260			bc->ngpr = nalu->src[i].sel + 1;
1261		}
1262		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1263			r600_bytecode_special_constants(nalu->src[i].value,
1264				&nalu->src[i].sel, &nalu->src[i].neg, nalu->src[i].abs);
1265	}
1266	if (nalu->dst.sel >= bc->ngpr) {
1267		bc->ngpr = nalu->dst.sel + 1;
1268	}
1269	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1270	/* each alu use 2 dwords */
1271	bc->cf_last->ndw += 2;
1272	bc->ndw += 2;
1273
1274	/* process cur ALU instructions for bank swizzle */
1275	if (nalu->last) {
1276		uint32_t literal[4];
1277		unsigned nliteral;
1278		struct r600_bytecode_alu *slots[5];
1279		int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1280		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1281		if (r)
1282			return r;
1283
1284		if (bc->cf_last->prev_bs_head) {
1285			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1286			if (r)
1287				return r;
1288		}
1289
1290		if (bc->cf_last->prev_bs_head) {
1291			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1292			if (r)
1293				return r;
1294		}
1295
1296		r = check_and_set_bank_swizzle(bc, slots);
1297		if (r)
1298			return r;
1299
1300		for (i = 0, nliteral = 0; i < max_slots; i++) {
1301			if (slots[i]) {
1302				r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral);
1303				if (r)
1304					return r;
1305			}
1306		}
1307		bc->cf_last->ndw += align(nliteral, 2);
1308
1309		/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1310		 * worst case */
1311		if ((bc->cf_last->ndw >> 1) >= 120) {
1312			bc->force_add_cf = 1;
1313		}
1314
1315		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1316		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1317		bc->cf_last->curr_bs_head = NULL;
1318	}
1319
1320	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
1321		insert_nop_r6xx(bc);
1322
1323	/* Might need to insert spill write ops after current clause */
1324	if (nalu->last && bc->n_pending_outputs) {
1325		while (bc->n_pending_outputs) {
1326			r = r600_bytecode_add_output(bc, &bc->pending_outputs[--bc->n_pending_outputs]);
1327			if (r)
1328				return r;
1329		}
1330	}
1331
1332	return 0;
1333}
1334
/* Append @alu as part of a plain ALU clause (CF_OP_ALU).
 * Convenience wrapper around r600_bytecode_add_alu_type(). */
int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
{
	return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU);
}
1339
1340static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1341{
1342	switch (bc->chip_class) {
1343	case R600:
1344		return 8;
1345
1346	case R700:
1347	case EVERGREEN:
1348	case CAYMAN:
1349		return 16;
1350
1351	default:
1352		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1353		return 8;
1354	}
1355}
1356
1357static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
1358{
1359	return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) &&
1360		 bc->cf_last->op != CF_OP_GDS &&
1361		 (bc->chip_class == CAYMAN ||
1362		  bc->cf_last->op != CF_OP_TEX));
1363}
1364
1365static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx,
1366					  bool use_tc)
1367{
1368	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1369	int r;
1370
1371	if (!nvtx)
1372		return -ENOMEM;
1373	memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1374
1375	/* Load index register if required */
1376	if (bc->chip_class >= EVERGREEN) {
1377		if (vtx->buffer_index_mode)
1378			egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false);
1379	}
1380
1381	/* cf can contains only alu or only vtx or only tex */
1382	if (bc->cf_last == NULL ||
1383	    last_inst_was_not_vtx_fetch(bc) ||
1384	    bc->force_add_cf) {
1385		r = r600_bytecode_add_cf(bc);
1386		if (r) {
1387			free(nvtx);
1388			return r;
1389		}
1390		switch (bc->chip_class) {
1391		case R600:
1392		case R700:
1393			bc->cf_last->op = CF_OP_VTX;
1394			break;
1395		case EVERGREEN:
1396			if (use_tc)
1397				bc->cf_last->op = CF_OP_TEX;
1398			else
1399				bc->cf_last->op = CF_OP_VTX;
1400			break;
1401		case CAYMAN:
1402			bc->cf_last->op = CF_OP_TEX;
1403			break;
1404		default:
1405			R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1406			free(nvtx);
1407			return -EINVAL;
1408		}
1409	}
1410	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1411	/* each fetch use 4 dwords */
1412	bc->cf_last->ndw += 4;
1413	bc->ndw += 4;
1414	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1415		bc->force_add_cf = 1;
1416
1417	bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1418	bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1419
1420	return 0;
1421}
1422
/* Append a vertex fetch using a VTX clause (TC path disabled). */
int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
{
	return r600_bytecode_add_vtx_internal(bc, vtx, false);
}
1427
/* Append a vertex fetch preferring the texture cache (TEX clause on eg+). */
int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
{
	return r600_bytecode_add_vtx_internal(bc, vtx, true);
}
1432
1433int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1434{
1435	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1436	int r;
1437
1438	if (!ntex)
1439		return -ENOMEM;
1440	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1441
1442	/* Load index register if required */
1443	if (bc->chip_class >= EVERGREEN) {
1444		if (tex->sampler_index_mode || tex->resource_index_mode)
1445			egcm_load_index_reg(bc, 1, false);
1446	}
1447
1448	/* we can't fetch data und use it as texture lookup address in the same TEX clause */
1449	if (bc->cf_last != NULL &&
1450		bc->cf_last->op == CF_OP_TEX) {
1451		struct r600_bytecode_tex *ttex;
1452		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1453			if (ttex->dst_gpr == ntex->src_gpr) {
1454				bc->force_add_cf = 1;
1455				break;
1456			}
1457		}
1458		/* slight hack to make gradients always go into same cf */
1459		if (ntex->op == FETCH_OP_SET_GRADIENTS_H)
1460			bc->force_add_cf = 1;
1461	}
1462
1463	/* cf can contains only alu or only vtx or only tex */
1464	if (bc->cf_last == NULL ||
1465		bc->cf_last->op != CF_OP_TEX ||
1466	        bc->force_add_cf) {
1467		r = r600_bytecode_add_cf(bc);
1468		if (r) {
1469			free(ntex);
1470			return r;
1471		}
1472		bc->cf_last->op = CF_OP_TEX;
1473	}
1474	if (ntex->src_gpr >= bc->ngpr) {
1475		bc->ngpr = ntex->src_gpr + 1;
1476	}
1477	if (ntex->dst_gpr >= bc->ngpr) {
1478		bc->ngpr = ntex->dst_gpr + 1;
1479	}
1480	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1481	/* each texture fetch use 4 dwords */
1482	bc->cf_last->ndw += 4;
1483	bc->ndw += 4;
1484	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1485		bc->force_add_cf = 1;
1486	return 0;
1487}
1488
1489int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds)
1490{
1491	struct r600_bytecode_gds *ngds = r600_bytecode_gds();
1492	int r;
1493
1494	if (ngds == NULL)
1495		return -ENOMEM;
1496	memcpy(ngds, gds, sizeof(struct r600_bytecode_gds));
1497
1498	if (bc->chip_class >= EVERGREEN) {
1499		if (gds->uav_index_mode)
1500			egcm_load_index_reg(bc, gds->uav_index_mode - 1, false);
1501	}
1502
1503	if (bc->cf_last == NULL ||
1504	    bc->cf_last->op != CF_OP_GDS ||
1505	    bc->force_add_cf) {
1506		r = r600_bytecode_add_cf(bc);
1507		if (r) {
1508			free(ngds);
1509			return r;
1510		}
1511		bc->cf_last->op = CF_OP_GDS;
1512	}
1513
1514	LIST_ADDTAIL(&ngds->list, &bc->cf_last->gds);
1515	bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */
1516	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1517		bc->force_add_cf = 1;
1518	return 0;
1519}
1520
1521int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
1522{
1523	int r;
1524
1525	/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
1526	if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) {
1527		bc->need_wait_ack = false;
1528		r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
1529	}
1530
1531	r = r600_bytecode_add_cf(bc);
1532	if (r)
1533		return r;
1534
1535	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1536	bc->cf_last->op = op;
1537	return 0;
1538}
1539
/* Append the cayman-style program terminator (CF_END instruction). */
int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
{
	return r600_bytecode_add_cfinst(bc, CF_OP_CF_END);
}
1544
/* Encode one vertex-fetch instruction into 4 dwords at bc->bytecode[id].
 * Common to all 3 families (r600/r700/evergreen); memory-op fetches use
 * the dedicated r700+ encoder instead. Always returns 0 on this path. */
static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
{
	if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
		return r700_bytecode_fetch_mem_build(bc, vtx, id);
	/* WORD0: opcode, buffer id, fetch type, source gpr and x-select */
	bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(r600_isa_fetch_opcode(bc->isa->hw_class, vtx->op)) |
			S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
	/* mega-fetch count exists only before cayman */
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
	id++;
	/* WORD1: destination swizzle, data format and destination gpr */
	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
	/* WORD2: fetch offset, endian swap, (eg+) buffer index mode */
	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
	if (bc->chip_class >= EVERGREEN)
		bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode);
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
	id++;
	/* WORD3: unused/reserved, always zero */
	bc->bytecode[id++] = 0;
	return 0;
}
1578
/* Encode one texture-fetch instruction into 4 dwords at bc->bytecode[id].
 * Common to all 3 families; eg+ adds sampler/resource index mode bits.
 * Always returns 0. */
static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
{
	/* WORD0: opcode, instruction modifier, resource id and source gpr */
	bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST(
					r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) |
			    EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) |
				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
	/* eg+ only: sampler/resource relative-index modes (no field macros
	 * exist for r600, hence the raw shifts) */
	if (bc->chip_class >= EVERGREEN)
		bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode);
				((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode)
	id++;
	/* WORD1: destination gpr/swizzle, LOD bias, coordinate types */
	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
	/* WORD2: texel offsets, sampler id and source swizzle */
	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
	/* WORD3: unused/reserved, always zero */
	bc->bytecode[id++] = 0;
	return 0;
}
1614
/* Encode one ALU instruction into 2 dwords at bc->bytecode[id].
 * r600 only; r700/eg variants live in r700_asm.c / eg_asm.c.
 * Always returns 0. */
static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
{
	unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op);

	/* WORD0: src0/src1 operands, index mode, predicate select, last bit.
	 * don't replace gpr by pv or ps for destination register */
	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
				S_SQ_ALU_WORD0_LAST(alu->last);

	if (alu->is_op3) {
		/* WORD1 OP3 layout: carries src2 instead of abs/omod/write-mask,
		 * so abs modifiers cannot be encoded on op3 instructions */
		assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
					S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
	} else {
		/* WORD1 OP2 layout: abs flags, write mask, output modifier,
		 * execute-mask/predicate update bits */
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
					S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
	}
	return 0;
}
1661
1662static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1663{
1664	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1665	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
1666			S_SQ_CF_WORD1_BARRIER(1) |
1667			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)|
1668			S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
1669}
1670
/* Encode one CF instruction (2 dwords) at bc->bytecode[cf->id], selecting
 * the word layout by CF kind: native (pre-encoded), ALU clause, fetch
 * clause, export, memory export, or generic control flow.
 * Common for r600/r700 - eg version in eg_asm.c. Always returns 0. */
static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
{
	unsigned id = cf->id;
	const struct cf_op_info *cfop = r600_isa_cf(cf->op);
	unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op);


	if (cf->op == CF_NATIVE) {
		/* already encoded by the caller - copy through */
		bc->bytecode[id++] = cf->isa[0];
		bc->bytecode[id++] = cf->isa[1];
	} else if (cfop->flags & CF_ALU) {
		/* ALU clause: kcache banks/addrs for sets 0/1 plus clause
		 * address and instruction count (in 2-dword slots) */
		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);

		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
					S_SQ_CF_ALU_WORD1_BARRIER(1) |
					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
	} else if (cfop->flags & CF_FETCH) {
		/* fetch clause CF word differs between r600 and r700 */
		if (bc->chip_class == R700)
			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
		else
			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
	} else if (cfop->flags & CF_EXP) {
		/* export: gpr source, target array base/type, swizzles */
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program);
	} else if (cfop->flags & CF_MEM) {
		/* memory export: like CF_EXP but with array size/comp mask
		 * instead of swizzles */
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
	} else {
		/* generic control flow: jump target, condition, pop count */
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) |
					S_SQ_CF_WORD1_BARRIER(1) |
			                S_SQ_CF_WORD1_COND(cf->cond) |
			                S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
					S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
	}
	return 0;
}
1736
/* Final assembly: two passes over the CF list. Pass 1 lays out every CF
 * block (clause start addresses, total dword count); pass 2 encodes the CF
 * words and clause contents (ALU + literals, VTX, TEX, GDS) into a freshly
 * allocated bc->bytecode. Returns 0 or a negative errno. */
int r600_bytecode_build(struct r600_bytecode *bc)
{
	struct r600_bytecode_cf *cf;
	struct r600_bytecode_alu *alu;
	struct r600_bytecode_vtx *vtx;
	struct r600_bytecode_tex *tex;
	struct r600_bytecode_gds *gds;
	uint32_t literal[4];
	unsigned nliteral;
	unsigned addr;
	int i, r;

	if (!bc->nstack) { // If not 0, Stack_size already provided by llvm
		if (bc->stack.max_entries)
			bc->nstack = bc->stack.max_entries;
		else if (bc->type == PIPE_SHADER_VERTEX ||
			 bc->type == PIPE_SHADER_TESS_EVAL ||
			 bc->type == PIPE_SHADER_TESS_CTRL)
			bc->nstack = 1;
	}

	/* first pass: compute addr of each CF block */
	/* addr start after all the CF instructions */
	addr = bc->cf_last->id + 2;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		if (r600_isa_cf(cf->op)->flags & CF_FETCH) {
			/* fetch clauses must start on a 4-dword boundary */
			addr += 3;
			addr &= 0xFFFFFFFCUL;
		}
		cf->addr = addr;
		addr += cf->ndw;
		bc->ndw = cf->addr + cf->ndw;
	}
	free(bc->bytecode);
	bc->bytecode = calloc(4, bc->ndw);
	if (bc->bytecode == NULL)
		return -ENOMEM;
	/* second pass: encode CF words and clause contents */
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		const struct cf_op_info *cfop = r600_isa_cf(cf->op);
		addr = cf->addr;
		if (bc->chip_class >= EVERGREEN)
			r = eg_bytecode_cf_build(bc, cf);
		else
			r = r600_bytecode_cf_build(bc, cf);
		if (r)
			return r;
		if (cfop->flags & CF_ALU) {
			nliteral = 0;
			memset(literal, 0, sizeof(literal));
			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
				/* collect the group's literal constants and patch
				 * literal source operands before encoding */
				r = r600_bytecode_alu_nliterals(alu, literal, &nliteral);
				if (r)
					return r;
				r600_bytecode_alu_adjust_literals(alu, literal, nliteral);
				r600_bytecode_assign_kcache_banks(alu, cf->kcache);

				switch(bc->chip_class) {
				case R600:
					r = r600_bytecode_alu_build(bc, alu, addr);
					break;
				case R700:
					r = r700_bytecode_alu_build(bc, alu, addr);
					break;
				case EVERGREEN:
				case CAYMAN:
					r = eg_bytecode_alu_build(bc, alu, addr);
					break;
				default:
					R600_ERR("unknown chip class %d.\n", bc->chip_class);
					return -EINVAL;
				}
				if (r)
					return r;
				addr += 2;
				if (alu->last) {
					/* literal dwords follow the group, padded
					 * to an even count */
					for (i = 0; i < align(nliteral, 2); ++i) {
						bc->bytecode[addr++] = literal[i];
					}
					nliteral = 0;
					memset(literal, 0, sizeof(literal));
				}
			}
		} else if (cf->op == CF_OP_VTX) {
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
		} else if (cf->op == CF_OP_GDS) {
			assert(bc->chip_class >= EVERGREEN);
			LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
				r = eg_bytecode_gds_build(bc, gds, addr);
				if (r)
					return r;
				addr += 4;
			}
		} else if (cf->op == CF_OP_TEX) {
			/* on eg+ a TEX clause may also carry vtx fetches */
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				assert(bc->chip_class >= EVERGREEN);
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
				r = r600_bytecode_tex_build(bc, tex, addr);
				if (r)
					return r;
				addr += 4;
			}
		}
	}
	return 0;
}
1852
1853void r600_bytecode_clear(struct r600_bytecode *bc)
1854{
1855	struct r600_bytecode_cf *cf = NULL, *next_cf;
1856
1857	free(bc->bytecode);
1858	bc->bytecode = NULL;
1859
1860	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1861		struct r600_bytecode_alu *alu = NULL, *next_alu;
1862		struct r600_bytecode_tex *tex = NULL, *next_tex;
1863		struct r600_bytecode_tex *vtx = NULL, *next_vtx;
1864		struct r600_bytecode_gds *gds = NULL, *next_gds;
1865
1866		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1867			free(alu);
1868		}
1869
1870		LIST_INITHEAD(&cf->alu);
1871
1872		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1873			free(tex);
1874		}
1875
1876		LIST_INITHEAD(&cf->tex);
1877
1878		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1879			free(vtx);
1880		}
1881
1882		LIST_INITHEAD(&cf->vtx);
1883
1884		LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) {
1885			free(gds);
1886		}
1887
1888		LIST_INITHEAD(&cf->gds);
1889
1890		free(cf);
1891	}
1892
1893	LIST_INITHEAD(&cf->list);
1894}
1895
/* Print one swizzle channel selector to stderr ('x','y','z','w','0','1'
 * or '_' for unused); selector 6 is invalid. Returns chars written. */
static int print_swizzle(unsigned swz)
{
	static const char chan_names[8] = {
		'x', 'y', 'z', 'w', '0', '1', '?', '_'
	};
	assert(swz < 8 && swz != 6);
	return fprintf(stderr, "%c", chan_names[swz]);
}
1902
/* Print a register/constant selector to stderr, with brackets when
 * requested or when relatively addressed, a 'G' prefix for gpr-relative
 * loop indexing, and a "+AR"/"+AL" suffix for the relative index source.
 * Returns the number of characters written. */
static int print_sel(unsigned sel, unsigned rel, unsigned index_mode,
		unsigned need_brackets)
{
	int n = 0;
	int bracketed = rel || need_brackets;

	if (rel && index_mode >= 5 && sel < 128)
		n += fprintf(stderr, "G");

	if (bracketed)
		n += fprintf(stderr, "[");

	n += fprintf(stderr, "%d", sel);

	if (rel) {
		if (index_mode == 0 || index_mode == 6)
			n += fprintf(stderr, "+AR");
		else if (index_mode == 4)
			n += fprintf(stderr, "+AL");
	}

	if (bracketed)
		n += fprintf(stderr, "]");

	return n;
}
1924
/* Print the destination operand of an ALU instruction to stderr:
 * "R<sel>.<chan>" (or "T..." for clause temporaries), "__.<chan>" when the
 * instruction writes nothing. Returns the number of characters written. */
static int print_dst(struct r600_bytecode_alu *alu)
{
	int o = 0;
	unsigned sel = alu->dst.sel;
	char reg_char = 'R';
	if (sel > 128 - 4) { /* clause temporary gpr */
		sel -= 128 - 4;
		reg_char = 'T';
	}

	if (alu_writes(alu)) {
		o += fprintf(stderr, "%c", reg_char);
		/* NOTE(review): the raw alu->dst.sel is printed even for clause
		 * temporaries; the rebased 'sel' above only selects the 'T'
		 * prefix. Looks intentional but confirm against other disasms. */
		o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0);
	} else {
		o += fprintf(stderr, "__");
	}
	o += fprintf(stderr, ".");
	o += print_swizzle(alu->dst.chan);
	return o;
}
1945
/* Print source operand @idx of an ALU instruction to stderr, decoding the
 * selector ranges: gprs (R/T), kcache windows (KC0..KC3), remapped constant
 * buffers (C<bank>), param space, and the special/inline operand encodings
 * (PV/PS, literals, LDS queues, inline constants). Negate/abs modifiers are
 * rendered as '-' and '|...|'. Returns the number of characters written.
 * NOTE: the range tests below are order-dependent; sel has already been
 * checked against the lower ranges when the >= comparisons run. */
static int print_src(struct r600_bytecode_alu *alu, unsigned idx)
{
	int o = 0;
	struct r600_bytecode_alu_src *src = &alu->src[idx];
	unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0;

	if (src->neg)
		o += fprintf(stderr,"-");
	if (src->abs)
		o += fprintf(stderr,"|");

	if (sel < 128 - 4) {
		/* regular gpr */
		o += fprintf(stderr, "R");
	} else if (sel < 128) {
		/* clause temporary gpr, rebased to 0 */
		o += fprintf(stderr, "T");
		sel -= 128 - 4;
	} else if (sel < 160) {
		o += fprintf(stderr, "KC0");
		need_brackets = 1;
		sel -= 128;
	} else if (sel < 192) {
		o += fprintf(stderr, "KC1");
		need_brackets = 1;
		sel -= 160;
	} else if (sel >= 512) {
		/* cayman-remapped constant buffer */
		o += fprintf(stderr, "C%d", src->kc_bank);
		need_brackets = 1;
		sel -= 512;
	} else if (sel >= 448) {
		o += fprintf(stderr, "Param");
		sel -= 448;
		need_chan = 0;
	} else if (sel >= 288) {
		o += fprintf(stderr, "KC3");
		need_brackets = 1;
		sel -= 288;
	} else if (sel >= 256) {
		o += fprintf(stderr, "KC2");
		need_brackets = 1;
		sel -= 256;
	} else {
		/* special operands (192..255): no numeric selector printed */
		need_sel = 0;
		need_chan = 0;
		switch (sel) {
		case EG_V_SQ_ALU_SRC_LDS_DIRECT_A:
			o += fprintf(stderr, "LDS_A[0x%08X]", src->value);
			break;
		case EG_V_SQ_ALU_SRC_LDS_DIRECT_B:
			o += fprintf(stderr, "LDS_B[0x%08X]", src->value);
			break;
		case EG_V_SQ_ALU_SRC_LDS_OQ_A:
			o += fprintf(stderr, "LDS_OQ_A");
			need_chan = 1;
			break;
		case EG_V_SQ_ALU_SRC_LDS_OQ_B:
			o += fprintf(stderr, "LDS_OQ_B");
			need_chan = 1;
			break;
		case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP:
			o += fprintf(stderr, "LDS_OQ_A_POP");
			need_chan = 1;
			break;
		case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP:
			o += fprintf(stderr, "LDS_OQ_B_POP");
			need_chan = 1;
			break;
		case EG_V_SQ_ALU_SRC_TIME_LO:
			o += fprintf(stderr, "TIME_LO");
			break;
		case EG_V_SQ_ALU_SRC_TIME_HI:
			o += fprintf(stderr, "TIME_HI");
			break;
		case EG_V_SQ_ALU_SRC_SE_ID:
			o += fprintf(stderr, "SE_ID");
			break;
		case EG_V_SQ_ALU_SRC_SIMD_ID:
			o += fprintf(stderr, "SIMD_ID");
			break;
		case EG_V_SQ_ALU_SRC_HW_WAVE_ID:
			o += fprintf(stderr, "HW_WAVE_ID");
			break;
		case V_SQ_ALU_SRC_PS:
			o += fprintf(stderr, "PS");
			break;
		case V_SQ_ALU_SRC_PV:
			o += fprintf(stderr, "PV");
			need_chan = 1;
			break;
		case V_SQ_ALU_SRC_LITERAL:
			/* show both raw bits and the float interpretation */
			o += fprintf(stderr, "[0x%08X %f]", src->value, u_bitcast_u2f(src->value));
			break;
		case V_SQ_ALU_SRC_0_5:
			o += fprintf(stderr, "0.5");
			break;
		case V_SQ_ALU_SRC_M_1_INT:
			o += fprintf(stderr, "-1");
			break;
		case V_SQ_ALU_SRC_1_INT:
			o += fprintf(stderr, "1");
			break;
		case V_SQ_ALU_SRC_1:
			o += fprintf(stderr, "1.0");
			break;
		case V_SQ_ALU_SRC_0:
			o += fprintf(stderr, "0");
			break;
		default:
			o += fprintf(stderr, "??IMM_%d", sel);
			break;
		}
	}

	if (need_sel)
		o += print_sel(sel, src->rel, alu->index_mode, need_brackets);

	if (need_chan) {
		o += fprintf(stderr, ".");
		o += print_swizzle(src->chan);
	}

	if (src->abs)
		o += fprintf(stderr,"|");

	return o;
}
2071
/* Pad stderr output with spaces from column @p up to column @c
 * (no-op when already past it). Returns the number of spaces printed. */
static int print_indent(int p, int c)
{
	int printed = 0;

	for (; p < c; p++)
		printed += fprintf(stderr, " ");
	return printed;
}
2079
/*
 * Dump a human-readable disassembly of the whole bytecode stream to
 * stderr: a header line, then one line per CF instruction followed by
 * the contents of its clause, if any (ALU instruction groups with their
 * literal constants, or TEX/VTX/GDS fetch instructions).
 * Debug-output only; makes no changes to 'bc'.
 */
void r600_bytecode_disasm(struct r600_bytecode *bc)
{
	const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"};
	/* Running shader counter shared across all dumps in the process
	 * (intentionally static; debug output only, not thread-safe). */
	static int index = 0;
	struct r600_bytecode_cf *cf = NULL;
	struct r600_bytecode_alu *alu = NULL;
	struct r600_bytecode_vtx *vtx = NULL;
	struct r600_bytecode_tex *tex = NULL;
	struct r600_bytecode_gds *gds = NULL;

	unsigned i, id, ngr = 0, last;
	uint32_t literal[4];
	unsigned nliteral;
	/* Single-letter tag identifying the GPU generation in the header. */
	char chip = '6';

	switch (bc->chip_class) {
	case R700:
		chip = '7';
		break;
	case EVERGREEN:
		chip = 'E';
		break;
	case CAYMAN:
		chip = 'C';
		break;
	case R600:
	default:
		chip = '6';
		break;
	}
	fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
	        bc->ndw, bc->ngpr, bc->nstack);
	fprintf(stderr, "shader %d -- %c\n", index++, chip);

	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		id = cf->id;
		if (cf->op == CF_NATIVE) {
			/* Pre-encoded CF words: print raw, no decoding. */
			fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id],
					bc->bytecode[id + 1]);
		} else {
			const struct cf_op_info *cfop = r600_isa_cf(cf->op);
			if (cfop->flags & CF_ALU) {
				/* ALU clause CF word: optional extended word pair
				 * first, then size/address and the kcache (constant
				 * cache) line bindings. */
				if (cf->eg_alu_extended) {
					fprintf(stderr, "%04d %08X %08X  %s\n", id, bc->bytecode[id],
							bc->bytecode[id + 1], "ALU_EXT");
					id += 2;
				}
				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr);
				for (i = 0; i < 4; ++i) {
					if (cf->kcache[i].mode) {
						/* kcache addr/mode are in units of 16
						 * constants (<< 4). */
						int c_start = (cf->kcache[i].addr << 4);
						int c_end = c_start + (cf->kcache[i].mode << 4);
						fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ",
						        i, cf->kcache[i].bank, c_start, c_end,
						        cf->kcache[i].index_mode ? " " : "",
						        cf->kcache[i].index_mode ? index_mode[cf->kcache[i].index_mode] : "");
					}
				}
				fprintf(stderr, "\n");
			} else if (cfop->flags & CF_FETCH) {
				/* TEX/VTX fetch clause CF word (4 dwords per fetch). */
				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
				if (cf->vpm)
					fprintf(stderr, "VPM ");
				if (cf->end_of_program)
					fprintf(stderr, "EOP ");
				fprintf(stderr, "\n");

			} else if (cfop->flags & CF_EXP) {
				/* Export CF word: destination type, array base range,
				 * source GPR(s) and swizzle. */
				int o = 0;
				const char *exp_type[] = {"PIXEL", "POS  ", "PARAM"};
				o += fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				o += print_indent(o, 43);
				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
				if (cf->output.burst_count > 1) {
					/* Burst exports cover consecutive array slots and
					 * consecutive GPRs. */
					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
							cf->output.array_base + cf->output.burst_count - 1);

					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
							cf->output.gpr + cf->output.burst_count - 1);
				} else {
					o += fprintf(stderr, "%d ", cf->output.array_base);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d.", cf->output.gpr);
				}

				o += print_swizzle(cf->output.swizzle_x);
				o += print_swizzle(cf->output.swizzle_y);
				o += print_swizzle(cf->output.swizzle_z);
				o += print_swizzle(cf->output.swizzle_w);

				print_indent(o, 67);

				fprintf(stderr, " ES:%X ", cf->output.elem_size);
				if (cf->mark)
					fprintf(stderr, "MARK ");
				if (!cf->barrier)
					fprintf(stderr, "NO_BARRIER ");
				if (cf->end_of_program)
					fprintf(stderr, "EOP ");
				fprintf(stderr, "\n");
			} else if (r600_isa_cf(cf->op)->flags & CF_MEM) {
				/* Memory write CF word (scratch/stream/RAT exports):
				 * like CF_EXP but with a component mask instead of a
				 * full swizzle, plus optional RAT/index info. */
				int o = 0;
				const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK",
						"WRITE_IND_ACK"};
				o += fprintf(stderr, "%04d %08X %08X  %s ", id,
						bc->bytecode[id], bc->bytecode[id + 1], cfop->name);
				o += print_indent(o, 43);
				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);

				if (r600_isa_cf(cf->op)->flags & CF_RAT) {
					o += fprintf(stderr, "RAT%d", cf->rat.id);
					if (cf->rat.index_mode) {
						o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1);
					}
					o += fprintf(stderr, " INST: %d ", cf->rat.inst);
				}

				if (cf->output.burst_count > 1) {
					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
							cf->output.array_base + cf->output.burst_count - 1);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
							cf->output.gpr + cf->output.burst_count - 1);
				} else {
					o += fprintf(stderr, "%d ", cf->output.array_base);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d.", cf->output.gpr);
				}
				/* Masked-out components print as '_' (swizzle 7). */
				for (i = 0; i < 4; ++i) {
					if (cf->output.comp_mask & (1 << i))
						o += print_swizzle(i);
					else
						o += print_swizzle(7);
				}

				if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND ||
				    cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND)
					o += fprintf(stderr, " R%d", cf->output.index_gpr);

				o += print_indent(o, 67);

				fprintf(stderr, " ES:%i ", cf->output.elem_size);
				if (cf->output.array_size != 0xFFF)
					fprintf(stderr, "AS:%i ", cf->output.array_size);
				if (cf->mark)
					fprintf(stderr, "MARK ");
				if (!cf->barrier)
					fprintf(stderr, "NO_BARRIER ");
				if (cf->end_of_program)
					fprintf(stderr, "EOP ");

				if (cf->output.mark)
					fprintf(stderr, "MARK ");

				fprintf(stderr, "\n");
			} else {
				/* Flow-control CF word (jump/loop/call/emit/...). */
				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "@%d ", cf->cf_addr);
				if (cf->cond)
					fprintf(stderr, "CND:%X ", cf->cond);
				if (cf->pop_count)
					fprintf(stderr, "POP:%X ", cf->pop_count);
				if (cf->count && (cfop->flags & CF_EMIT))
					fprintf(stderr, "STREAM%d ", cf->count);
				if (cf->vpm)
					fprintf(stderr, "VPM ");
				if (cf->end_of_program)
					fprintf(stderr, "EOP ");
				fprintf(stderr, "\n");
			}
		}

		/* Now walk the clause contents starting at the clause address. */
		id = cf->addr;
		nliteral = 0;
		last = 1;
		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
			const char *omod_str[] = {"","*2","*4","/2"};
			const struct alu_op_info *aop = r600_isa_alu(alu->op);
			int o = 0;

			/* Collect the literal constants referenced by this group
			 * so they can be printed after the last instruction. */
			r600_bytecode_alu_nliterals(alu, literal, &nliteral);
			o += fprintf(stderr, " %04d %08X %08X  ", id, bc->bytecode[id], bc->bytecode[id+1]);
			/* Number each new instruction group (first slot after a
			 * 'last' instruction starts a group). */
			if (last)
				o += fprintf(stderr, "%4d ", ++ngr);
			else
				o += fprintf(stderr, "     ");
			o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ',
					alu->update_pred ? 'P':' ',
					alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' ');

			o += fprintf(stderr, "%s%s%s ", aop->name,
					omod_str[alu->omod], alu->dst.clamp ? "_sat":"");

			o += print_indent(o,60);
			o += print_dst(alu);
			for (i = 0; i < aop->src_count; ++i) {
				o += fprintf(stderr, i == 0 ? ",  ": ", ");
				o += print_src(alu, i);
			}

			if (alu->bank_swizzle) {
				o += print_indent(o,75);
				o += fprintf(stderr, "  BS:%d", alu->bank_swizzle);
			}

			fprintf(stderr, "\n");
			id += 2;

			if (alu->last) {
				/* End of the group: dump its literal dwords, shown
				 * both as float and as raw integer. */
				for (i = 0; i < nliteral; i++, id++) {
					float *f = (float*)(bc->bytecode + id);
					o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]);
					print_indent(o, 60);
					fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id));
				}
				/* Literals are padded to an even number of dwords. */
				id += nliteral & 1;
				nliteral = 0;
			}
			last = alu->last;
		}

		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
			/* Texture fetch: dst/src GPR+swizzle, resource/sampler
			 * ids and the various sampling modifiers. */
			int o = 0;
			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
					bc->bytecode[id + 1], bc->bytecode[id + 2]);

			o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name);

			o += print_indent(o, 50);

			o += fprintf(stderr, "R%d.", tex->dst_gpr);
			o += print_swizzle(tex->dst_sel_x);
			o += print_swizzle(tex->dst_sel_y);
			o += print_swizzle(tex->dst_sel_z);
			o += print_swizzle(tex->dst_sel_w);

			o += fprintf(stderr, ", R%d.", tex->src_gpr);
			o += print_swizzle(tex->src_sel_x);
			o += print_swizzle(tex->src_sel_y);
			o += print_swizzle(tex->src_sel_z);
			o += print_swizzle(tex->src_sel_w);

			o += fprintf(stderr, ",  RID:%d", tex->resource_id);
			o += fprintf(stderr, ", SID:%d  ", tex->sampler_id);

			if (tex->sampler_index_mode)
				fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]);

			if (tex->lod_bias)
				fprintf(stderr, "LB:%d ", tex->lod_bias);

			/* Coordinate type per channel: N=normalized, U=unnormalized. */
			fprintf(stderr, "CT:%c%c%c%c ",
					tex->coord_type_x ? 'N' : 'U',
					tex->coord_type_y ? 'N' : 'U',
					tex->coord_type_z ? 'N' : 'U',
					tex->coord_type_w ? 'N' : 'U');

			if (tex->offset_x)
				fprintf(stderr, "OX:%d ", tex->offset_x);
			if (tex->offset_y)
				fprintf(stderr, "OY:%d ", tex->offset_y);
			if (tex->offset_z)
				fprintf(stderr, "OZ:%d ", tex->offset_z);

			id += 4;
			fprintf(stderr, "\n");
		}

		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
			/* Vertex/memory fetch instruction. */
			int o = 0;
			const char * fetch_type[] = {"VERTEX", "INSTANCE", ""};
			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
					bc->bytecode[id + 1], bc->bytecode[id + 2]);

			o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name);

			o += print_indent(o, 50);

			o += fprintf(stderr, "R%d.", vtx->dst_gpr);
			o += print_swizzle(vtx->dst_sel_x);
			o += print_swizzle(vtx->dst_sel_y);
			o += print_swizzle(vtx->dst_sel_z);
			o += print_swizzle(vtx->dst_sel_w);

			o += fprintf(stderr, ", R%d.", vtx->src_gpr);
			o += print_swizzle(vtx->src_sel_x);
			/* Only memory fetches use a second source component. */
			if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
				o += print_swizzle(vtx->src_sel_y);

			if (vtx->offset)
				fprintf(stderr, " +%db", vtx->offset);

			o += print_indent(o, 55);

			fprintf(stderr, ",  RID:%d ", vtx->buffer_id);

			fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]);

			if (bc->chip_class < CAYMAN && vtx->mega_fetch_count)
				fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count);

			if (bc->chip_class >= EVERGREEN && vtx->buffer_index_mode)
				fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]);

			if (r600_isa_fetch(vtx->op)->flags & FF_MEM) {
				if (vtx->uncached)
					fprintf(stderr, "UNCACHED ");
				if (vtx->indexed)
					fprintf(stderr, "INDEXED:%d ", vtx->indexed);

				fprintf(stderr, "ELEM_SIZE:%d ", vtx->elem_size);
				if (vtx->burst_count)
					fprintf(stderr, "BURST_COUNT:%d ", vtx->burst_count);
				fprintf(stderr, "ARRAY_BASE:%d ", vtx->array_base);
				fprintf(stderr, "ARRAY_SIZE:%d ", vtx->array_size);
			}

			fprintf(stderr, "UCF:%d ", vtx->use_const_fields);
			fprintf(stderr, "FMT(DTA:%d ", vtx->data_format);
			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);

			id += 4;
		}

		LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
			/* GDS (global data share) instruction; TF_WRITE has no
			 * destination GPR or second source. */
			int o = 0;
			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
					bc->bytecode[id + 1], bc->bytecode[id + 2]);

			o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name);

			if (gds->op != FETCH_OP_TF_WRITE) {
				o += fprintf(stderr, "R%d.", gds->dst_gpr);
				o += print_swizzle(gds->dst_sel_x);
				o += print_swizzle(gds->dst_sel_y);
				o += print_swizzle(gds->dst_sel_z);
				o += print_swizzle(gds->dst_sel_w);
			}

			o += fprintf(stderr, ", R%d.", gds->src_gpr);
			o += print_swizzle(gds->src_sel_x);
			o += print_swizzle(gds->src_sel_y);
			o += print_swizzle(gds->src_sel_z);

			if (gds->op != FETCH_OP_TF_WRITE) {
				o += fprintf(stderr, ", R%d.", gds->src_gpr2);
			}
			if (gds->alloc_consume) {
				o += fprintf(stderr, " UAV: %d", gds->uav_id);
				if (gds->uav_index_mode)
					o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]);
			}
			fprintf(stderr, "\n");
			id += 4;
		}
	}

	fprintf(stderr, "--------------------------------------\n");
}
2448
/*
 * Translate a gallium vertex format into the hardware vertex-fetch
 * encoding.
 *
 * Outputs:
 *  *format      - FMT_* hardware data format (left at 0 if unsupported)
 *  *num_format  - 0: normalized, 1: pure integer, 2: scaled (non-normalized)
 *  *format_comp - 1 when components are signed, 0 otherwise
 *  *endian      - byte-swap code (per-element size dependent)
 *
 * On an unsupported format an error is logged but the caller is not
 * otherwise notified (the function returns void), so *format stays 0.
 */
void r600_vertex_data_type(enum pipe_format pformat,
				  unsigned *format,
				  unsigned *num_format, unsigned *format_comp, unsigned *endian)
{
	const struct util_format_description *desc;
	unsigned i;

	*format = 0;
	*num_format = 0;
	*format_comp = 0;
	*endian = ENDIAN_NONE;

	/* Packed formats that the generic per-channel path below cannot
	 * describe are special-cased up front. */
	if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) {
		*format = FMT_10_11_11_FLOAT;
		*endian = r600_endian_swap(32);
		return;
	}

	if (pformat == PIPE_FORMAT_B5G6R5_UNORM) {
		*format = FMT_5_6_5;
		*endian = r600_endian_swap(16);
		return;
	}

	if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) {
		*format = FMT_1_5_5_5;
		*endian = r600_endian_swap(16);
		return;
	}

	if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) {
		*format = FMT_5_5_5_1;
		return;
	}

	desc = util_format_description(pformat);
	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
		goto out_unknown;
	}

	/* Find the first non-VOID channel. */
	for (i = 0; i < 4; i++) {
		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
			break;
		}
	}

	*endian = r600_endian_swap(desc->channel[i].size);

	switch (desc->channel[i].type) {
	/* Half-floats, floats, ints */
	case UTIL_FORMAT_TYPE_FLOAT:
		switch (desc->channel[i].size) {
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16_FLOAT;
				break;
			case 2:
				*format = FMT_16_16_FLOAT;
				break;
			case 3:
				/* fallthrough: 3-component 16-bit uses the
				 * 4-component format */
			case 4:
				*format = FMT_16_16_16_16_FLOAT;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32_FLOAT;
				break;
			case 2:
				*format = FMT_32_32_FLOAT;
				break;
			case 3:
				*format = FMT_32_32_32_FLOAT;
				break;
			case 4:
				*format = FMT_32_32_32_32_FLOAT;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
		/* Unsigned ints */
	case UTIL_FORMAT_TYPE_UNSIGNED:
		/* Signed ints */
	case UTIL_FORMAT_TYPE_SIGNED:
		switch (desc->channel[i].size) {
		case 4:
			switch (desc->nr_channels) {
			case 2:
				*format = FMT_4_4;
				break;
			case 4:
				*format = FMT_4_4_4_4;
				break;
			}
			break;
		case 8:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_8;
				break;
			case 2:
				*format = FMT_8_8;
				break;
			case 3:
				/* fallthrough: 3-component 8-bit uses the
				 * 4-component format */
			case 4:
				*format = FMT_8_8_8_8;
				break;
			}
			break;
		case 10:
			if (desc->nr_channels != 4)
				goto out_unknown;

			*format = FMT_2_10_10_10;
			break;
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16;
				break;
			case 2:
				*format = FMT_16_16;
				break;
			case 3:
				/* fallthrough: 3-component 16-bit uses the
				 * 4-component format */
			case 4:
				*format = FMT_16_16_16_16;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32;
				break;
			case 2:
				*format = FMT_32_32;
				break;
			case 3:
				*format = FMT_32_32_32;
				break;
			case 4:
				*format = FMT_32_32_32_32;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
	default:
		goto out_unknown;
	}

	/* Signed component flag. */
	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
		*format_comp = 1;
	}

	/* Number format: normalized (0) unless the channel is
	 * non-normalized, in which case pure integers get 1 and scaled
	 * values get 2. */
	*num_format = 0;
	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
		if (!desc->channel[i].normalized) {
			if (desc->channel[i].pure_integer)
				*num_format = 1;
			else
				*num_format = 2;
		}
	}
	return;
out_unknown:
	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
}
2627
2628void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
2629				      unsigned count,
2630				      const struct pipe_vertex_element *elements)
2631{
2632	struct r600_context *rctx = (struct r600_context *)ctx;
2633	struct r600_bytecode bc;
2634	struct r600_bytecode_vtx vtx;
2635	const struct util_format_description *desc;
2636	unsigned fetch_resource_start = rctx->b.chip_class >= EVERGREEN ? 0 : 160;
2637	unsigned format, num_format, format_comp, endian;
2638	uint32_t *bytecode;
2639	int i, j, r, fs_size;
2640	struct r600_fetch_shader *shader;
2641	unsigned no_sb = rctx->screen->b.debug_flags & DBG_NO_SB;
2642	unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
2643
2644	assert(count < 32);
2645
2646	memset(&bc, 0, sizeof(bc));
2647	r600_bytecode_init(&bc, rctx->b.chip_class, rctx->b.family,
2648			   rctx->screen->has_compressed_msaa_texturing);
2649
2650	bc.isa = rctx->isa;
2651
2652	for (i = 0; i < count; i++) {
2653		if (elements[i].instance_divisor > 1) {
2654			if (rctx->b.chip_class == CAYMAN) {
2655				for (j = 0; j < 4; j++) {
2656					struct r600_bytecode_alu alu;
2657					memset(&alu, 0, sizeof(alu));
2658					alu.op = ALU_OP2_MULHI_UINT;
2659					alu.src[0].sel = 0;
2660					alu.src[0].chan = 3;
2661					alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2662					alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2663					alu.dst.sel = i + 1;
2664					alu.dst.chan = j;
2665					alu.dst.write = j == 3;
2666					alu.last = j == 3;
2667					if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2668						r600_bytecode_clear(&bc);
2669						return NULL;
2670					}
2671				}
2672			} else {
2673				struct r600_bytecode_alu alu;
2674				memset(&alu, 0, sizeof(alu));
2675				alu.op = ALU_OP2_MULHI_UINT;
2676				alu.src[0].sel = 0;
2677				alu.src[0].chan = 3;
2678				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2679				alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2680				alu.dst.sel = i + 1;
2681				alu.dst.chan = 3;
2682				alu.dst.write = 1;
2683				alu.last = 1;
2684				if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2685					r600_bytecode_clear(&bc);
2686					return NULL;
2687				}
2688			}
2689		}
2690	}
2691
2692	for (i = 0; i < count; i++) {
2693		r600_vertex_data_type(elements[i].src_format,
2694				      &format, &num_format, &format_comp, &endian);
2695
2696		desc = util_format_description(elements[i].src_format);
2697		if (!desc) {
2698			r600_bytecode_clear(&bc);
2699			R600_ERR("unknown format %d\n", elements[i].src_format);
2700			return NULL;
2701		}
2702
2703		if (elements[i].src_offset > 65535) {
2704			r600_bytecode_clear(&bc);
2705			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
2706			return NULL;
2707		}
2708
2709		memset(&vtx, 0, sizeof(vtx));
2710		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
2711		vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
2712		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2713		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2714		vtx.mega_fetch_count = 0x1F;
2715		vtx.dst_gpr = i + 1;
2716		vtx.dst_sel_x = desc->swizzle[0];
2717		vtx.dst_sel_y = desc->swizzle[1];
2718		vtx.dst_sel_z = desc->swizzle[2];
2719		vtx.dst_sel_w = desc->swizzle[3];
2720		vtx.data_format = format;
2721		vtx.num_format_all = num_format;
2722		vtx.format_comp_all = format_comp;
2723		vtx.offset = elements[i].src_offset;
2724		vtx.endian = endian;
2725
2726		if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2727			r600_bytecode_clear(&bc);
2728			return NULL;
2729		}
2730	}
2731
2732	r600_bytecode_add_cfinst(&bc, CF_OP_RET);
2733
2734	if ((r = r600_bytecode_build(&bc))) {
2735		r600_bytecode_clear(&bc);
2736		return NULL;
2737	}
2738
2739	if (rctx->screen->b.debug_flags & DBG_FS) {
2740		fprintf(stderr, "--------------------------------------------------------------\n");
2741		fprintf(stderr, "Vertex elements state:\n");
2742		for (i = 0; i < count; i++) {
2743			fprintf(stderr, "   ");
2744			util_dump_vertex_element(stderr, elements+i);
2745			fprintf(stderr, "\n");
2746		}
2747
2748		if (!sb_disasm) {
2749			r600_bytecode_disasm(&bc);
2750
2751			fprintf(stderr, "______________________________________________________________\n");
2752		} else {
2753			r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/);
2754		}
2755	}
2756
2757	fs_size = bc.ndw*4;
2758
2759	/* Allocate the CSO. */
2760	shader = CALLOC_STRUCT(r600_fetch_shader);
2761	if (!shader) {
2762		r600_bytecode_clear(&bc);
2763		return NULL;
2764	}
2765
2766	u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256,
2767			     &shader->offset,
2768			     (struct pipe_resource**)&shader->buffer);
2769	if (!shader->buffer) {
2770		r600_bytecode_clear(&bc);
2771		FREE(shader);
2772		return NULL;
2773	}
2774
2775	bytecode = r600_buffer_map_sync_with_rings
2776		(&rctx->b, shader->buffer,
2777		PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
2778	bytecode += shader->offset / 4;
2779
2780	if (R600_BIG_ENDIAN) {
2781		for (i = 0; i < fs_size / 4; ++i) {
2782			bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
2783		}
2784	} else {
2785		memcpy(bytecode, bc.bytecode, fs_size);
2786	}
2787	rctx->b.ws->buffer_unmap(shader->buffer->buf);
2788
2789	r600_bytecode_clear(&bc);
2790	return shader;
2791}
2792
/*
 * Decode a pair of ALU instruction dwords back into a
 * r600_bytecode_alu structure (the reverse of bytecode emission).
 *
 * The WORD1 encoding field distinguishes OP2 from OP3 instructions;
 * only the OP2 branch fills in src abs bits, omod, the destination
 * write mask and the predicate-update/execute-mask flags, so those
 * fields stay zero for OP3 instructions.  'bc->isa' is used to map the
 * hardware opcode back to the driver's opcode enum.
 */
void r600_bytecode_alu_read(struct r600_bytecode *bc,
		struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1)
{
	/* WORD0 */
	alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0);
	alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0);
	alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0);
	alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0);
	alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0);
	alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0);
	alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0);
	alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0);
	alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0);
	alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0);
	alu->last = G_SQ_ALU_WORD0_LAST(word0);

	/* WORD1 */
	alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1);
	/* Preserve a non-default bank swizzle across any re-encoding. */
	if (alu->bank_swizzle)
		alu->bank_swizzle_force = alu->bank_swizzle;
	alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1);
	alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1);
	alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1);
	alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1);
	if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/
	{
		alu->is_op3 = 1;
		alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1);
		alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1);
		alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1);
		alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1);
		alu->op = r600_isa_alu_by_opcode(bc->isa,
				G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1);

	}
	else /*ALU_DWORD1_OP2*/
	{
		alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1);
		alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1);
		alu->op = r600_isa_alu_by_opcode(bc->isa,
				G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0);
		alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1);
		alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1);
		alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1);
		alu->execute_mask =
			G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1);
	}
}
2841
#if 0
/*
 * Decode a CF export dword pair back into a r600_bytecode_output.
 * NOTE(review): compiled out -- presumably kept for reference or a
 * future bytecode reader; confirm before removing.
 */
void r600_bytecode_export_read(struct r600_bytecode *bc,
		struct r600_bytecode_output *output, uint32_t word0, uint32_t word1)
{
	output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0);
	output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0);
	output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0);
	output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0);

	output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1);
	output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1);
	output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1);
	output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1);
	output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1);
	output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1);
    output->op = r600_isa_cf_by_opcode(bc->isa,
			G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0);
	output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1);
	output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1);
	output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1);
}
#endif
2864