/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define BCP_DEBUG 0

#if BCP_DEBUG
#define BCP_DUMP(q) do { q } while (0)
#else
#define BCP_DUMP(q)
#endif

#include "r600_pipe.h"
#include "r600_shader.h"
#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1

#include <stack>

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "util/macros.h"

namespace r600_sb {

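/* Decode the raw r600/eg bytecode into the SB IR: pick the shader
 * target from the pipe shader type, create the shader object, then
 * walk the CF program via decode_shader(). */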
int bc_parser::decode() {

	dw = bc->bytecode;
	bc_ndw = bc->ndw;
	max_cf = 0;

	dec = new bc_decoder(ctx, dw, bc_ndw);

	shader_target t = TARGET_UNKNOWN;

	if (pshader) {
		switch (bc->type) {
		case PIPE_SHADER_FRAGMENT: t = TARGET_PS; break;
		case PIPE_SHADER_VERTEX:
			t = pshader->vs_as_ls ? TARGET_LS : (pshader->vs_as_es ? TARGET_ES : TARGET_VS);
			break;
		case PIPE_SHADER_GEOMETRY: t = TARGET_GS; break;
		case PIPE_SHADER_COMPUTE: t = TARGET_COMPUTE; break;
		case PIPE_SHADER_TESS_CTRL: t = TARGET_HS; break;
		case PIPE_SHADER_TESS_EVAL: t = pshader->tes_as_es ? TARGET_ES : TARGET_VS; break;
		default: assert(!"unknown shader target"); return -1; break;
		}
	} else {
		if (bc->type == PIPE_SHADER_COMPUTE)
			t = TARGET_COMPUTE;
		else
			t = TARGET_FETCH;
	}

	sh = new shader(ctx, t, bc->debug_id);
	sh->safe_math = sb_context::safe_math || (t == TARGET_COMPUTE || bc->precise);

	int r = decode_shader();

	delete dec;

	sh->ngpr = bc->ngpr;
	sh->nstack = bc->nstack;

	return r;
}

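/* Decode CF instructions (two dwords each) until an end-of-program
 * instruction is seen and no branch target recorded in max_cf lies
 * beyond the current position. */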
int bc_parser::decode_shader() {
	int r = 0;
	unsigned i = 0;
	bool eop = false;

	sh->init();

	do {
		eop = false;
		if ((r = decode_cf(i, eop)))
			return r;

		// continue past an end-of-program CF while a branch targets a
		// later address, including the CF at max_cf itself
	} while (!eop || (i >> 1) <= max_cf);

	return 0;
}

int bc_parser::prepare() {
	int r = 0;
	if ((r = parse_decls()))
		return r;
	if ((r = prepare_ir()))
		return r;
	return 0;
}

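/* Translate the declarations of the pipe shader (inputs, relative
 * addressing, PS interpolators) into SB inputs and GPR arrays.
 * Without a pipe shader (fetch/compute), a conservative array
 * covering all GPRs is added when relative addressing is used. */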
int bc_parser::parse_decls() {

	if (!pshader) {
		if (gpr_reladdr)
			sh->add_gpr_array(0, bc->ngpr, 0x0F);

		// compute shaders have some values preloaded in R0, R1
		sh->add_input(0 /* GPR */, true /* preloaded */, 0x0F /* mask */);
		sh->add_input(1 /* GPR */, true /* preloaded */, 0x0F /* mask */);
		return 0;
	}

	if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) {

		assert(pshader->num_arrays);

		if (pshader->num_arrays) {
			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
				r600_shader_array &a = pshader->arrays[i];
				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
			}
		} else {
			sh->add_gpr_array(0, pshader->bc.ngpr, 0x0F);
		}
	}

	// GS inputs can add indirect addressing
	if (sh->target == TARGET_GS) {
		if (pshader->num_arrays) {
			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
				r600_shader_array &a = pshader->arrays[i];
				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
			}
		}
	}

	if (sh->target == TARGET_VS || sh->target == TARGET_ES || sh->target == TARGET_HS || sh->target == TARGET_LS)
		sh->add_input(0, 1, 0x0F);
	else if (sh->target == TARGET_GS) {
		sh->add_input(0, 1, 0x0F);
		sh->add_input(1, 1, 0x0F);
	} else if (sh->target == TARGET_COMPUTE) {
		sh->add_input(0, 1, 0x0F);
		sh->add_input(1, 1, 0x0F);
	}

	bool ps_interp = ctx.hw_class >= HW_CLASS_EVERGREEN
			&& sh->target == TARGET_PS;

	bool ij_interpolators[6];
	memset(ij_interpolators, 0, sizeof(ij_interpolators));

	for (unsigned i = 0; i < pshader->ninput; ++i) {
		r600_shader_io & in = pshader->input[i];
		bool preloaded = sh->target == TARGET_PS && !(ps_interp && in.spi_sid);
		sh->add_input(in.gpr, preloaded, /*in.write_mask*/ 0x0F);
		if (ps_interp && in.spi_sid) {
			int k = eg_get_interpolator_index(in.interpolate, in.interpolate_location);
			if (k >= 0) {
				ij_interpolators[k] |= true;
				if (in.uses_interpolate_at_centroid) {
					k = eg_get_interpolator_index(in.interpolate, TGSI_INTERPOLATE_LOC_CENTROID);
					ij_interpolators[k] |= true;
				}
			}
		}
	}

	if (ps_interp) {
		/* add the egcm ij interpolators to live inputs */
		unsigned num_ij = 0;
		for (unsigned i = 0; i < ARRAY_SIZE(ij_interpolators); i++) {
			num_ij += ij_interpolators[i];
		}

		unsigned mask = (1 << (2 * num_ij)) - 1;
		unsigned gpr = 0;

		while (mask) {
			sh->add_input(gpr, true, mask & 0x0F);
			++gpr;
			mask >>= 4;
		}
	}

	return 0;
}

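/* Decode a single CF instruction at dword offset i, record it in
 * cf_map by CF id, and decode any attached ALU or fetch clause.
 * Sets eop when the instruction ends the program. */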
int bc_parser::decode_cf(unsigned &i, bool &eop) {

	int r;

	cf_node *cf = sh->create_cf();
	sh->root->push_back(cf);

	unsigned id = i >> 1;

	cf->bc.id = id;

	if (cf_map.size() < id + 1)
		cf_map.resize(id + 1);

	cf_map[id] = cf;

	if ((r = dec->decode_cf(i, cf->bc)))
		return r;

	cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags;

	if (flags & CF_ALU) {
		if ((r = decode_alu_clause(cf)))
			return r;
	} else if (flags & CF_FETCH) {
		if ((r = decode_fetch_clause(cf)))
			return r;
	} else if (flags & CF_EXP) {
		if (cf->bc.rw_rel)
			gpr_reladdr = true;
		assert(!cf->bc.rw_rel);
	} else if (flags & CF_MEM) {
		if (cf->bc.rw_rel)
			gpr_reladdr = true;
		assert(!cf->bc.rw_rel);
	} else if (flags & CF_BRANCH) {
		if (cf->bc.addr > max_cf)
			max_cf = cf->bc.addr;
	}

	eop = cf->bc.end_of_program || cf->bc.op == CF_OP_CF_END ||
			cf->bc.op == CF_OP_RET;
	return 0;
}

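/* Decode all instruction groups of an ALU clause; cf->bc.count + 1
 * is the clause size in 64-bit slots, including literal constants. */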
int bc_parser::decode_alu_clause(cf_node* cf) {
	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1, gcnt;

	cf->subtype = NST_ALU_CLAUSE;

	cgroup = 0;
	memset(slots[0], 0, 5*sizeof(slots[0][0]));

	unsigned ng = 0;

	do {
		decode_alu_group(cf, i, gcnt);
		assert(gcnt <= cnt);
		cnt -= gcnt;
		ng++;
	} while (cnt);

	return 0;
}

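/* Decode one ALU instruction group (up to five slots, ending at the
 * instruction with the 'last' bit set). Literal constants follow the
 * group and are encoded in pairs of dwords; gcnt returns the total
 * number of 64-bit slots consumed, including the literal pairs. */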
int bc_parser::decode_alu_group(cf_node* cf, unsigned &i, unsigned &gcnt) {
	int r;
	alu_node *n;
	alu_group_node *g = sh->create_alu_group();

	cgroup = !cgroup;
	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));
	gcnt = 0;

	unsigned literal_mask = 0;

	do {
		n = sh->create_alu();
		g->push_back(n);

		if ((r = dec->decode_alu(i, n->bc)))
			return r;

		if (!sh->assign_slot(n, slots[cgroup])) {
			assert(!"alu slot assignment failed");
			return -1;
		}

		gcnt++;

	} while (gcnt <= 5 && !n->bc.last);

	assert(n->bc.last);

	for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) {
		n = static_cast<alu_node*>(*I);

		if (n->bc.dst_rel)
			gpr_reladdr = true;

		for (int k = 0; k < n->bc.op_ptr->src_count; ++k) {
			bc_alu_src &src = n->bc.src[k];
			if (src.rel)
				gpr_reladdr = true;
			if (src.sel == ALU_SRC_LITERAL) {
				literal_mask |= (1 << src.chan);
				src.value.u = dw[i + src.chan];
			}
		}
	}

	unsigned literal_ndw = 0;
	while (literal_mask) {
		g->literals.push_back(dw[i + literal_ndw]);
		literal_ndw += 1;
		literal_mask >>= 1;
	}

	literal_ndw = (literal_ndw + 1) & ~1u;

	i += literal_ndw;
	gcnt += literal_ndw >> 1;

	cf->push_back(g);
	return 0;
}

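/* Second pass over a decoded ALU clause: turn the raw operands of
 * each group into IR values. */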
int bc_parser::prepare_alu_clause(cf_node* cf) {

	// loop over alu groups
	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {
		assert(I->subtype == NST_ALU_GROUP);
		alu_group_node *g = static_cast<alu_group_node*>(*I);
		prepare_alu_group(cf, g);
	}

	return 0;
}

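/* Helpers to track the most recent MOVA and the values loaded into
 * CF_IDX0/1, so that indexed resource/sampler/UBO accesses can take
 * them as explicit dependencies. */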
void bc_parser::save_set_cf_index(value *val, unsigned idx)
{
	assert(idx <= 1);
	assert(val);
	cf_index_value[idx] = val;
}

value *bc_parser::get_cf_index_value(unsigned idx)
{
	assert(idx <= 1);
	assert(cf_index_value[idx]);
	return cf_index_value[idx];
}

void bc_parser::save_mova(alu_node *mova)
{
	assert(mova);
	this->mova = mova;
}

alu_node *bc_parser::get_mova()
{
	assert(mova);
	return mova;
}

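/* Build IR operands for one ALU group: assign slots, create dst/src
 * values (including PV/PS references into the previous group and
 * kcache constants), then pack multislot instructions into an
 * alu_packed_node. */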
int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {

	alu_node *n;

	cgroup = !cgroup;
	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));

	for (node_iterator I = g->begin(), E = g->end();
			I != E; ++I) {
		n = static_cast<alu_node*>(*I);
		bool ubo_indexing[2] = {};

		if (!sh->assign_slot(n, slots[cgroup])) {
			assert(!"alu slot assignment failed");
			return -1;
		}

		unsigned src_count = n->bc.op_ptr->src_count;

		if (ctx.alu_slots(n->bc.op) & AF_4SLOT)
			n->flags |= NF_ALU_4SLOT;

		if (ctx.alu_slots(n->bc.op) & AF_2SLOT)
			n->flags |= NF_ALU_2SLOT;

		n->src.resize(src_count);

		unsigned flags = n->bc.op_ptr->flags;

		if (flags & AF_LDS) {
			bool need_rw = false, need_oqa = false, need_oqb = false;
			int ndst = 0, ncount = 0;

			/* all non-read operations have side effects */
			if (n->bc.op != LDS_OP2_LDS_READ2_RET &&
			    n->bc.op != LDS_OP1_LDS_READ_REL_RET &&
			    n->bc.op != LDS_OP1_LDS_READ_RET) {
				n->flags |= NF_DONT_KILL;
				ndst++;
				need_rw = true;
			}

			if (n->bc.op >= LDS_OP2_LDS_ADD_RET && n->bc.op <= LDS_OP1_LDS_USHORT_READ_RET) {
				need_oqa = true;
				ndst++;
			}

			if (n->bc.op == LDS_OP2_LDS_READ2_RET || n->bc.op == LDS_OP1_LDS_READ_REL_RET) {
				need_oqb = true;
				ndst++;
			}

			n->dst.resize(ndst);
			if (need_oqa)
				n->dst[ncount++] = sh->get_special_value(SV_LDS_OQA);
			if (need_oqb)
				n->dst[ncount++] = sh->get_special_value(SV_LDS_OQB);
			if (need_rw)
				n->dst[ncount++] = sh->get_special_value(SV_LDS_RW);

			n->flags |= NF_DONT_MOVE | NF_DONT_HOIST;

		} else if (flags & AF_PRED) {
			n->dst.resize(3);
			if (n->bc.update_pred)
				n->dst[1] = sh->get_special_value(SV_ALU_PRED);
			if (n->bc.update_exec_mask)
				n->dst[2] = sh->get_special_value(SV_EXEC_MASK);

			n->flags |= NF_DONT_HOIST;

		} else if (flags & AF_KILL) {

			n->dst.resize(2);
			n->dst[1] = sh->get_special_value(SV_VALID_MASK);
			sh->set_uses_kill();

			n->flags |= NF_DONT_HOIST | NF_DONT_MOVE |
					NF_DONT_KILL | NF_SCHEDULE_EARLY;

		} else {
			n->dst.resize(1);
		}

		if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) {
			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
			// DCE will kill this op
			save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1);
		} else if (flags & AF_MOVA) {

			n->dst[0] = sh->get_special_value(SV_AR_INDEX);
			save_mova(n);

			n->flags |= NF_DONT_HOIST;

		} else if ((n->bc.op_ptr->src_count == 3 || n->bc.write_mask) && !(flags & AF_LDS)) {
			assert(!n->bc.dst_rel || n->bc.index_mode == INDEX_AR_X);

			value *v = sh->get_gpr_value(false, n->bc.dst_gpr, n->bc.dst_chan,
					n->bc.dst_rel);

			n->dst[0] = v;
		}

		if (n->bc.pred_sel) {
			sh->has_alu_predication = true;
			n->pred = sh->get_special_value(SV_ALU_PRED);
		}

		for (unsigned s = 0; s < src_count; ++s) {
			bc_alu_src &src = n->bc.src[s];

			if (src.sel == ALU_SRC_LITERAL) {
				n->src[s] = sh->get_const_value(src.value);
			} else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) {
				unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ?
						((unsigned)SLOT_TRANS) : src.chan;

				// XXX shouldn't happen but llvm backend uses PS on cayman
				if (prev_slot == SLOT_TRANS && ctx.is_cayman())
					prev_slot = SLOT_X;

				alu_node *prev_alu = slots[pgroup][prev_slot];

				assert(prev_alu);

				if (!prev_alu->dst[0]) {
					value * t = sh->create_temp_value();
					prev_alu->dst[0] = t;
				}

				value *d = prev_alu->dst[0];

				if (d->is_rel()) {
					d = sh->get_gpr_value(true, prev_alu->bc.dst_gpr,
					                      prev_alu->bc.dst_chan,
					                      prev_alu->bc.dst_rel);
				}

				n->src[s] = d;
			} else if (ctx.is_kcache_sel(src.sel)) {
				unsigned sel = src.sel, kc_addr;
				unsigned kc_set = ((sel >> 7) & 2) + ((sel >> 5) & 1);

				bc_kcache &kc = cf->bc.kc[kc_set];
				kc_addr = (kc.addr << 4) + (sel & 0x1F);
				n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode);

				if (kc.index_mode != KC_INDEX_NONE) {
					assert(kc.index_mode != KC_LOCK_LOOP);
					ubo_indexing[kc.index_mode - KC_INDEX_0] = true;
				}
			} else if (src.sel < MAX_GPR) {
				value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel);

				n->src[s] = v;

			} else if (src.sel >= ALU_SRC_PARAM_OFFSET) {
				// using slot for value channel because in fact the slot
				// determines the channel that is loaded by INTERP_LOAD_P0
				// (and maybe some others).
				// otherwise GVN will consider INTERP_LOAD_P0s with the same
				// param index as equal instructions and leave only one of them
				n->src[s] = sh->get_special_ro_value(sel_chan(src.sel,
				                                              n->bc.slot));
			} else if (ctx.is_lds_oq(src.sel)) {
				switch (src.sel) {
				case ALU_SRC_LDS_OQ_A:
				case ALU_SRC_LDS_OQ_B:
					assert(!"Unsupported LDS queue access in SB");
					break;
				case ALU_SRC_LDS_OQ_A_POP:
					n->src[s] = sh->get_special_value(SV_LDS_OQA);
					break;
				case ALU_SRC_LDS_OQ_B_POP:
					n->src[s] = sh->get_special_value(SV_LDS_OQB);
					break;
				}
				n->flags |= NF_DONT_HOIST | NF_DONT_MOVE;

			} else {
				switch (src.sel) {
				case ALU_SRC_0:
					n->src[s] = sh->get_const_value(0);
					break;
				case ALU_SRC_0_5:
					n->src[s] = sh->get_const_value(0.5f);
					break;
				case ALU_SRC_1:
					n->src[s] = sh->get_const_value(1.0f);
					break;
				case ALU_SRC_1_INT:
					n->src[s] = sh->get_const_value(1);
					break;
				case ALU_SRC_M_1_INT:
					n->src[s] = sh->get_const_value(-1);
					break;
				default:
					n->src[s] = sh->get_special_ro_value(src.sel);
					break;
				}
			}
		}

		// add UBO index values if any as dependencies
		if (ubo_indexing[0]) {
			n->src.push_back(get_cf_index_value(0));
		}
		if (ubo_indexing[1]) {
			n->src.push_back(get_cf_index_value(1));
		}

		if ((flags & AF_MOVA) && (n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) &&
		    ctx.is_cayman())
			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
			save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1);
	}

	// pack multislot instructions into alu_packed_node

	alu_packed_node *p = NULL;
	for (node_iterator N, I = g->begin(), E = g->end(); I != E; I = N) {
		N = I + 1;
		alu_node *a = static_cast<alu_node*>(*I);
		unsigned sflags = a->bc.slot_flags;

		if (sflags == AF_4V || sflags == AF_2V || (ctx.is_cayman() && sflags == AF_S)) {
			if (!p)
				p = sh->create_alu_packed();

			a->remove();
			p->push_back(a);
			if (sflags == AF_2V && p->count() == 2) {
				g->push_front(p);
				p = NULL;
			}
		}
	}

	if (p) {
		g->push_front(p);

		if (p->count() == 3 && ctx.is_cayman()) {
			// cayman's scalar instruction that can use 3 or 4 slots

			// FIXME for simplicity we'll always add 4th slot,
			// but probably we might want to always remove 4th slot and make
			// sure that regalloc won't choose 'w' component for dst

			alu_node *f = static_cast<alu_node*>(p->first);
			alu_node *a = sh->create_alu();
			a->src = f->src;
			a->dst.resize(f->dst.size());
			a->bc = f->bc;
			a->bc.slot = SLOT_W;
			p->push_back(a);
		}
	}

	return 0;
}

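/* Decode the TEX/VTX/GDS instructions of a fetch clause. */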
int bc_parser::decode_fetch_clause(cf_node* cf) {
	int r;
	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1;

	if (cf->bc.op_ptr->flags & FF_GDS)
		cf->subtype = NST_GDS_CLAUSE;
	else
		cf->subtype = NST_TEX_CLAUSE;

	while (cnt--) {
		fetch_node *n = sh->create_fetch();
		cf->push_back(n);
		if ((r = dec->decode_fetch(i, n->bc)))
			return r;
		if (n->bc.src_rel || n->bc.dst_rel)
			gpr_reladdr = true;
	}
	return 0;
}

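/* Build IR operands for a fetch clause. SET_GRADIENTS_* and
 * SET_TEXTURE_OFFSETS sources are folded into the instructions that
 * use them and re-emitted later by bc_finalizer. */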
int bc_parser::prepare_fetch_clause(cf_node *cf) {

	vvec grad_v, grad_h, texture_offsets;

	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {

		fetch_node *n = static_cast<fetch_node*>(*I);
		assert(n->is_valid());

		unsigned flags = n->bc.op_ptr->flags;

		unsigned vtx = flags & FF_VTX;
		unsigned gds = flags & FF_GDS;
		unsigned num_src = gds ? 2 : vtx ? ctx.vtx_src_num : 4;

		n->dst.resize(4);

		if (gds) {
			n->flags |= NF_DONT_HOIST | NF_DONT_MOVE | NF_DONT_KILL;
		}
		if (flags & (FF_SETGRAD | FF_USEGRAD | FF_GETGRAD)) {
			sh->uses_gradients = true;
		}

		if (flags & (FF_SETGRAD | FF_SET_TEXTURE_OFFSETS)) {

			vvec *grad = NULL;

			switch (n->bc.op) {
				case FETCH_OP_SET_GRADIENTS_V:
					grad = &grad_v;
					break;
				case FETCH_OP_SET_GRADIENTS_H:
					grad = &grad_h;
					break;
				case FETCH_OP_SET_TEXTURE_OFFSETS:
					grad = &texture_offsets;
					break;
				default:
					assert(!"unexpected SET_GRAD instruction");
					return -1;
			}

			if (grad->empty())
				grad->resize(4);

			for(unsigned s = 0; s < 4; ++s) {
				unsigned sw = n->bc.src_sel[s];
				if (sw <= SEL_W)
					(*grad)[s] = sh->get_gpr_value(true, n->bc.src_gpr,
					                               sw, false);
				else if (sw == SEL_0)
					(*grad)[s] = sh->get_const_value(0.0f);
				else if (sw == SEL_1)
					(*grad)[s] = sh->get_const_value(1.0f);
			}
		} else {
			// Fold source values for instructions with hidden target values
			// into the instructions using them. The set instructions are
			// later re-emitted by bc_finalizer
			if (flags & FF_USEGRAD) {
				n->src.resize(12);
				std::copy(grad_v.begin(), grad_v.end(), n->src.begin() + 4);
				std::copy(grad_h.begin(), grad_h.end(), n->src.begin() + 8);
			} else if (flags & FF_USE_TEXTURE_OFFSETS) {
				n->src.resize(8);
				std::copy(texture_offsets.begin(), texture_offsets.end(), n->src.begin() + 4);
			} else {
				n->src.resize(4);
			}

			for(int s = 0; s < 4; ++s) {
				if (n->bc.dst_sel[s] != SEL_MASK)
					n->dst[s] = sh->get_gpr_value(false, n->bc.dst_gpr, s, false);
				// NOTE: it doesn't matter here which components of the result we
				// are using, but original n->bc.dst_sel should be taken into
				// account when building the bytecode
			}
			for(unsigned s = 0; s < num_src; ++s) {
				if (n->bc.src_sel[s] <= SEL_W)
					n->src[s] = sh->get_gpr_value(true, n->bc.src_gpr,
					                              n->bc.src_sel[s], false);
			}

			// Scheduler will emit the appropriate instructions to set CF_IDX0/1
			if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
				n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1));
			}
			if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
				n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1));
			}
		}

		if (n->bc.op == FETCH_OP_READ_SCRATCH) {
			n->src.push_back(sh->get_special_value(SV_SCRATCH));
			n->dst.push_back(sh->get_special_value(SV_SCRATCH));
		}
	}

	return 0;
}

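/* Convert the decoded CF list into structured IR: build regions for
 * loops and if/else constructs, unroll burst exports and memory ops,
 * and add dependencies on special values (geometry emit, scratch). */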
int bc_parser::prepare_ir() {

	for(id_cf_map::iterator I = cf_map.begin(), E = cf_map.end(); I != E; ++I) {
		cf_node *c = *I;

		if (!c)
			continue;

		unsigned flags = c->bc.op_ptr->flags;

		if (flags & CF_ALU) {
			prepare_alu_clause(c);
		} else if (flags & CF_FETCH) {
			prepare_fetch_clause(c);
		} else if (c->bc.op == CF_OP_CALL_FS) {
			sh->init_call_fs(c);
			c->flags |= NF_SCHEDULE_EARLY | NF_DONT_MOVE;
		} else if (flags & CF_LOOP_START) {
			prepare_loop(c);
		} else if (c->bc.op == CF_OP_JUMP) {
			prepare_if(c);
		} else if (c->bc.op == CF_OP_LOOP_END) {
			loop_stack.pop();
		} else if (c->bc.op == CF_OP_LOOP_CONTINUE) {
			assert(!loop_stack.empty());
			repeat_node *rep = sh->create_repeat(loop_stack.top());
			if (c->parent->first != c)
				rep->move(c->parent->first, c);
			c->replace_with(rep);
			sh->simplify_dep_rep(rep);
		} else if (c->bc.op == CF_OP_LOOP_BREAK) {
			assert(!loop_stack.empty());
			depart_node *dep = sh->create_depart(loop_stack.top());
			if (c->parent->first != c)
				dep->move(c->parent->first, c);
			c->replace_with(dep);
			sh->simplify_dep_rep(dep);
		} else if (flags & CF_EXP) {

			// unroll burst exports

			assert(c->bc.op == CF_OP_EXPORT || c->bc.op == CF_OP_EXPORT_DONE);

			c->bc.set_op(CF_OP_EXPORT);

			unsigned burst_count = c->bc.burst_count;
			unsigned eop = c->bc.end_of_program;

			c->bc.end_of_program = 0;
			c->bc.burst_count = 0;

			do {
				c->src.resize(4);

				for(int s = 0; s < 4; ++s) {
					switch (c->bc.sel[s]) {
					case SEL_0:
						c->src[s] = sh->get_const_value(0.0f);
						break;
					case SEL_1:
						c->src[s] = sh->get_const_value(1.0f);
						break;
					case SEL_MASK:
						break;
					default:
						if (c->bc.sel[s] <= SEL_W)
							c->src[s] = sh->get_gpr_value(true, c->bc.rw_gpr,
									c->bc.sel[s], false);
						else
							assert(!"invalid src_sel for export");
					}
				}

				if (!burst_count--)
					break;

				cf_node *cf_next = sh->create_cf();
				cf_next->bc = c->bc;
				++cf_next->bc.rw_gpr;
				++cf_next->bc.array_base;

				c->insert_after(cf_next);
				c = cf_next;

			} while (1);

			c->bc.end_of_program = eop;
		} else if (flags & CF_MEM) {

			unsigned burst_count = c->bc.burst_count;
			unsigned eop = c->bc.end_of_program;

			c->bc.end_of_program = 0;
			c->bc.burst_count = 0;

			do {

				if (ctx.hw_class == HW_CLASS_R600 && c->bc.op == CF_OP_MEM_SCRATCH &&
				    (c->bc.type == 2 || c->bc.type == 3)) {
					c->dst.resize(4);
					for(int s = 0; s < 4; ++s) {
						if (c->bc.comp_mask & (1 << s))
							c->dst[s] =
								sh->get_gpr_value(true, c->bc.rw_gpr, s, false);
					}
				} else {
					c->src.resize(4);

					for(int s = 0; s < 4; ++s) {
						if (c->bc.comp_mask & (1 << s))
							c->src[s] =
								sh->get_gpr_value(true, c->bc.rw_gpr, s, false);
					}
				}

				if (((flags & CF_RAT) || (!(flags & CF_STRM))) && (c->bc.type & 1)) { // indexed write
					c->src.resize(8);
					for(int s = 0; s < 3; ++s) {
						c->src[4 + s] =
							sh->get_gpr_value(true, c->bc.index_gpr, s, false);
					}

					// FIXME probably we can relax it a bit
					c->flags |= NF_DONT_HOIST | NF_DONT_MOVE;
				}

				if (flags & CF_EMIT) {
					// Instruction implicitly depends on prior [EMIT_][CUT]_VERTEX
					c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
					c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
					if (sh->target == TARGET_ES) {
						// For ES shaders this is an export
						c->flags |= NF_DONT_KILL;
					}
				}
				else if (c->bc.op == CF_OP_MEM_SCRATCH) {
					c->src.push_back(sh->get_special_value(SV_SCRATCH));
					c->dst.push_back(sh->get_special_value(SV_SCRATCH));
				}

				if (!burst_count--)
					break;

				cf_node *cf_next = sh->create_cf();
				cf_next->bc = c->bc;
				++cf_next->bc.rw_gpr;

				// FIXME is it correct?
				cf_next->bc.array_base += cf_next->bc.elem_size + 1;

				c->insert_after(cf_next);
				c = cf_next;
			} while (1);

			c->bc.end_of_program = eop;

		} else if (flags & CF_EMIT) {
			/* quick peephole */
			cf_node *prev = static_cast<cf_node *>(c->prev);
			if (c->bc.op == CF_OP_CUT_VERTEX &&
				prev && prev->is_valid() &&
				prev->bc.op == CF_OP_EMIT_VERTEX &&
				c->bc.count == prev->bc.count) {
				prev->bc.set_op(CF_OP_EMIT_CUT_VERTEX);
				prev->bc.end_of_program = c->bc.end_of_program;
				c->remove();
			}
			else {
				c->flags |= NF_DONT_KILL | NF_DONT_HOIST | NF_DONT_MOVE;

				c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
				c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
			}
		} else if (c->bc.op == CF_OP_WAIT_ACK) {
			c->src.push_back(sh->get_special_value(SV_SCRATCH));
			c->dst.push_back(sh->get_special_value(SV_SCRATCH));
		}
	}

	assert(loop_stack.empty());
	return 0;
}

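/* Wrap a LOOP_START..LOOP_END range into region/repeat nodes and push
 * the region on the loop stack for BREAK/CONTINUE handling. */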
int bc_parser::prepare_loop(cf_node* c) {
	assert(c->bc.addr-1 < cf_map.size());

	cf_node *end = cf_map[c->bc.addr - 1];
	assert(end->bc.op == CF_OP_LOOP_END);
	assert(c->parent == end->parent);

	region_node *reg = sh->create_region();
	repeat_node *rep = sh->create_repeat(reg);

	reg->push_back(rep);
	c->insert_before(reg);
	rep->move(c, end->next);

	reg->src_loop = true;

	loop_stack.push(reg);
	return 0;
}

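/* Convert a JUMP (and optional ELSE) into a region with depart nodes
 * and an if_node predicated on the exec mask. */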
int bc_parser::prepare_if(cf_node* c) {
	assert(c->bc.addr-1 < cf_map.size());
	cf_node *c_else = NULL, *end = cf_map[c->bc.addr];

	if (!end)
		return 0; // not quite sure how this happens, malformed input?

	BCP_DUMP(
		sblog << "parsing JUMP @" << c->bc.id;
		sblog << "\n";
	);

	if (end->bc.op == CF_OP_ELSE) {
		BCP_DUMP(
			sblog << "  found ELSE : ";
			dump::dump_op(end);
			sblog << "\n";
		);

		c_else = end;
		end = cf_map[c_else->bc.addr];
	} else {
		BCP_DUMP(
			sblog << "  no else\n";
		);

		c_else = end;
	}

	if (c_else->parent != c->parent)
		c_else = NULL;

	if (end && end->parent != c->parent)
		end = NULL;

	region_node *reg = sh->create_region();

	depart_node *dep2 = sh->create_depart(reg);
	depart_node *dep = sh->create_depart(reg);
	if_node *n_if = sh->create_if();

	c->insert_before(reg);

	if (c_else != end)
		dep->move(c_else, end);
	dep2->move(c, end);

	reg->push_back(dep);
	dep->push_front(n_if);
	n_if->push_back(dep2);

	n_if->cond = sh->get_special_value(SV_EXEC_MASK);

	return 0;
}


} // namespace r600_sb