/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

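/* NOTE: the a2xx ALU can co-issue one vector and one scalar operation per
 * instruction slot, which is why each ir2_sched_instr holds both a vector
 * ("instr") and a scalar ("instr_s") entry.
 */
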
static bool scalar_possible(struct ir2_instr *instr)
{
	if (instr->alu.scalar_opc == SCALAR_NONE)
		return false;

	return src_ncomp(instr) == 1;
}

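/* can instruction "b" be issued in the same slot as instruction "a"?
 * (a == NULL means the slot has no vector instruction yet)
 */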
static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
	if (!a)
		return true;

	/* don't use the same instruction twice */
	if (a == b)
		return false;

	/* PRED_SET must be alone */
	if (b->alu.scalar_opc >= PRED_SETEs &&
		b->alu.scalar_opc <= PRED_SET_RESTOREs)
		return false;

	/* must write to the same export (issues otherwise?) */
	return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned alu_vector_prio(struct ir2_instr *instr)
{
	if (instr->alu.vector_opc == VECTOR_NONE)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* TODO check src type and ncomps */
	if (instr->src_count == 3)
		return 0;

	if (!scalar_possible(instr))
		return 1;

	return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned alu_scalar_prio(struct ir2_instr *instr)
{
	if (!scalar_possible(instr))
		return ~0u;

	/* this case is dealt with later */
	if (instr->src_count > 1)
		return ~0u;

	if (is_export(instr))
		return 4;

	/* push PRED_SET to the end of the block */
	if (instr->alu.scalar_opc >= PRED_SETEs &&
		instr->alu.scalar_opc <= PRED_SET_RESTOREs)
		return 5;

	/* scalar-only instructions get the highest priority */
	return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can co-issue a scalar MOV with
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr*
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
	struct ir2_src src1, unsigned *comp)
{
	struct ir2_sched_instr *sched = NULL, *s;
	unsigned i, mask = 0xf;

	/* walk back to the earliest point where the mov can be inserted */
	for (i = ctx->instr_sched_count-1; i > 0; i--) {
		s = &ctx->instr_sched[i - 1];

		if (s->instr && s->instr->block_idx != block_idx)
			break;
		if (s->instr_s && s->instr_s->block_idx != block_idx)
			break;

		if (src1.type == IR2_SRC_SSA) {
			if ((s->instr && s->instr->idx == src1.num) ||
				(s->instr_s && s->instr_s->idx == src1.num))
				break;
		}

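		/* reg_state packs a 4-bit per-component "in use" mask for each
		 * register, 8 registers per word; mr is the set of components
		 * of this register still free at this point
		 */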
		unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
		if ((mask & mr) == 0)
			break;

		mask &= mr;
		if (s->instr_s || s->instr->src_count == 3)
			continue;

		if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
			continue;

		sched = s;
	}
	*comp = ffs(mask) - 1;
	return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
	struct ir2_src src0 = instr->src[ order];
	struct ir2_src src1 = instr->src[!order];
	struct ir2_sched_instr *sched;
	struct ir2_instr *ins;
	struct ir2_reg *reg;
	unsigned idx, comp;

	switch (src0.type) {
	case IR2_SRC_CONST:
	case IR2_SRC_INPUT:
		return false;
	default:
		break;
	}

	/* TODO, insert needs logic for this */
	if (src1.type == IR2_SRC_REG)
		return false;

	/* we could do something if they match src1.. */
	if (src0.negate || src0.abs)
		return false;

	reg = get_reg_src(ctx, &src0);

	/* the result must not be used elsewhere, since we will overwrite it */
	for (int i = 0; i < 4; i++)
		if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
			return false;

	/* find a place to insert the mov */
	sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
	if (!sched)
		return false;

	ins = &ctx->instr[idx = ctx->instr_count++];
	ins->idx = idx;
	ins->type = IR2_ALU;
	ins->src[0] = src1;
	ins->src_count = 1;
	ins->is_ssa = true;
	ins->ssa.idx = reg->idx;
	ins->ssa.ncomp = 1;
	ins->ssa.comp[0].c = comp;
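	/* use MAXs as a scalar mov (max(x, x) == x) */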
	ins->alu.scalar_opc = MAXs;
	ins->alu.export = -1;
	ins->alu.write_mask = 1;
	ins->pred = instr->pred;
	ins->block_idx = instr->block_idx;

	instr->src[0] = src0;
	instr->alu.src1_swizzle = comp;

	sched->instr_s = ins;
	return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
	struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
	unsigned avail_count = 0;

	instr_alloc_type_t export = ~0u;
	int block_idx = -1;

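	/* find the smallest export buffer with pending exports; only exports
	 * to that buffer will be considered in this round
	 */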
	/* XXX merge this loop with the other one somehow? */
	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;
		if (is_export(instr))
			export = MIN2(export, export_buf(instr->alu.export));
	}

	ir2_foreach_instr(instr, ctx) {
		if (!instr->need_emit)
			continue;

		/* don't mix exports */
		if (is_export(instr) && export_buf(instr->alu.export) != export)
			continue;

		if (block_idx < 0)
			block_idx = instr->block_idx;
		else if (block_idx != instr->block_idx || /* must be same block */
			instr->type == IR2_CF || /* CF/MEM must be alone */
			(is_export(instr) && export == SQ_MEMORY))
			break;
		/* this works because IR2_CF is always at the end of a block,
		 * and the same idea mostly holds for MEM exports, which might
		 * not be alone but will at least end up in order
		 */

		/* check if dependencies are satisfied */
		bool is_ok = true;
		ir2_foreach_src(src, instr) {
			if (src->type == IR2_SRC_REG) {
				/* need to check if all previous instructions in the block
				 * which write the reg have been emitted
				 * slow..
				 * XXX: check components instead of whole register
				 */
				struct ir2_reg *reg = get_reg_src(ctx, src);
				ir2_foreach_instr(p, ctx) {
					if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
						is_ok &= !p->need_emit;
				}
			} else if (src->type == IR2_SRC_SSA) {
				/* in this case it's easy, just check need_emit */
				is_ok &= !ctx->instr[src->num].need_emit;
			}
		}
		if (!is_ok)
			continue;

		avail[avail_count++] = instr;
	}

	if (!avail_count) {
		assert(block_idx == -1);
		return -1;
	}

	/* priority to FETCH instructions */
	ir2_foreach_avail(instr) {
		if (instr->type == IR2_ALU)
			continue;

		ra_src_free(ctx, instr);
		ra_reg(ctx, get_reg(instr), -1, false, 0);

		instr->need_emit = false;
		sched->instr = instr;
		sched->instr_s = NULL;
		return block_idx;
	}

	/* TODO precompute priorities */

	unsigned prio_v = ~0u, prio_s = ~0u, prio;
	ir2_foreach_avail(instr) {
		prio = alu_vector_prio(instr);
		if (prio < prio_v) {
			instr_v = instr;
			prio_v = prio;
		}
	}

	/* TODO can still insert scalar if src_count=3, if smart about it */
	if (!instr_v || instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			bool compat = is_alu_compatible(instr_v, instr);

			prio = alu_scalar_prio(instr);
			if (prio >= prio_v && !compat)
				continue;

			if (prio < prio_s) {
				instr_s = instr;
				prio_s = prio;
				if (!compat)
					instr_v = NULL;
			}
		}
	}

	assert(instr_v || instr_s);

	/* now try the more complex insertion of a vector instruction as a scalar
	 * TODO: if we are smart we can still insert if instr_v->src_count==3
	 */
	if (!instr_s && instr_v->src_count < 3) {
		ir2_foreach_avail(instr) {
			if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
				continue;

			/* at this point, src_count should always be 2 */
			assert(instr->src_count == 2);

			if (scalarize_case1(ctx, instr, 0)) {
				instr_s = instr;
				break;
			}
			if (scalarize_case1(ctx, instr, 1)) {
				instr_s = instr;
				break;
			}
		}
	}

	/* free src registers */
	if (instr_v) {
		instr_v->need_emit = false;
		ra_src_free(ctx, instr_v);
	}

	if (instr_s) {
		instr_s->need_emit = false;
		ra_src_free(ctx, instr_s);
	}

	/* allocate dst registers */
	if (instr_v)
		ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);

	if (instr_s)
		ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);

	sched->instr = instr_v;
	sched->instr_s = instr_s;
	return block_idx;
}

/* scheduling: determine order of instructions */
static void schedule_instrs(struct ir2_context *ctx)
{
	struct ir2_sched_instr *sched;
	int block_idx;

	/* allocate input registers */
	for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
		if (ctx->input[idx].initialized)
			ra_reg(ctx, &ctx->input[idx], idx, false, 0);

	for (;;) {
		sched = &ctx->instr_sched[ctx->instr_sched_count++];
		block_idx = sched_next(ctx, sched);
		if (block_idx < 0)
			break;
		memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

		/* catch texture fetch after scheduling and insert the
		 * SET_TEX_LOD right before it if necessary
		 * TODO clean this up
		 */
		struct ir2_instr *instr = sched->instr, *tex_lod;
		if (instr && instr->type == IR2_FETCH &&
			instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
			/* generate the SET_LOD instruction */
			tex_lod = &ctx->instr[ctx->instr_count++];
			tex_lod->type = IR2_FETCH;
			tex_lod->block_idx = instr->block_idx;
			tex_lod->pred = instr->pred;
			tex_lod->fetch.opc = TEX_SET_TEX_LOD;
			tex_lod->src[0] = instr->src[1];
			tex_lod->src_count = 1;

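			/* shift the TEX_FETCH into the next slot and put
			 * SET_TEX_LOD in its place
			 */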
			sched[1] = sched[0];
			sched->instr = tex_lod;
			ctx->instr_sched_count++;
		}

		bool free_block = true;
		ir2_foreach_instr(instr, ctx)
			free_block &= instr->block_idx != block_idx;
		if (free_block)
			ra_block_free(ctx, block_idx);
	}
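	/* the final slot received no instruction, so drop it */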
	ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
		struct fd2_shader_stateobj *fp)
{
	struct ir2_context ctx = { };
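	/* a vertex shader compiled without a linked fragment shader is the
	 * binning pass variant
	 */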
	bool binning = !fp && so->type == MESA_SHADER_VERTEX;

	if (fp)
		so->variant[variant].f = fp->variant[0].f;

	ctx.so = so;
	ctx.info = &so->variant[variant].info;
	ctx.f = &so->variant[variant].f;
	ctx.info->max_reg = -1;

	/* convert nir to internal representation */
	ir2_nir_compile(&ctx, binning);

	/* copy propagate srcs */
	cp_src(&ctx);

	/* get ref_counts and kill non-needed instructions */
	ra_count_refs(&ctx);

	/* remove movs used to write outputs */
	cp_export(&ctx);

	/* instruction order.. and vector->scalar conversions */
	schedule_instrs(&ctx);

	/* finally, assemble to bitcode */
	assemble(&ctx, binning);
}