1/*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Jonathan Marek <jonathan@marek.ca>
25 */
26
27#include "ir2_private.h"
28
29static bool scalar_possible(struct ir2_instr *instr)
30{
31	if (instr->alu.scalar_opc == SCALAR_NONE)
32		return false;
33
34	return src_ncomp(instr) == 1;
35}
36
37static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
38{
39	if (!a)
40		return true;
41
42	/* dont use same instruction twice */
43	if (a == b)
44		return false;
45
46	/* PRED_SET must be alone */
47	if (b->alu.scalar_opc >= PRED_SETEs &&
48		b->alu.scalar_opc <= PRED_SET_RESTOREs)
49		return false;
50
51	/* must write to same export (issues otherwise?) */
52	return a->alu.export == b->alu.export;
53}
54
55/* priority of vector instruction for scheduling (lower=higher prio) */
56static unsigned alu_vector_prio(struct ir2_instr *instr)
57{
58	if (instr->alu.vector_opc == VECTOR_NONE)
59		return ~0u;
60
61	if (is_export(instr))
62		return 4;
63
64	/* TODO check src type and ncomps */
65	if (instr->src_count == 3)
66		return 0;
67
68	if (!scalar_possible(instr))
69		return 1;
70
71	return instr->src_count == 2 ? 2 : 3;
72}
73
74/* priority of scalar instruction for scheduling (lower=higher prio) */
75static unsigned alu_scalar_prio(struct ir2_instr *instr)
76{
77	if (!scalar_possible(instr))
78		return ~0u;
79
80	/* this case is dealt with later */
81	if (instr->src_count > 1)
82		return ~0u;
83
84	if (is_export(instr))
85		return 4;
86
87	/* PRED to end of block */
88	if (instr->alu.scalar_opc >= PRED_SETEs &&
89		instr->alu.scalar_opc <= PRED_SET_RESTOREs)
90		return 5;
91
92	/* scalar only have highest priority */
93	return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
94}
95
96/* this is a bit messy:
97 * we want to find a slot where we can insert a scalar MOV with
98 * a vector instruction that was already scheduled
99 */
100static struct ir2_sched_instr*
101insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
102	struct ir2_src src1, unsigned *comp)
103{
104	struct ir2_sched_instr *sched = NULL, *s;
105	unsigned i, mask = 0xf;
106
107	/* go first earliest point where the mov can be inserted */
108	for (i = ctx->instr_sched_count-1; i > 0; i--) {
109		s = &ctx->instr_sched[i - 1];
110
111		if (s->instr && s->instr->block_idx != block_idx)
112			break;
113		if (s->instr_s && s->instr_s->block_idx != block_idx)
114			break;
115
116		if (src1.type == IR2_SRC_SSA) {
117			if ((s->instr && s->instr->idx == src1.num) ||
118				(s->instr_s && s->instr_s->idx == src1.num))
119				break;
120		}
121
122		unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
123		if ((mask & mr) == 0)
124			break;
125
126		mask &= mr;
127		if (s->instr_s || s->instr->src_count == 3)
128			continue;
129
130		if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
131			continue;
132
133		sched = s;
134	}
135	*comp = ffs(mask) - 1;
136	return sched;
137}
138
139/* case1:
140 * in this case, insert a mov to place the 2nd src into to same reg
141 * (scalar sources come from the same register)
142 *
143 * this is a common case which works when one of the srcs is input/const
144 * but for instrs which have 2 ssa/reg srcs, then its not ideal
145 */
146static bool
147scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
148{
149	struct ir2_src src0 = instr->src[ order];
150	struct ir2_src src1 = instr->src[!order];
151	struct ir2_sched_instr *sched;
152	struct ir2_instr *ins;
153	struct ir2_reg *reg;
154	unsigned idx, comp;
155
156	switch (src0.type) {
157	case IR2_SRC_CONST:
158	case IR2_SRC_INPUT:
159		return false;
160	default:
161		break;
162	}
163
164	/* TODO, insert needs logic for this */
165	if (src1.type == IR2_SRC_REG)
166		return false;
167
168	/* we could do something if they match src1.. */
169	if (src0.negate || src0.abs)
170		return false;
171
172	reg = get_reg_src(ctx, &src0);
173
174	/* result not used more since we will overwrite */
175	for (int i = 0; i < 4; i++)
176		if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
177			return false;
178
179	/* find a place to insert the mov */
180	sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
181	if (!sched)
182		return false;
183
184	ins = &ctx->instr[idx = ctx->instr_count++];
185	ins->idx = idx;
186	ins->type = IR2_ALU;
187	ins->src[0] = src1;
188	ins->src_count = 1;
189	ins->is_ssa = true;
190	ins->ssa.idx = reg->idx;
191	ins->ssa.ncomp = 1;
192	ins->ssa.comp[0].c = comp;
193	ins->alu.scalar_opc = MAXs;
194	ins->alu.export = -1;
195	ins->alu.write_mask = 1;
196	ins->pred = instr->pred;
197	ins->block_idx = instr->block_idx;
198
199	instr->src[0] = src0;
200	instr->alu.src1_swizzle = comp;
201
202	sched->instr_s = ins;
203	return true;
204}
205
206/* fill sched with next fetch or (vector and/or scalar) alu instruction */
207static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
208{
209	struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
210	unsigned avail_count = 0;
211
212	instr_alloc_type_t export = ~0u;
213	int block_idx = -1;
214
215	/* XXX merge this loop with the other one somehow? */
216	ir2_foreach_instr(instr, ctx) {
217		if (!instr->need_emit)
218			continue;
219		if (is_export(instr))
220			export = MIN2(export, export_buf(instr->alu.export));
221	}
222
223	ir2_foreach_instr(instr, ctx) {
224		if (!instr->need_emit)
225			continue;
226
227		/* dont mix exports */
228		if (is_export(instr) && export_buf(instr->alu.export) != export)
229			continue;
230
231		if (block_idx < 0)
232			block_idx = instr->block_idx;
233		else if (block_idx != instr->block_idx || /* must be same block */
234			instr->type == IR2_CF || /* CF/MEM must be alone */
235			(is_export(instr) && export == SQ_MEMORY))
236			break;
237		/* it works because IR2_CF is always at end of block
238		 * and somewhat same idea with MEM exports, which might not be alone
239		 * but will end up in-order at least
240		 */
241
242		/* check if dependencies are satisfied */
243		bool is_ok = true;
244		ir2_foreach_src(src, instr) {
245			if (src->type == IR2_SRC_REG) {
246				/* need to check if all previous instructions in the block
247				 * which write the reg have been emitted
248				 * slow..
249				 * XXX: check components instead of whole register
250				 */
251				struct ir2_reg *reg = get_reg_src(ctx, src);
252				ir2_foreach_instr(p, ctx) {
253					if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
254						is_ok &= !p->need_emit;
255				}
256			} else if (src->type == IR2_SRC_SSA) {
257				/* in this case its easy, just check need_emit */
258				is_ok &= !ctx->instr[src->num].need_emit;
259			}
260		}
261		if (!is_ok)
262			continue;
263
264		avail[avail_count++] = instr;
265	}
266
267	if (!avail_count) {
268		assert(block_idx == -1);
269		return -1;
270	}
271
272	/* priority to FETCH instructions */
273	ir2_foreach_avail(instr) {
274		if (instr->type == IR2_ALU)
275			continue;
276
277		ra_src_free(ctx, instr);
278		ra_reg(ctx, get_reg(instr), -1, false, 0);
279
280		instr->need_emit = false;
281		sched->instr = instr;
282		sched->instr_s = NULL;
283		return block_idx;
284	}
285
286	/* TODO precompute priorities */
287
288	unsigned prio_v = ~0u, prio_s = ~0u, prio;
289	ir2_foreach_avail(instr) {
290		prio = alu_vector_prio(instr);
291		if (prio < prio_v) {
292			instr_v = instr;
293			prio_v = prio;
294		}
295	}
296
297	/* TODO can still insert scalar if src_count=3, if smart about it */
298	if (!instr_v || instr_v->src_count < 3) {
299		ir2_foreach_avail(instr) {
300			bool compat = is_alu_compatible(instr_v, instr);
301
302			prio = alu_scalar_prio(instr);
303			if (prio >= prio_v && !compat)
304				continue;
305
306			if (prio < prio_s) {
307				instr_s = instr;
308				prio_s = prio;
309				if (!compat)
310					instr_v = NULL;
311			}
312		}
313	}
314
315	assert(instr_v || instr_s);
316
317	/* now, we try more complex insertion of vector instruction as scalar
318	 * TODO: if we are smart we can still insert if instr_v->src_count==3
319	 */
320	if (!instr_s && instr_v->src_count < 3) {
321		ir2_foreach_avail(instr) {
322			if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
323				continue;
324
325			/* at this point, src_count should always be 2 */
326			assert(instr->src_count == 2);
327
328			if (scalarize_case1(ctx, instr, 0)) {
329				instr_s = instr;
330				break;
331			}
332			if (scalarize_case1(ctx, instr, 1)) {
333				instr_s = instr;
334				break;
335			}
336		}
337	}
338
339	/* free src registers */
340	if (instr_v) {
341		instr_v->need_emit = false;
342		ra_src_free(ctx, instr_v);
343	}
344
345	if (instr_s) {
346		instr_s->need_emit = false;
347		ra_src_free(ctx, instr_s);
348	}
349
350	/* allocate dst registers */
351	if (instr_v)
352		ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);
353
354	if (instr_s)
355		ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);
356
357	sched->instr = instr_v;
358	sched->instr_s = instr_s;
359	return block_idx;
360}
361
362/* scheduling: determine order of instructions */
363static void schedule_instrs(struct ir2_context *ctx)
364{
365	struct ir2_sched_instr *sched;
366	int block_idx;
367
368	/* allocate input registers */
369	for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
370		if (ctx->input[idx].initialized)
371			ra_reg(ctx, &ctx->input[idx], idx, false, 0);
372
373	for (;;) {
374		sched = &ctx->instr_sched[ctx->instr_sched_count++];
375		block_idx = sched_next(ctx, sched);
376		if (block_idx < 0)
377			break;
378		memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));
379
380		/* catch texture fetch after scheduling and insert the
381		 * SET_TEX_LOD right before it if necessary
382		 * TODO clean this up
383		 */
384		struct ir2_instr *instr = sched->instr, *tex_lod;
385		if (instr && instr->type == IR2_FETCH &&
386			instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
387			/* generate the SET_LOD instruction */
388			tex_lod = &ctx->instr[ctx->instr_count++];
389			tex_lod->type = IR2_FETCH;
390			tex_lod->block_idx = instr->block_idx;
391			tex_lod->pred = instr->pred;
392			tex_lod->fetch.opc = TEX_SET_TEX_LOD;
393			tex_lod->src[0] = instr->src[1];
394			tex_lod->src_count = 1;
395
396			sched[1] = sched[0];
397			sched->instr = tex_lod;
398			ctx->instr_sched_count++;
399		}
400
401		bool free_block = true;
402		ir2_foreach_instr(instr, ctx)
403			free_block &= instr->block_idx != block_idx;
404		if (free_block)
405			ra_block_free(ctx, block_idx);
406	};
407	ctx->instr_sched_count--;
408}
409
410void
411ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
412		struct fd2_shader_stateobj *fp)
413{
414	struct ir2_context ctx = { };
415	bool binning = !fp && so->type == MESA_SHADER_VERTEX;
416
417	if (fp)
418		so->variant[variant].f = fp->variant[0].f;
419
420	ctx.so = so;
421	ctx.info = &so->variant[variant].info;
422	ctx.f = &so->variant[variant].f;
423	ctx.info->max_reg = -1;
424
425	/* convert nir to internal representation */
426	ir2_nir_compile(&ctx, binning);
427
428	/* copy propagate srcs */
429	cp_src(&ctx);
430
431	/* get ref_counts and kill non-needed instructions */
432	ra_count_refs(&ctx);
433
434	/* remove movs used to write outputs */
435	cp_export(&ctx);
436
437	/* instruction order.. and vector->scalar conversions */
438	schedule_instrs(&ctx);
439
440	/* finally, assemble to bitcode */
441	assemble(&ctx, binning);
442}
443