1/*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4 *
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial
17 * portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 */
28
29#include "radeon_dataflow.h"
30
31#include "radeon_compiler.h"
32#include "radeon_compiler_util.h"
33#include "radeon_list.h"
34#include "radeon_swizzle.h"
35#include "radeon_variable.h"
36
37struct src_clobbered_reads_cb_data {
38	rc_register_file File;
39	unsigned int Index;
40	unsigned int Mask;
41	struct rc_reader_data * ReaderData;
42};
43
/**
 * Callback used by presub_helper() to rewrite one reader of an ADD so that
 * it consumes a presubtract result instead.
 *
 * Arguments, in call order: the ADD instruction being folded, the
 * instruction that reads the ADD's result, and the index of the reader's
 * source operand that does the reading.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_add,
				     struct rc_instruction *inst_reader,
				     unsigned int src_index);
47
48static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
49{
50	struct rc_src_register combine;
51	combine.File = inner.File;
52	combine.Index = inner.Index;
53	combine.RelAddr = inner.RelAddr;
54	if (outer.Abs) {
55		combine.Abs = 1;
56		combine.Negate = outer.Negate;
57	} else {
58		combine.Abs = inner.Abs;
59		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
60		combine.Negate ^= outer.Negate;
61	}
62	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
63	return combine;
64}
65
66static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
67						struct rc_src_register * src)
68{
69	rc_register_file file = src->File;
70	struct rc_reader_data * reader_data = data;
71
72	if(!rc_inst_can_use_presub(inst,
73				reader_data->Writer->U.I.PreSub.Opcode,
74				rc_swizzle_to_writemask(src->Swizzle),
75				src,
76				&reader_data->Writer->U.I.PreSub.SrcReg[0],
77				&reader_data->Writer->U.I.PreSub.SrcReg[1])) {
78		reader_data->Abort = 1;
79		return;
80	}
81
82	/* XXX This could probably be handled better. */
83	if (file == RC_FILE_ADDRESS) {
84		reader_data->Abort = 1;
85		return;
86	}
87
88	/* These instructions cannot read from the constants file.
89	 * see radeonTransformTEX()
90	 */
91	if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
92			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
93				(inst->U.I.Opcode == RC_OPCODE_TEX ||
94				inst->U.I.Opcode == RC_OPCODE_TXB ||
95				inst->U.I.Opcode == RC_OPCODE_TXP ||
96				inst->U.I.Opcode == RC_OPCODE_TXD ||
97				inst->U.I.Opcode == RC_OPCODE_TXL ||
98				inst->U.I.Opcode == RC_OPCODE_KIL)){
99		reader_data->Abort = 1;
100		return;
101	}
102}
103
104static void src_clobbered_reads_cb(
105	void * data,
106	struct rc_instruction * inst,
107	struct rc_src_register * src)
108{
109	struct src_clobbered_reads_cb_data * sc_data = data;
110
111	if (src->File == sc_data->File
112	    && src->Index == sc_data->Index
113	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
114
115		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
116	}
117
118	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
119		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
120	}
121}
122
123static void is_src_clobbered_scan_write(
124	void * data,
125	struct rc_instruction * inst,
126	rc_register_file file,
127	unsigned int index,
128	unsigned int mask)
129{
130	struct src_clobbered_reads_cb_data sc_data;
131	struct rc_reader_data * reader_data = data;
132	sc_data.File = file;
133	sc_data.Index = index;
134	sc_data.Mask = mask;
135	sc_data.ReaderData = reader_data;
136	rc_for_all_reads_src(reader_data->Writer,
137					src_clobbered_reads_cb, &sc_data);
138}
139
140static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
141{
142	struct rc_reader_data reader_data;
143	unsigned int i;
144
145	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
146	    inst_mov->U.I.WriteALUResult)
147		return;
148
149	/* Get a list of all the readers of this MOV instruction. */
150	reader_data.ExitOnAbort = 1;
151	rc_get_readers(c, inst_mov, &reader_data,
152		       copy_propagate_scan_read, NULL,
153		       is_src_clobbered_scan_write);
154
155	if (reader_data.Abort || reader_data.ReaderCount == 0)
156		return;
157
158	/* We can propagate SaturateMode if all the readers are MOV instructions
159	 * without a presubtract operation, source negation and absolute.
160	 * In that case, we just move SaturateMode to all readers. */
161        if (inst_mov->U.I.SaturateMode) {
162		for (i = 0; i < reader_data.ReaderCount; i++) {
163			struct rc_instruction * inst = reader_data.Readers[i].Inst;
164
165			if (inst->U.I.Opcode != RC_OPCODE_MOV ||
166			    inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
167			    inst->U.I.SrcReg[0].Abs ||
168			    inst->U.I.SrcReg[0].Negate) {
169				return;
170			}
171		}
172	}
173
174	/* Propagate the MOV instruction. */
175	for (i = 0; i < reader_data.ReaderCount; i++) {
176		struct rc_instruction * inst = reader_data.Readers[i].Inst;
177		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
178
179		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
180			inst->U.I.PreSub = inst_mov->U.I.PreSub;
181		if (!inst->U.I.SaturateMode)
182			inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
183	}
184
185	/* Finally, remove the original MOV instruction */
186	rc_remove_instruction(inst_mov);
187}
188
189/**
190 * Check if a source register is actually always the same
191 * swizzle constant.
192 */
193static int is_src_uniform_constant(struct rc_src_register src,
194		rc_swizzle * pswz, unsigned int * pnegate)
195{
196	int have_used = 0;
197
198	if (src.File != RC_FILE_NONE) {
199		*pswz = 0;
200		return 0;
201	}
202
203	for(unsigned int chan = 0; chan < 4; ++chan) {
204		unsigned int swz = GET_SWZ(src.Swizzle, chan);
205		if (swz < 4) {
206			*pswz = 0;
207			return 0;
208		}
209		if (swz == RC_SWIZZLE_UNUSED)
210			continue;
211
212		if (!have_used) {
213			*pswz = swz;
214			*pnegate = GET_BIT(src.Negate, chan);
215			have_used = 1;
216		} else {
217			if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
218				*pswz = 0;
219				return 0;
220			}
221		}
222	}
223
224	return 1;
225}
226
227static void constant_folding_mad(struct rc_instruction * inst)
228{
229	rc_swizzle swz = 0;
230	unsigned int negate= 0;
231
232	if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
233		if (swz == RC_SWIZZLE_ZERO) {
234			inst->U.I.Opcode = RC_OPCODE_MUL;
235			return;
236		}
237	}
238
239	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
240		if (swz == RC_SWIZZLE_ONE) {
241			inst->U.I.Opcode = RC_OPCODE_ADD;
242			if (negate)
243				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
244			inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
245			return;
246		} else if (swz == RC_SWIZZLE_ZERO) {
247			inst->U.I.Opcode = RC_OPCODE_MOV;
248			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
249			return;
250		}
251	}
252
253	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
254		if (swz == RC_SWIZZLE_ONE) {
255			inst->U.I.Opcode = RC_OPCODE_ADD;
256			if (negate)
257				inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
258			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
259			return;
260		} else if (swz == RC_SWIZZLE_ZERO) {
261			inst->U.I.Opcode = RC_OPCODE_MOV;
262			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
263			return;
264		}
265	}
266}
267
268static void constant_folding_mul(struct rc_instruction * inst)
269{
270	rc_swizzle swz = 0;
271	unsigned int negate = 0;
272
273	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
274		if (swz == RC_SWIZZLE_ONE) {
275			inst->U.I.Opcode = RC_OPCODE_MOV;
276			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
277			if (negate)
278				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
279			return;
280		} else if (swz == RC_SWIZZLE_ZERO) {
281			inst->U.I.Opcode = RC_OPCODE_MOV;
282			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
283			return;
284		}
285	}
286
287	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
288		if (swz == RC_SWIZZLE_ONE) {
289			inst->U.I.Opcode = RC_OPCODE_MOV;
290			if (negate)
291				inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
292			return;
293		} else if (swz == RC_SWIZZLE_ZERO) {
294			inst->U.I.Opcode = RC_OPCODE_MOV;
295			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
296			return;
297		}
298	}
299}
300
301static void constant_folding_add(struct rc_instruction * inst)
302{
303	rc_swizzle swz = 0;
304	unsigned int negate = 0;
305
306	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
307		if (swz == RC_SWIZZLE_ZERO) {
308			inst->U.I.Opcode = RC_OPCODE_MOV;
309			inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
310			return;
311		}
312	}
313
314	if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
315		if (swz == RC_SWIZZLE_ZERO) {
316			inst->U.I.Opcode = RC_OPCODE_MOV;
317			return;
318		}
319	}
320}
321
322/**
323 * Replace 0.0, 1.0 and 0.5 immediate constants by their
324 * respective swizzles. Simplify instructions like ADD dst, src, 0;
325 */
326static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
327{
328	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
329	unsigned int i;
330
331	/* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
332	for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
333		struct rc_constant * constant;
334		struct rc_src_register newsrc;
335		int have_real_reference;
336		unsigned int chan;
337
338		/* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
339		for (chan = 0; chan < 4; ++chan)
340			if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
341				break;
342		if (chan == 4) {
343			inst->U.I.SrcReg[src].File = RC_FILE_NONE;
344			continue;
345		}
346
347		/* Convert immediates to swizzles. */
348		if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
349		    inst->U.I.SrcReg[src].RelAddr ||
350		    inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
351			continue;
352
353		constant =
354			&c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
355
356		if (constant->Type != RC_CONSTANT_IMMEDIATE)
357			continue;
358
359		newsrc = inst->U.I.SrcReg[src];
360		have_real_reference = 0;
361		for (chan = 0; chan < 4; ++chan) {
362			unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
363			unsigned int newswz;
364			float imm;
365			float baseimm;
366
367			if (swz >= 4)
368				continue;
369
370			imm = constant->u.Immediate[swz];
371			baseimm = imm;
372			if (imm < 0.0)
373				baseimm = -baseimm;
374
375			if (baseimm == 0.0) {
376				newswz = RC_SWIZZLE_ZERO;
377			} else if (baseimm == 1.0) {
378				newswz = RC_SWIZZLE_ONE;
379			} else if (baseimm == 0.5 && c->has_half_swizzles) {
380				newswz = RC_SWIZZLE_HALF;
381			} else {
382				have_real_reference = 1;
383				continue;
384			}
385
386			SET_SWZ(newsrc.Swizzle, chan, newswz);
387			if (imm < 0.0 && !newsrc.Abs)
388				newsrc.Negate ^= 1 << chan;
389		}
390
391		if (!have_real_reference) {
392			newsrc.File = RC_FILE_NONE;
393			newsrc.Index = 0;
394		}
395
396		/* don't make the swizzle worse */
397		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc) &&
398		    c->SwizzleCaps->IsNative(inst->U.I.Opcode, inst->U.I.SrcReg[src]))
399			continue;
400
401		inst->U.I.SrcReg[src] = newsrc;
402	}
403
404	/* Simplify instructions based on constants */
405	if (inst->U.I.Opcode == RC_OPCODE_MAD)
406		constant_folding_mad(inst);
407
408	/* note: MAD can simplify to MUL or ADD */
409	if (inst->U.I.Opcode == RC_OPCODE_MUL)
410		constant_folding_mul(inst);
411	else if (inst->U.I.Opcode == RC_OPCODE_ADD)
412		constant_folding_add(inst);
413
414	/* In case this instruction has been converted, make sure all of the
415	 * registers that are no longer used are empty. */
416	opcode = rc_get_opcode_info(inst->U.I.Opcode);
417	for(i = opcode->NumSrcRegs; i < 3; i++) {
418		memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
419	}
420}
421
422/**
423 * If src and dst use the same register, this function returns a writemask that
424 * indicates wich components are read by src.  Otherwise zero is returned.
425 */
426static unsigned int src_reads_dst_mask(struct rc_src_register src,
427						struct rc_dst_register dst)
428{
429	if (dst.File != src.File || dst.Index != src.Index) {
430		return 0;
431	}
432	return rc_swizzle_to_writemask(src.Swizzle);
433}
434
435/* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
436 * in any of its channels.  Return 0 otherwise. */
437static int src_has_const_swz(struct rc_src_register src) {
438	int chan;
439	for(chan = 0; chan < 4; chan++) {
440		unsigned int swz = GET_SWZ(src.Swizzle, chan);
441		if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
442						|| swz == RC_SWIZZLE_ONE) {
443			return 1;
444		}
445	}
446	return 0;
447}
448
449static void presub_scan_read(
450	void * data,
451	struct rc_instruction * inst,
452	struct rc_src_register * src)
453{
454	struct rc_reader_data * reader_data = data;
455	rc_presubtract_op * presub_opcode = reader_data->CbData;
456
457	if (!rc_inst_can_use_presub(inst, *presub_opcode,
458			reader_data->Writer->U.I.DstReg.WriteMask,
459			src,
460			&reader_data->Writer->U.I.SrcReg[0],
461			&reader_data->Writer->U.I.SrcReg[1])) {
462		reader_data->Abort = 1;
463		return;
464	}
465}
466
467static int presub_helper(
468	struct radeon_compiler * c,
469	struct rc_instruction * inst_add,
470	rc_presubtract_op presub_opcode,
471	rc_presub_replace_fn presub_replace)
472{
473	struct rc_reader_data reader_data;
474	unsigned int i;
475	rc_presubtract_op cb_op = presub_opcode;
476
477	reader_data.CbData = &cb_op;
478	reader_data.ExitOnAbort = 1;
479	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
480						is_src_clobbered_scan_write);
481
482	if (reader_data.Abort || reader_data.ReaderCount == 0)
483		return 0;
484
485	for(i = 0; i < reader_data.ReaderCount; i++) {
486		unsigned int src_index;
487		struct rc_reader reader = reader_data.Readers[i];
488		const struct rc_opcode_info * info =
489				rc_get_opcode_info(reader.Inst->U.I.Opcode);
490
491		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
492			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
493				presub_replace(inst_add, reader.Inst, src_index);
494		}
495	}
496	return 1;
497}
498
499/* This function assumes that inst_add->U.I.SrcReg[0] and
500 * inst_add->U.I.SrcReg[1] aren't both negative. */
501static void presub_replace_add(
502	struct rc_instruction * inst_add,
503	struct rc_instruction * inst_reader,
504	unsigned int src_index)
505{
506	rc_presubtract_op presub_opcode;
507	if (inst_add->U.I.SrcReg[1].Negate || inst_add->U.I.SrcReg[0].Negate)
508		presub_opcode = RC_PRESUB_SUB;
509	else
510		presub_opcode = RC_PRESUB_ADD;
511
512	if (inst_add->U.I.SrcReg[1].Negate) {
513		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
514		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
515	} else {
516		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
517		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
518	}
519	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
520	inst_reader->U.I.PreSub.SrcReg[1].Negate = 0;
521	inst_reader->U.I.PreSub.Opcode = presub_opcode;
522	inst_reader->U.I.SrcReg[src_index] =
523			chain_srcregs(inst_reader->U.I.SrcReg[src_index],
524					inst_reader->U.I.PreSub.SrcReg[0]);
525	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
526	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
527}
528
529static int is_presub_candidate(
530	struct radeon_compiler * c,
531	struct rc_instruction * inst)
532{
533	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
534	unsigned int i;
535	unsigned int is_constant[2] = {0, 0};
536
537	assert(inst->U.I.Opcode == RC_OPCODE_ADD);
538
539	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
540			|| inst->U.I.SaturateMode
541			|| inst->U.I.WriteALUResult
542			|| inst->U.I.Omod) {
543		return 0;
544	}
545
546	/* If both sources use a constant swizzle, then we can't convert it to
547	 * a presubtract operation.  In fact for the ADD and SUB presubtract
548	 * operations neither source can contain a constant swizzle.  This
549	 * specific case is checked in peephole_add_presub_add() when
550	 * we make sure the swizzles for both sources are equal, so we
551	 * don't need to worry about it here. */
552	for (i = 0; i < 2; i++) {
553		int chan;
554		for (chan = 0; chan < 4; chan++) {
555			rc_swizzle swz =
556				get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
557			if (swz == RC_SWIZZLE_ONE
558					|| swz == RC_SWIZZLE_ZERO
559					|| swz == RC_SWIZZLE_HALF) {
560				is_constant[i] = 1;
561			}
562		}
563	}
564	if (is_constant[0] && is_constant[1])
565		return 0;
566
567	for(i = 0; i < info->NumSrcRegs; i++) {
568		struct rc_src_register src = inst->U.I.SrcReg[i];
569		if (src_reads_dst_mask(src, inst->U.I.DstReg))
570			return 0;
571
572		src.File = RC_FILE_PRESUB;
573		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
574			return 0;
575	}
576	return 1;
577}
578
579static int peephole_add_presub_add(
580	struct radeon_compiler * c,
581	struct rc_instruction * inst_add)
582{
583	unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
584        unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
585        unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
586
587	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
588		return 0;
589
590	/* src0 and src1 can't have absolute values */
591	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
592	        return 0;
593
594	/* presub_replace_add() assumes only one is negative */
595	if (inst_add->U.I.SrcReg[0].Negate && inst_add->U.I.SrcReg[1].Negate)
596	        return 0;
597
598        /* if src0 is negative, at least all bits of dstmask have to be set */
599        if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
600	        return 0;
601
602        /* if src1 is negative, at least all bits of dstmask have to be set */
603        if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
604	        return 0;
605
606	if (!is_presub_candidate(c, inst_add))
607		return 0;
608
609	if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
610		rc_remove_instruction(inst_add);
611		return 1;
612	}
613	return 0;
614}
615
616static void presub_replace_inv(
617	struct rc_instruction * inst_add,
618	struct rc_instruction * inst_reader,
619	unsigned int src_index)
620{
621	/* We must be careful not to modify inst_add, since it
622	 * is possible it will remain part of the program.*/
623	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
624	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
625	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
626	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
627						inst_reader->U.I.PreSub.SrcReg[0]);
628
629	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
630	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
631}
632
633/**
634 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
635 * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
636 * of the add instruction must have the constatnt 1 swizzle.  This function
637 * does not check const registers to see if their value is 1.0, so it should
638 * be called after the constant_folding optimization.
639 * @return
640 * 	0 if the ADD instruction is still part of the program.
641 * 	1 if the ADD instruction is no longer part of the program.
642 */
643static int peephole_add_presub_inv(
644	struct radeon_compiler * c,
645	struct rc_instruction * inst_add)
646{
647	unsigned int i, swz;
648
649	if (!is_presub_candidate(c, inst_add))
650		return 0;
651
652	/* Check if src0 is 1. */
653	/* XXX It would be nice to use is_src_uniform_constant here, but that
654	 * function only works if the register's file is RC_FILE_NONE */
655	for(i = 0; i < 4; i++ ) {
656		swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
657		if(((1 << i) & inst_add->U.I.DstReg.WriteMask)
658						&& swz != RC_SWIZZLE_ONE) {
659			return 0;
660		}
661	}
662
663	/* Check src1. */
664	if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
665						inst_add->U.I.DstReg.WriteMask
666		|| inst_add->U.I.SrcReg[1].Abs
667		|| (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY
668			&& inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT)
669		|| src_has_const_swz(inst_add->U.I.SrcReg[1])) {
670
671		return 0;
672	}
673
674	if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
675		rc_remove_instruction(inst_add);
676		return 1;
677	}
678	return 0;
679}
680
/**
 * State shared between peephole_mul_omod() and its omod_filter_*_cb()
 * callbacks.
 */
struct peephole_mul_cb_data {
	/* Destination register of the MUL that is being folded. */
	struct rc_dst_register *Writer;
	/* Set to 1 by the callbacks once an instruction reads or writes
	 * channels of that destination. */
	unsigned int Clobbered;
};
685
686static void omod_filter_reader_cb(
687	void * userdata,
688	struct rc_instruction * inst,
689	rc_register_file file,
690	unsigned int index,
691	unsigned int mask)
692{
693	struct peephole_mul_cb_data * d = userdata;
694	if (rc_src_reads_dst_mask(file, mask, index,
695		d->Writer->File, d->Writer->Index, d->Writer->WriteMask)) {
696
697		d->Clobbered = 1;
698	}
699}
700
701static void omod_filter_writer_cb(
702	void * userdata,
703	struct rc_instruction * inst,
704	rc_register_file file,
705	unsigned int index,
706	unsigned int mask)
707{
708	struct peephole_mul_cb_data * d = userdata;
709	if (file == d->Writer->File && index == d->Writer->Index &&
710					(mask & d->Writer->WriteMask)) {
711		d->Clobbered = 1;
712	}
713}
714
715static int peephole_mul_omod(
716	struct radeon_compiler * c,
717	struct rc_instruction * inst_mul,
718	struct rc_list * var_list)
719{
720	unsigned int chan = 0, swz, i;
721	int const_index = -1;
722	int temp_index = -1;
723	float const_value;
724	rc_omod_op omod_op = RC_OMOD_DISABLE;
725	struct rc_list * writer_list;
726	struct rc_variable * var;
727	struct peephole_mul_cb_data cb_data;
728	unsigned writemask_sum;
729
730	for (i = 0; i < 2; i++) {
731		unsigned int j;
732		if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
733			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY) {
734			return 0;
735		}
736		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
737			if (temp_index != -1) {
738				/* The instruction has two temp sources */
739				return 0;
740			} else {
741				temp_index = i;
742				continue;
743			}
744		}
745		/* If we get this far Src[i] must be a constant src */
746		if (inst_mul->U.I.SrcReg[i].Negate) {
747			return 0;
748		}
749		/* The constant src needs to read from the same swizzle */
750		swz = RC_SWIZZLE_UNUSED;
751		chan = 0;
752		for (j = 0; j < 4; j++) {
753			unsigned int j_swz =
754				GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
755			if (j_swz == RC_SWIZZLE_UNUSED) {
756				continue;
757			}
758			if (swz == RC_SWIZZLE_UNUSED) {
759				swz = j_swz;
760				chan = j;
761			} else if (j_swz != swz) {
762				return 0;
763			}
764		}
765
766		if (const_index != -1) {
767			/* The instruction has two constant sources */
768			return 0;
769		} else {
770			const_index = i;
771		}
772	}
773
774	if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
775				inst_mul->U.I.SrcReg[const_index].Index)) {
776		return 0;
777	}
778	const_value = rc_get_constant_value(c,
779			inst_mul->U.I.SrcReg[const_index].Index,
780			inst_mul->U.I.SrcReg[const_index].Swizzle,
781			inst_mul->U.I.SrcReg[const_index].Negate,
782			chan);
783
784	if (const_value == 2.0f) {
785		omod_op = RC_OMOD_MUL_2;
786	} else if (const_value == 4.0f) {
787		omod_op = RC_OMOD_MUL_4;
788	} else if (const_value == 8.0f) {
789		omod_op = RC_OMOD_MUL_8;
790	} else if (const_value == (1.0f / 2.0f)) {
791		omod_op = RC_OMOD_DIV_2;
792	} else if (const_value == (1.0f / 4.0f)) {
793		omod_op = RC_OMOD_DIV_4;
794	} else if (const_value == (1.0f / 8.0f)) {
795		omod_op = RC_OMOD_DIV_8;
796	} else {
797		return 0;
798	}
799
800	writer_list = rc_variable_list_get_writers_one_reader(var_list,
801		RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
802
803	if (!writer_list) {
804		return 0;
805	}
806
807	cb_data.Clobbered = 0;
808	cb_data.Writer = &inst_mul->U.I.DstReg;
809	for (var = writer_list->Item; var; var = var->Friend) {
810		struct rc_instruction * inst;
811		const struct rc_opcode_info * info = rc_get_opcode_info(
812				var->Inst->U.I.Opcode);
813		if (info->HasTexture) {
814			return 0;
815		}
816		if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
817			return 0;
818		}
819		for (inst = inst_mul->Prev; inst != var->Inst;
820							inst = inst->Prev) {
821			rc_for_all_reads_mask(inst, omod_filter_reader_cb,
822								&cb_data);
823			rc_for_all_writes_mask(inst, omod_filter_writer_cb,
824								&cb_data);
825			if (cb_data.Clobbered) {
826				break;
827			}
828		}
829	}
830
831	if (cb_data.Clobbered) {
832		return 0;
833	}
834
835	/* Rewrite the instructions */
836	writemask_sum = rc_variable_writemask_sum(writer_list->Item);
837	for (var = writer_list->Item; var; var = var->Friend) {
838		struct rc_variable * writer = var;
839		unsigned conversion_swizzle = rc_make_conversion_swizzle(
840					writemask_sum,
841					inst_mul->U.I.DstReg.WriteMask);
842		writer->Inst->U.I.Omod = omod_op;
843		writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
844		writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
845		rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
846		writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
847	}
848
849	rc_remove_instruction(inst_mul);
850
851	return 1;
852}
853
854/**
855 * @return
856 * 	0 if inst is still part of the program.
857 * 	1 if inst is no longer part of the program.
858 */
859static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
860{
861	switch(inst->U.I.Opcode){
862	case RC_OPCODE_ADD:
863		if (c->has_presub) {
864			if(peephole_add_presub_inv(c, inst))
865				return 1;
866			if(peephole_add_presub_add(c, inst))
867				return 1;
868		}
869		break;
870	default:
871		break;
872	}
873	return 0;
874}
875
876void rc_optimize(struct radeon_compiler * c, void *user)
877{
878	struct rc_instruction * inst = c->Program.Instructions.Next;
879	struct rc_list * var_list;
880	while(inst != &c->Program.Instructions) {
881		struct rc_instruction * cur = inst;
882		inst = inst->Next;
883
884		constant_folding(c, cur);
885
886		if(peephole(c, cur))
887			continue;
888
889		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
890			copy_propagate(c, cur);
891			/* cur may no longer be part of the program */
892		}
893	}
894
895	if (!c->has_omod) {
896		return;
897	}
898
899	inst = c->Program.Instructions.Next;
900	while(inst != &c->Program.Instructions) {
901		struct rc_instruction * cur = inst;
902		inst = inst->Next;
903		if (cur->U.I.Opcode == RC_OPCODE_MUL) {
904			var_list = rc_get_variables(c);
905			peephole_mul_omod(c, cur, var_list);
906		}
907	}
908}
909