1/*
2   Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3   Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4   develop this 3D driver.
5
6   Permission is hereby granted, free of charge, to any person obtaining
7   a copy of this software and associated documentation files (the
8   "Software"), to deal in the Software without restriction, including
9   without limitation the rights to use, copy, modify, merge, publish,
10   distribute, sublicense, and/or sell copies of the Software, and to
11   permit persons to whom the Software is furnished to do so, subject to
12   the following conditions:
13
14   The above copyright notice and this permission notice (including the
15   next paragraph) shall be included in all copies or substantial
16   portions of the Software.
17
18   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21   IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27/*
28 * Authors:
29 *   Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32#include "brw_eu.h"
33
34#include <string.h>
35#include <stdlib.h>
36
/* Element count of a true array (not a pointer); the argument is
 * parenthesized so any array-valued expression expands safely.
 */
#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47	if (reg.width == BRW_WIDTH_8 && p->compressed)
48		insn->header.execution_size = BRW_EXECUTE_16;
49	else
50		insn->header.execution_size = reg.width;
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  unsigned msg_reg_nr)
65{
66	if (p->gen < 060)
67		return;
68
69	if (src->file == BRW_MESSAGE_REGISTER_FILE)
70		return;
71
72	if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
73		brw_push_insn_state(p);
74		brw_set_mask_control(p, BRW_MASK_DISABLE);
75		brw_set_compression_control(p, BRW_COMPRESSION_NONE);
76		brw_MOV(p, __retype_ud(brw_message_reg(msg_reg_nr)), __retype_ud(*src));
77		brw_pop_insn_state(p);
78	}
79	*src = brw_message_reg(msg_reg_nr);
80}
81
82static void
83gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
84{
85	/* From the BSpec / ISA Reference / send - [DevIVB+]:
86	 * "The send with EOT should use register space R112-R127 for <src>. This is
87	 *  to enable loading of a new thread into the same slot while the message
88	 *  with EOT for current thread is pending dispatch."
89	 *
90	 * Since we're pretending to have 16 MRFs anyway, we may as well use the
91	 * registers required for messages with EOT.
92	 */
93	if (p->gen >= 070 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
94		reg->file = BRW_GENERAL_REGISTER_FILE;
95		reg->nr += 111;
96	}
97}
98
99void
100brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
101	     struct brw_reg dest)
102{
103	if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
104	    dest.file != BRW_MESSAGE_REGISTER_FILE)
105		assert(dest.nr < 128);
106
107	gen7_convert_mrf_to_grf(p, &dest);
108
109	insn->bits1.da1.dest_reg_file = dest.file;
110	insn->bits1.da1.dest_reg_type = dest.type;
111	insn->bits1.da1.dest_address_mode = dest.address_mode;
112
113	if (dest.address_mode == BRW_ADDRESS_DIRECT) {
114		insn->bits1.da1.dest_reg_nr = dest.nr;
115
116		if (insn->header.access_mode == BRW_ALIGN_1) {
117			insn->bits1.da1.dest_subreg_nr = dest.subnr;
118			if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
119				dest.hstride = BRW_HORIZONTAL_STRIDE_1;
120			insn->bits1.da1.dest_horiz_stride = dest.hstride;
121		} else {
122			insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
123			insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
124			/* even ignored in da16, still need to set as '01' */
125			insn->bits1.da16.dest_horiz_stride = 1;
126		}
127	} else {
128		insn->bits1.ia1.dest_subreg_nr = dest.subnr;
129
130		/* These are different sizes in align1 vs align16:
131		*/
132		if (insn->header.access_mode == BRW_ALIGN_1) {
133			insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
134			if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
135				dest.hstride = BRW_HORIZONTAL_STRIDE_1;
136			insn->bits1.ia1.dest_horiz_stride = dest.hstride;
137		}
138		else {
139			insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
140			/* even ignored in da16, still need to set as '01' */
141			insn->bits1.ia16.dest_horiz_stride = 1;
142		}
143	}
144
145	guess_execution_size(p, insn, dest);
146}
147
/* Size in bytes of one element, indexed by the 3-bit register type
 * encoding.  Encoding 6 has no entry and reads as 0.
 */
static const int reg_type_size[8] = {
	4, 4,	/* encodings 0, 1 */
	2, 2,	/* encodings 2, 3 */
	1, 1,	/* encodings 4, 5 */
	0,	/* encoding 6: no size assigned */
	4	/* encoding 7 */
};
157
158static void
159validate_reg(struct brw_instruction *insn, struct brw_reg reg)
160{
161	int hstride_for_reg[] = {0, 1, 2, 4};
162	int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
163	int width_for_reg[] = {1, 2, 4, 8, 16};
164	int execsize_for_reg[] = {1, 2, 4, 8, 16};
165	int width, hstride, vstride, execsize;
166
167	if (reg.file == BRW_IMMEDIATE_VALUE) {
168		/* 3.3.6: Region Parameters.  Restriction: Immediate vectors
169		 * mean the destination has to be 128-bit aligned and the
170		 * destination horiz stride has to be a word.
171		 */
172		if (reg.type == BRW_REGISTER_TYPE_V) {
173			assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
174			       reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
175		}
176
177		return;
178	}
179
180	if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
181	    reg.file == BRW_ARF_NULL)
182		return;
183
184	assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
185	assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
186	assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
187	assert(insn->header.execution_size >= 0 && insn->header.execution_size < ARRAY_SIZE(execsize_for_reg));
188
189	hstride = hstride_for_reg[reg.hstride];
190
191	if (reg.vstride == 0xf) {
192		vstride = -1;
193	} else {
194		vstride = vstride_for_reg[reg.vstride];
195	}
196
197	width = width_for_reg[reg.width];
198
199	execsize = execsize_for_reg[insn->header.execution_size];
200
201	/* Restrictions from 3.3.10: Register Region Restrictions. */
202	/* 3. */
203	assert(execsize >= width);
204
205	/* 4. */
206	if (execsize == width && hstride != 0) {
207		assert(vstride == -1 || vstride == width * hstride);
208	}
209
210	/* 5. */
211	if (execsize == width && hstride == 0) {
212		/* no restriction on vstride. */
213	}
214
215	/* 6. */
216	if (width == 1) {
217		assert(hstride == 0);
218	}
219
220	/* 7. */
221	if (execsize == 1 && width == 1) {
222		assert(hstride == 0);
223		assert(vstride == 0);
224	}
225
226	/* 8. */
227	if (vstride == 0 && hstride == 0) {
228		assert(width == 1);
229	}
230
231	/* 10. Check destination issues. */
232}
233
234void
235brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
236	     struct brw_reg reg)
237{
238	if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
239		assert(reg.nr < 128);
240
241	gen7_convert_mrf_to_grf(p, &reg);
242
243	validate_reg(insn, reg);
244
245	insn->bits1.da1.src0_reg_file = reg.file;
246	insn->bits1.da1.src0_reg_type = reg.type;
247	insn->bits2.da1.src0_abs = reg.abs;
248	insn->bits2.da1.src0_negate = reg.negate;
249	insn->bits2.da1.src0_address_mode = reg.address_mode;
250
251	if (reg.file == BRW_IMMEDIATE_VALUE) {
252		insn->bits3.ud = reg.dw1.ud;
253
254		/* Required to set some fields in src1 as well:
255		*/
256		insn->bits1.da1.src1_reg_file = 0; /* arf */
257		insn->bits1.da1.src1_reg_type = reg.type;
258	} else {
259		if (reg.address_mode == BRW_ADDRESS_DIRECT) {
260			if (insn->header.access_mode == BRW_ALIGN_1) {
261				insn->bits2.da1.src0_subreg_nr = reg.subnr;
262				insn->bits2.da1.src0_reg_nr = reg.nr;
263			} else {
264				insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
265				insn->bits2.da16.src0_reg_nr = reg.nr;
266			}
267		} else {
268			insn->bits2.ia1.src0_subreg_nr = reg.subnr;
269
270			if (insn->header.access_mode == BRW_ALIGN_1) {
271				insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
272			} else {
273				insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
274			}
275		}
276
277		if (insn->header.access_mode == BRW_ALIGN_1) {
278			if (reg.width == BRW_WIDTH_1 &&
279			    insn->header.execution_size == BRW_EXECUTE_1) {
280				insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
281				insn->bits2.da1.src0_width = BRW_WIDTH_1;
282				insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
283			} else {
284				insn->bits2.da1.src0_horiz_stride = reg.hstride;
285				insn->bits2.da1.src0_width = reg.width;
286				insn->bits2.da1.src0_vert_stride = reg.vstride;
287			}
288		} else {
289			insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290			insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291			insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292			insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
294			/* This is an oddity of the fact we're using the same
295			 * descriptions for registers in align_16 as align_1:
296			 */
297			if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298				insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299			else
300				insn->bits2.da16.src0_vert_stride = reg.vstride;
301		}
302	}
303}
304
305void brw_set_src1(struct brw_compile *p,
306		  struct brw_instruction *insn,
307		  struct brw_reg reg)
308{
309	assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
310	assert(reg.nr < 128);
311
312	gen7_convert_mrf_to_grf(p, &reg);
313
314	validate_reg(insn, reg);
315
316	insn->bits1.da1.src1_reg_file = reg.file;
317	insn->bits1.da1.src1_reg_type = reg.type;
318	insn->bits3.da1.src1_abs = reg.abs;
319	insn->bits3.da1.src1_negate = reg.negate;
320
321	/* Only src1 can be immediate in two-argument instructions. */
322	assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
323
324	if (reg.file == BRW_IMMEDIATE_VALUE) {
325		insn->bits3.ud = reg.dw1.ud;
326	} else {
327		/* This is a hardware restriction, which may or may not be lifted
328		 * in the future:
329		 */
330		assert (reg.address_mode == BRW_ADDRESS_DIRECT);
331		/* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
332
333		if (insn->header.access_mode == BRW_ALIGN_1) {
334			insn->bits3.da1.src1_subreg_nr = reg.subnr;
335			insn->bits3.da1.src1_reg_nr = reg.nr;
336		} else {
337			insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
338			insn->bits3.da16.src1_reg_nr = reg.nr;
339		}
340
341		if (insn->header.access_mode == BRW_ALIGN_1) {
342			if (reg.width == BRW_WIDTH_1 &&
343			    insn->header.execution_size == BRW_EXECUTE_1) {
344				insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
345				insn->bits3.da1.src1_width = BRW_WIDTH_1;
346				insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
347			} else {
348				insn->bits3.da1.src1_horiz_stride = reg.hstride;
349				insn->bits3.da1.src1_width = reg.width;
350				insn->bits3.da1.src1_vert_stride = reg.vstride;
351			}
352		} else {
353			insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
354			insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
355			insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
356			insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
357
358			/* This is an oddity of the fact we're using the same
359			 * descriptions for registers in align_16 as align_1:
360			 */
361			if (reg.vstride == BRW_VERTICAL_STRIDE_8)
362				insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
363			else
364				insn->bits3.da16.src1_vert_stride = reg.vstride;
365		}
366	}
367}
368
369/**
370 * Set the Message Descriptor and Extended Message Descriptor fields
371 * for SEND messages.
372 *
373 * \note This zeroes out the Function Control bits, so it must be called
374 *       \b before filling out any message-specific data.  Callers can
375 *       choose not to fill in irrelevant bits; they will be zero.
376 */
377static void
378brw_set_message_descriptor(struct brw_compile *p,
379			   struct brw_instruction *inst,
380			   enum brw_message_target sfid,
381			   unsigned msg_length,
382			   unsigned response_length,
383			   bool header_present,
384			   bool end_of_thread)
385{
386	brw_set_src1(p, inst, brw_imm_d(0));
387
388	if (p->gen >= 050) {
389		inst->bits3.generic_gen5.header_present = header_present;
390		inst->bits3.generic_gen5.response_length = response_length;
391		inst->bits3.generic_gen5.msg_length = msg_length;
392		inst->bits3.generic_gen5.end_of_thread = end_of_thread;
393
394		if (p->gen >= 060) {
395			/* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
396			inst->header.destreg__conditionalmod = sfid;
397		} else {
398			/* Set Extended Message Descriptor (ex_desc) */
399			inst->bits2.send_gen5.sfid = sfid;
400			inst->bits2.send_gen5.end_of_thread = end_of_thread;
401		}
402	} else {
403		inst->bits3.generic.response_length = response_length;
404		inst->bits3.generic.msg_length = msg_length;
405		inst->bits3.generic.msg_target = sfid;
406		inst->bits3.generic.end_of_thread = end_of_thread;
407	}
408}
409
410
411static void brw_set_math_message(struct brw_compile *p,
412				 struct brw_instruction *insn,
413				 unsigned function,
414				 unsigned integer_type,
415				 bool low_precision,
416				 bool saturate,
417				 unsigned dataType)
418{
419	unsigned msg_length;
420	unsigned response_length;
421
422	/* Infer message length from the function */
423	switch (function) {
424	case BRW_MATH_FUNCTION_POW:
425	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
426	case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
427	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
428		msg_length = 2;
429		break;
430	default:
431		msg_length = 1;
432		break;
433	}
434
435	/* Infer response length from the function */
436	switch (function) {
437	case BRW_MATH_FUNCTION_SINCOS:
438	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
439		response_length = 2;
440		break;
441	default:
442		response_length = 1;
443		break;
444	}
445
446	brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
447				   msg_length, response_length,
448				   false, false);
449	if (p->gen == 050) {
450		insn->bits3.math_gen5.function = function;
451		insn->bits3.math_gen5.int_type = integer_type;
452		insn->bits3.math_gen5.precision = low_precision;
453		insn->bits3.math_gen5.saturate = saturate;
454		insn->bits3.math_gen5.data_type = dataType;
455		insn->bits3.math_gen5.snapshot = 0;
456	} else {
457		insn->bits3.math.function = function;
458		insn->bits3.math.int_type = integer_type;
459		insn->bits3.math.precision = low_precision;
460		insn->bits3.math.saturate = saturate;
461		insn->bits3.math.data_type = dataType;
462	}
463}
464
465static void brw_set_ff_sync_message(struct brw_compile *p,
466				    struct brw_instruction *insn,
467				    bool allocate,
468				    unsigned response_length,
469				    bool end_of_thread)
470{
471	brw_set_message_descriptor(p, insn, BRW_SFID_URB,
472				   1, response_length,
473				   true, end_of_thread);
474	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
475	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
476	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
477	insn->bits3.urb_gen5.allocate = allocate;
478	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
479	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
480}
481
482static void brw_set_urb_message(struct brw_compile *p,
483				struct brw_instruction *insn,
484				bool allocate,
485				bool used,
486				unsigned msg_length,
487				unsigned response_length,
488				bool end_of_thread,
489				bool complete,
490				unsigned offset,
491				unsigned swizzle_control)
492{
493	brw_set_message_descriptor(p, insn, BRW_SFID_URB,
494				   msg_length, response_length, true, end_of_thread);
495	if (p->gen >= 070) {
496		insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
497		insn->bits3.urb_gen7.offset = offset;
498		assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
499		insn->bits3.urb_gen7.swizzle_control = swizzle_control;
500		/* per_slot_offset = 0 makes it ignore offsets in message header */
501		insn->bits3.urb_gen7.per_slot_offset = 0;
502		insn->bits3.urb_gen7.complete = complete;
503	} else if (p->gen >= 050) {
504		insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
505		insn->bits3.urb_gen5.offset = offset;
506		insn->bits3.urb_gen5.swizzle_control = swizzle_control;
507		insn->bits3.urb_gen5.allocate = allocate;
508		insn->bits3.urb_gen5.used = used;	/* ? */
509		insn->bits3.urb_gen5.complete = complete;
510	} else {
511		insn->bits3.urb.opcode = 0;	/* ? */
512		insn->bits3.urb.offset = offset;
513		insn->bits3.urb.swizzle_control = swizzle_control;
514		insn->bits3.urb.allocate = allocate;
515		insn->bits3.urb.used = used;	/* ? */
516		insn->bits3.urb.complete = complete;
517	}
518}
519
520void
521brw_set_dp_write_message(struct brw_compile *p,
522			 struct brw_instruction *insn,
523			 unsigned binding_table_index,
524			 unsigned msg_control,
525			 unsigned msg_type,
526			 unsigned msg_length,
527			 bool header_present,
528			 bool last_render_target,
529			 unsigned response_length,
530			 bool end_of_thread,
531			 bool send_commit_msg)
532{
533	unsigned sfid;
534
535	if (p->gen >= 070) {
536		/* Use the Render Cache for RT writes; otherwise use the Data Cache */
537		if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
538			sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
539		else
540			sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
541	} else if (p->gen >= 060) {
542		/* Use the render cache for all write messages. */
543		sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
544	} else {
545		sfid = BRW_SFID_DATAPORT_WRITE;
546	}
547
548	brw_set_message_descriptor(p, insn, sfid,
549				   msg_length, response_length,
550				   header_present, end_of_thread);
551
552	if (p->gen >= 070) {
553		insn->bits3.gen7_dp.binding_table_index = binding_table_index;
554		insn->bits3.gen7_dp.msg_control = msg_control;
555		insn->bits3.gen7_dp.last_render_target = last_render_target;
556		insn->bits3.gen7_dp.msg_type = msg_type;
557	} else if (p->gen >= 060) {
558		insn->bits3.gen6_dp.binding_table_index = binding_table_index;
559		insn->bits3.gen6_dp.msg_control = msg_control;
560		insn->bits3.gen6_dp.last_render_target = last_render_target;
561		insn->bits3.gen6_dp.msg_type = msg_type;
562		insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
563	} else if (p->gen >= 050) {
564		insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
565		insn->bits3.dp_write_gen5.msg_control = msg_control;
566		insn->bits3.dp_write_gen5.last_render_target = last_render_target;
567		insn->bits3.dp_write_gen5.msg_type = msg_type;
568		insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
569	} else {
570		insn->bits3.dp_write.binding_table_index = binding_table_index;
571		insn->bits3.dp_write.msg_control = msg_control;
572		insn->bits3.dp_write.last_render_target = last_render_target;
573		insn->bits3.dp_write.msg_type = msg_type;
574		insn->bits3.dp_write.send_commit_msg = send_commit_msg;
575	}
576}
577
578void
579brw_set_dp_read_message(struct brw_compile *p,
580			struct brw_instruction *insn,
581			unsigned binding_table_index,
582			unsigned msg_control,
583			unsigned msg_type,
584			unsigned target_cache,
585			unsigned msg_length,
586			unsigned response_length)
587{
588	unsigned sfid;
589
590	if (p->gen >= 070) {
591		sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
592	} else if (p->gen >= 060) {
593		if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
594			sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
595		else
596			sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
597	} else {
598		sfid = BRW_SFID_DATAPORT_READ;
599	}
600
601	brw_set_message_descriptor(p, insn, sfid,
602				   msg_length, response_length,
603				   true, false);
604
605	if (p->gen >= 070) {
606		insn->bits3.gen7_dp.binding_table_index = binding_table_index;
607		insn->bits3.gen7_dp.msg_control = msg_control;
608		insn->bits3.gen7_dp.last_render_target = 0;
609		insn->bits3.gen7_dp.msg_type = msg_type;
610	} else if (p->gen >= 060) {
611		insn->bits3.gen6_dp.binding_table_index = binding_table_index;
612		insn->bits3.gen6_dp.msg_control = msg_control;
613		insn->bits3.gen6_dp.last_render_target = 0;
614		insn->bits3.gen6_dp.msg_type = msg_type;
615		insn->bits3.gen6_dp.send_commit_msg = 0;
616	} else if (p->gen >= 050) {
617		insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
618		insn->bits3.dp_read_gen5.msg_control = msg_control;
619		insn->bits3.dp_read_gen5.msg_type = msg_type;
620		insn->bits3.dp_read_gen5.target_cache = target_cache;
621	} else if (p->gen >= 045) {
622		insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
623		insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
624		insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
625		insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
626	} else {
627		insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
628		insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
629		insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
630		insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
631	}
632}
633
634static void brw_set_sampler_message(struct brw_compile *p,
635                                    struct brw_instruction *insn,
636                                    unsigned binding_table_index,
637                                    unsigned sampler,
638                                    unsigned msg_type,
639                                    unsigned response_length,
640                                    unsigned msg_length,
641                                    bool header_present,
642                                    unsigned simd_mode)
643{
644	brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER,
645				   msg_length, response_length,
646				   header_present, false);
647
648	if (p->gen >= 070) {
649		insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
650		insn->bits3.sampler_gen7.sampler = sampler;
651		insn->bits3.sampler_gen7.msg_type = msg_type;
652		insn->bits3.sampler_gen7.simd_mode = simd_mode;
653	} else if (p->gen >= 050) {
654		insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
655		insn->bits3.sampler_gen5.sampler = sampler;
656		insn->bits3.sampler_gen5.msg_type = msg_type;
657		insn->bits3.sampler_gen5.simd_mode = simd_mode;
658	} else if (p->gen >= 045) {
659		insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
660		insn->bits3.sampler_g4x.sampler = sampler;
661		insn->bits3.sampler_g4x.msg_type = msg_type;
662	} else {
663		insn->bits3.sampler.binding_table_index = binding_table_index;
664		insn->bits3.sampler.sampler = sampler;
665		insn->bits3.sampler.msg_type = msg_type;
666		insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
667	}
668}
669
670
671void brw_NOP(struct brw_compile *p)
672{
673	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_NOP);
674	brw_set_dest(p, insn, __retype_ud(brw_vec4_grf(0,0)));
675	brw_set_src0(p, insn, __retype_ud(brw_vec4_grf(0,0)));
676	brw_set_src1(p, insn, brw_imm_ud(0x0));
677}
678
679/***********************************************************************
680 * Comparisons, if/else/endif
681 */
682
683static void
684push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
685{
686	p->if_stack[p->if_stack_depth] = inst;
687
688	p->if_stack_depth++;
689	if (p->if_stack_array_size <= p->if_stack_depth) {
690		p->if_stack_array_size *= 2;
691		p->if_stack = realloc(p->if_stack, sizeof(struct brw_instruction *)*p->if_stack_array_size);
692	}
693}
694
695/* EU takes the value from the flag register and pushes it onto some
696 * sort of a stack (presumably merging with any flag value already on
697 * the stack).  Within an if block, the flags at the top of the stack
698 * control execution on each channel of the unit, eg. on each of the
699 * 16 pixel values in our wm programs.
700 *
701 * When the matching 'else' instruction is reached (presumably by
702 * countdown of the instruction count patched in by our ELSE/ENDIF
703 * functions), the relevent flags are inverted.
704 *
705 * When the matching 'endif' instruction is reached, the flags are
706 * popped off.  If the stack is now empty, normal execution resumes.
707 */
708struct brw_instruction *
709brw_IF(struct brw_compile *p, unsigned execute_size)
710{
711	struct brw_instruction *insn;
712
713	insn = brw_next_insn(p, BRW_OPCODE_IF);
714
715	/* Override the defaults for this instruction: */
716	if (p->gen < 060) {
717		brw_set_dest(p, insn, brw_ip_reg());
718		brw_set_src0(p, insn, brw_ip_reg());
719		brw_set_src1(p, insn, brw_imm_d(0x0));
720	} else if (p->gen < 070) {
721		brw_set_dest(p, insn, brw_imm_w(0));
722		insn->bits1.branch_gen6.jump_count = 0;
723		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
724		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
725	} else {
726		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
727		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
728		brw_set_src1(p, insn, brw_imm_ud(0));
729		insn->bits3.break_cont.jip = 0;
730		insn->bits3.break_cont.uip = 0;
731	}
732
733	insn->header.execution_size = execute_size;
734	insn->header.compression_control = BRW_COMPRESSION_NONE;
735	insn->header.predicate_control = BRW_PREDICATE_NORMAL;
736	insn->header.mask_control = BRW_MASK_ENABLE;
737	if (!p->single_program_flow)
738		insn->header.thread_control = BRW_THREAD_SWITCH;
739
740	p->current->header.predicate_control = BRW_PREDICATE_NONE;
741
742	push_if_stack(p, insn);
743	return insn;
744}
745
746/* This function is only used for gen6-style IF instructions with an
747 * embedded comparison (conditional modifier).  It is not used on gen7.
748 */
749struct brw_instruction *
750gen6_IF(struct brw_compile *p, uint32_t conditional,
751	struct brw_reg src0, struct brw_reg src1)
752{
753	struct brw_instruction *insn;
754
755	insn = brw_next_insn(p, BRW_OPCODE_IF);
756
757	brw_set_dest(p, insn, brw_imm_w(0));
758	if (p->compressed) {
759		insn->header.execution_size = BRW_EXECUTE_16;
760	} else {
761		insn->header.execution_size = BRW_EXECUTE_8;
762	}
763	insn->bits1.branch_gen6.jump_count = 0;
764	brw_set_src0(p, insn, src0);
765	brw_set_src1(p, insn, src1);
766
767	assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
768	assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
769	insn->header.destreg__conditionalmod = conditional;
770
771	if (!p->single_program_flow)
772		insn->header.thread_control = BRW_THREAD_SWITCH;
773
774	push_if_stack(p, insn);
775	return insn;
776}
777
778/**
779 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
780 */
781static void
782convert_IF_ELSE_to_ADD(struct brw_compile *p,
783		       struct brw_instruction *if_inst,
784		       struct brw_instruction *else_inst)
785{
786	/* The next instruction (where the ENDIF would be, if it existed) */
787	struct brw_instruction *next_inst = &p->store[p->nr_insn];
788
789	assert(p->single_program_flow);
790	assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
791	assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
792	assert(if_inst->header.execution_size == BRW_EXECUTE_1);
793
794	/* Convert IF to an ADD instruction that moves the instruction pointer
795	 * to the first instruction of the ELSE block.  If there is no ELSE
796	 * block, point to where ENDIF would be.  Reverse the predicate.
797	 *
798	 * There's no need to execute an ENDIF since we don't need to do any
799	 * stack operations, and if we're currently executing, we just want to
800	 * continue normally.
801	 */
802	if_inst->header.opcode = BRW_OPCODE_ADD;
803	if_inst->header.predicate_inverse = 1;
804
805	if (else_inst != NULL) {
806		/* Convert ELSE to an ADD instruction that points where the ENDIF
807		 * would be.
808		 */
809		else_inst->header.opcode = BRW_OPCODE_ADD;
810
811		if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
812		else_inst->bits3.ud = (next_inst - else_inst) * 16;
813	} else {
814		if_inst->bits3.ud = (next_inst - if_inst) * 16;
815	}
816}
817
818/**
819 * Patch IF and ELSE instructions with appropriate jump targets.
820 */
821static void
822patch_IF_ELSE(struct brw_compile *p,
823	      struct brw_instruction *if_inst,
824	      struct brw_instruction *else_inst,
825	      struct brw_instruction *endif_inst)
826{
827	unsigned br = 1;
828
829	assert(!p->single_program_flow);
830	assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
831	assert(endif_inst != NULL);
832	assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
833
834	/* Jump count is for 64bit data chunk each, so one 128bit instruction
835	 * requires 2 chunks.
836	 */
837	if (p->gen >= 050)
838		br = 2;
839
840	assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
841	endif_inst->header.execution_size = if_inst->header.execution_size;
842
843	if (else_inst == NULL) {
844		/* Patch IF -> ENDIF */
845		if (p->gen < 060) {
846			/* Turn it into an IFF, which means no mask stack operations for
847			 * all-false and jumping past the ENDIF.
848			 */
849			if_inst->header.opcode = BRW_OPCODE_IFF;
850			if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
851			if_inst->bits3.if_else.pop_count = 0;
852			if_inst->bits3.if_else.pad0 = 0;
853		} else if (p->gen < 070) {
854			/* As of gen6, there is no IFF and IF must point to the ENDIF. */
855			if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
856		} else {
857			if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
858			if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
859		}
860	} else {
861		else_inst->header.execution_size = if_inst->header.execution_size;
862
863		/* Patch IF -> ELSE */
864		if (p->gen < 060) {
865			if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
866			if_inst->bits3.if_else.pop_count = 0;
867			if_inst->bits3.if_else.pad0 = 0;
868		} else if (p->gen <= 070) {
869			if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
870		}
871
872		/* Patch ELSE -> ENDIF */
873		if (p->gen < 060) {
874			/* BRW_OPCODE_ELSE pre-gen6 should point just past the
875			 * matching ENDIF.
876			 */
877			else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
878			else_inst->bits3.if_else.pop_count = 1;
879			else_inst->bits3.if_else.pad0 = 0;
880		} else if (p->gen < 070) {
881			/* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
882			else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
883		} else {
884			/* The IF instruction's JIP should point just past the ELSE */
885			if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
886			/* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
887			if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
888			else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
889		}
890	}
891}
892
893void
894brw_ELSE(struct brw_compile *p)
895{
896	struct brw_instruction *insn;
897
898	insn = brw_next_insn(p, BRW_OPCODE_ELSE);
899
900	if (p->gen < 060) {
901		brw_set_dest(p, insn, brw_ip_reg());
902		brw_set_src0(p, insn, brw_ip_reg());
903		brw_set_src1(p, insn, brw_imm_d(0x0));
904	} else if (p->gen < 070) {
905		brw_set_dest(p, insn, brw_imm_w(0));
906		insn->bits1.branch_gen6.jump_count = 0;
907		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
908		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
909	} else {
910		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
911		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
912		brw_set_src1(p, insn, brw_imm_ud(0));
913		insn->bits3.break_cont.jip = 0;
914		insn->bits3.break_cont.uip = 0;
915	}
916
917	insn->header.compression_control = BRW_COMPRESSION_NONE;
918	insn->header.mask_control = BRW_MASK_ENABLE;
919	if (!p->single_program_flow)
920		insn->header.thread_control = BRW_THREAD_SWITCH;
921
922	push_if_stack(p, insn);
923}
924
925void
926brw_ENDIF(struct brw_compile *p)
927{
928	struct brw_instruction *insn;
929	struct brw_instruction *else_inst = NULL;
930	struct brw_instruction *if_inst = NULL;
931
932	/* Pop the IF and (optional) ELSE instructions from the stack */
933	p->if_stack_depth--;
934	if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
935		else_inst = p->if_stack[p->if_stack_depth];
936		p->if_stack_depth--;
937	}
938	if_inst = p->if_stack[p->if_stack_depth];
939
940	if (p->single_program_flow) {
941		/* ENDIF is useless; don't bother emitting it. */
942		convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
943		return;
944	}
945
946	insn = brw_next_insn(p, BRW_OPCODE_ENDIF);
947
948	if (p->gen < 060) {
949		brw_set_dest(p, insn, __retype_ud(brw_vec4_grf(0,0)));
950		brw_set_src0(p, insn, __retype_ud(brw_vec4_grf(0,0)));
951		brw_set_src1(p, insn, brw_imm_d(0x0));
952	} else if (p->gen < 070) {
953		brw_set_dest(p, insn, brw_imm_w(0));
954		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
955		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
956	} else {
957		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
958		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
959		brw_set_src1(p, insn, brw_imm_ud(0));
960	}
961
962	insn->header.compression_control = BRW_COMPRESSION_NONE;
963	insn->header.mask_control = BRW_MASK_ENABLE;
964	insn->header.thread_control = BRW_THREAD_SWITCH;
965
966	/* Also pop item off the stack in the endif instruction: */
967	if (p->gen < 060) {
968		insn->bits3.if_else.jump_count = 0;
969		insn->bits3.if_else.pop_count = 1;
970		insn->bits3.if_else.pad0 = 0;
971	} else if (p->gen < 070) {
972		insn->bits1.branch_gen6.jump_count = 2;
973	} else {
974		insn->bits3.break_cont.jip = 2;
975	}
976	patch_IF_ELSE(p, if_inst, else_inst, insn);
977}
978
979struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
980{
981	struct brw_instruction *insn;
982
983	insn = brw_next_insn(p, BRW_OPCODE_BREAK);
984	if (p->gen >= 060) {
985		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
986		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
987		brw_set_src1(p, insn, brw_imm_d(0x0));
988	} else {
989		brw_set_dest(p, insn, brw_ip_reg());
990		brw_set_src0(p, insn, brw_ip_reg());
991		brw_set_src1(p, insn, brw_imm_d(0x0));
992		insn->bits3.if_else.pad0 = 0;
993		insn->bits3.if_else.pop_count = pop_count;
994	}
995	insn->header.compression_control = BRW_COMPRESSION_NONE;
996	insn->header.execution_size = BRW_EXECUTE_8;
997
998	return insn;
999}
1000
1001struct brw_instruction *gen6_CONT(struct brw_compile *p,
1002				  struct brw_instruction *do_insn)
1003{
1004	struct brw_instruction *insn;
1005
1006	insn = brw_next_insn(p, BRW_OPCODE_CONTINUE);
1007	brw_set_dest(p, insn, __retype_d(brw_null_reg()));
1008	brw_set_src0(p, insn, __retype_d(brw_null_reg()));
1009	brw_set_dest(p, insn, brw_ip_reg());
1010	brw_set_src0(p, insn, brw_ip_reg());
1011	brw_set_src1(p, insn, brw_imm_d(0x0));
1012
1013	insn->header.compression_control = BRW_COMPRESSION_NONE;
1014	insn->header.execution_size = BRW_EXECUTE_8;
1015	return insn;
1016}
1017
1018struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1019{
1020	struct brw_instruction *insn;
1021	insn = brw_next_insn(p, BRW_OPCODE_CONTINUE);
1022	brw_set_dest(p, insn, brw_ip_reg());
1023	brw_set_src0(p, insn, brw_ip_reg());
1024	brw_set_src1(p, insn, brw_imm_d(0x0));
1025	insn->header.compression_control = BRW_COMPRESSION_NONE;
1026	insn->header.execution_size = BRW_EXECUTE_8;
1027	/* insn->header.mask_control = BRW_MASK_DISABLE; */
1028	insn->bits3.if_else.pad0 = 0;
1029	insn->bits3.if_else.pop_count = pop_count;
1030	return insn;
1031}
1032
1033/* DO/WHILE loop:
1034 *
1035 * The DO/WHILE is just an unterminated loop -- break or continue are
1036 * used for control within the loop.  We have a few ways they can be
1037 * done.
1038 *
1039 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1040 * jip and no DO instruction.
1041 *
1042 * For non-uniform control flow pre-gen6, there's a DO instruction to
1043 * push the mask, and a WHILE to jump back, and BREAK to get out and
1044 * pop the mask.
1045 *
1046 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1047 * just points back to the first instruction of the loop.
1048 */
1049struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1050{
1051	if (p->gen >= 060 || p->single_program_flow) {
1052		return &p->store[p->nr_insn];
1053	} else {
1054		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_DO);
1055
1056		/* Override the defaults for this instruction:
1057		*/
1058		brw_set_dest(p, insn, brw_null_reg());
1059		brw_set_src0(p, insn, brw_null_reg());
1060		brw_set_src1(p, insn, brw_null_reg());
1061
1062		insn->header.compression_control = BRW_COMPRESSION_NONE;
1063		insn->header.execution_size = execute_size;
1064		insn->header.predicate_control = BRW_PREDICATE_NONE;
1065		/* insn->header.mask_control = BRW_MASK_ENABLE; */
1066		/* insn->header.mask_control = BRW_MASK_DISABLE; */
1067
1068		return insn;
1069	}
1070}
1071
1072struct brw_instruction *brw_WHILE(struct brw_compile *p,
1073                                  struct brw_instruction *do_insn)
1074{
1075	struct brw_instruction *insn;
1076	unsigned br = 1;
1077
1078	if (p->gen >= 050)
1079		br = 2;
1080
1081	if (p->gen >= 070) {
1082		insn = brw_next_insn(p, BRW_OPCODE_WHILE);
1083
1084		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
1085		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
1086		brw_set_src1(p, insn, brw_imm_ud(0));
1087		insn->bits3.break_cont.jip = br * (do_insn - insn);
1088
1089		insn->header.execution_size = BRW_EXECUTE_8;
1090	} else if (p->gen >= 060) {
1091		insn = brw_next_insn(p, BRW_OPCODE_WHILE);
1092
1093		brw_set_dest(p, insn, brw_imm_w(0));
1094		insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1095		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
1096		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
1097
1098		insn->header.execution_size = BRW_EXECUTE_8;
1099	} else {
1100		if (p->single_program_flow) {
1101			insn = brw_next_insn(p, BRW_OPCODE_ADD);
1102
1103			brw_set_dest(p, insn, brw_ip_reg());
1104			brw_set_src0(p, insn, brw_ip_reg());
1105			brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1106			insn->header.execution_size = BRW_EXECUTE_1;
1107		} else {
1108			insn = brw_next_insn(p, BRW_OPCODE_WHILE);
1109
1110			assert(do_insn->header.opcode == BRW_OPCODE_DO);
1111
1112			brw_set_dest(p, insn, brw_ip_reg());
1113			brw_set_src0(p, insn, brw_ip_reg());
1114			brw_set_src1(p, insn, brw_imm_d(0));
1115
1116			insn->header.execution_size = do_insn->header.execution_size;
1117			insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1118			insn->bits3.if_else.pop_count = 0;
1119			insn->bits3.if_else.pad0 = 0;
1120		}
1121	}
1122	insn->header.compression_control = BRW_COMPRESSION_NONE;
1123	p->current->header.predicate_control = BRW_PREDICATE_NONE;
1124
1125	return insn;
1126}
1127
1128/* FORWARD JUMPS:
1129 */
1130void brw_land_fwd_jump(struct brw_compile *p,
1131		       struct brw_instruction *jmp_insn)
1132{
1133	struct brw_instruction *landing = &p->store[p->nr_insn];
1134	unsigned jmpi = 1;
1135
1136	if (p->gen >= 050)
1137		jmpi = 2;
1138
1139	assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1140	assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1141
1142	jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1143}
1144
1145
1146
1147/* To integrate with the above, it makes sense that the comparison
1148 * instruction should populate the flag register.  It might be simpler
1149 * just to use the flag reg for most WM tasks?
1150 */
1151void brw_CMP(struct brw_compile *p,
1152	     struct brw_reg dest,
1153	     unsigned conditional,
1154	     struct brw_reg src0,
1155	     struct brw_reg src1)
1156{
1157	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_CMP);
1158
1159	insn->header.destreg__conditionalmod = conditional;
1160	brw_set_dest(p, insn, dest);
1161	brw_set_src0(p, insn, src0);
1162	brw_set_src1(p, insn, src1);
1163
1164	/* Make it so that future instructions will use the computed flag
1165	 * value until brw_set_predicate_control_flag_value() is called
1166	 * again.
1167	 */
1168	if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1169	    dest.nr == 0) {
1170		p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1171		p->flag_value = 0xff;
1172	}
1173}
1174
1175/* Issue 'wait' instruction for n1, host could program MMIO
1176   to wake up thread. */
1177void brw_WAIT(struct brw_compile *p)
1178{
1179	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_WAIT);
1180	struct brw_reg src = brw_notification_1_reg();
1181
1182	brw_set_dest(p, insn, src);
1183	brw_set_src0(p, insn, src);
1184	brw_set_src1(p, insn, brw_null_reg());
1185	insn->header.execution_size = 0; /* must */
1186	insn->header.predicate_control = 0;
1187	insn->header.compression_control = 0;
1188}
1189
1190/***********************************************************************
1191 * Helpers for the various SEND message types:
1192 */
1193
1194/** Extended math function, float[8].
1195 */
1196void brw_math(struct brw_compile *p,
1197	      struct brw_reg dest,
1198	      unsigned function,
1199	      unsigned saturate,
1200	      unsigned msg_reg_nr,
1201	      struct brw_reg src,
1202	      unsigned data_type,
1203	      unsigned precision)
1204{
1205	if (p->gen >= 060) {
1206		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_MATH);
1207
1208		assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1209		assert(src.file == BRW_GENERAL_REGISTER_FILE);
1210
1211		assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1212		assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1213
1214		/* Source modifiers are ignored for extended math instructions. */
1215		assert(!src.negate);
1216		assert(!src.abs);
1217
1218		if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1219		    function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1220			assert(src.type == BRW_REGISTER_TYPE_F);
1221		}
1222
1223		/* Math is the same ISA format as other opcodes, except that CondModifier
1224		 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1225		 */
1226		insn->header.destreg__conditionalmod = function;
1227		insn->header.saturate = saturate;
1228
1229		brw_set_dest(p, insn, dest);
1230		brw_set_src0(p, insn, src);
1231		brw_set_src1(p, insn, brw_null_reg());
1232	} else {
1233		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
1234		/* Example code doesn't set predicate_control for send
1235		 * instructions.
1236		 */
1237		insn->header.predicate_control = 0;
1238		insn->header.destreg__conditionalmod = msg_reg_nr;
1239
1240		brw_set_dest(p, insn, dest);
1241		brw_set_src0(p, insn, src);
1242		brw_set_math_message(p, insn, function,
1243				     src.type == BRW_REGISTER_TYPE_D,
1244				     precision,
1245				     saturate,
1246				     data_type);
1247	}
1248}
1249
1250/** Extended math function, float[8].
1251 */
1252void brw_math2(struct brw_compile *p,
1253	       struct brw_reg dest,
1254	       unsigned function,
1255	       struct brw_reg src0,
1256	       struct brw_reg src1)
1257{
1258	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_MATH);
1259
1260	assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1261	assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1262	assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1263
1264	assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1265	assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1266	assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1267
1268	if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1269	    function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1270		assert(src0.type == BRW_REGISTER_TYPE_F);
1271		assert(src1.type == BRW_REGISTER_TYPE_F);
1272	}
1273
1274	/* Source modifiers are ignored for extended math instructions. */
1275	assert(!src0.negate);
1276	assert(!src0.abs);
1277	assert(!src1.negate);
1278	assert(!src1.abs);
1279
1280	/* Math is the same ISA format as other opcodes, except that CondModifier
1281	 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1282	 */
1283	insn->header.destreg__conditionalmod = function;
1284
1285	brw_set_dest(p, insn, dest);
1286	brw_set_src0(p, insn, src0);
1287	brw_set_src1(p, insn, src1);
1288}
1289
1290/**
1291 * Extended math function, float[16].
1292 * Use 2 send instructions.
1293 */
1294void brw_math_16(struct brw_compile *p,
1295		 struct brw_reg dest,
1296		 unsigned function,
1297		 unsigned saturate,
1298		 unsigned msg_reg_nr,
1299		 struct brw_reg src,
1300		 unsigned precision)
1301{
1302	struct brw_instruction *insn;
1303
1304	if (p->gen >= 060) {
1305		insn = brw_next_insn(p, BRW_OPCODE_MATH);
1306
1307		/* Math is the same ISA format as other opcodes, except that CondModifier
1308		 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1309		 */
1310		insn->header.destreg__conditionalmod = function;
1311		insn->header.saturate = saturate;
1312
1313		/* Source modifiers are ignored for extended math instructions. */
1314		assert(!src.negate);
1315		assert(!src.abs);
1316
1317		brw_set_dest(p, insn, dest);
1318		brw_set_src0(p, insn, src);
1319		brw_set_src1(p, insn, brw_null_reg());
1320		return;
1321	}
1322
1323	/* First instruction:
1324	*/
1325	brw_push_insn_state(p);
1326	brw_set_predicate_control_flag_value(p, 0xff);
1327	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1328
1329	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1330	insn->header.destreg__conditionalmod = msg_reg_nr;
1331
1332	brw_set_dest(p, insn, dest);
1333	brw_set_src0(p, insn, src);
1334	brw_set_math_message(p, insn, function,
1335			     BRW_MATH_INTEGER_UNSIGNED,
1336			     precision,
1337			     saturate,
1338			     BRW_MATH_DATA_VECTOR);
1339
1340	/* Second instruction:
1341	*/
1342	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1343	insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1344	insn->header.destreg__conditionalmod = msg_reg_nr+1;
1345
1346	brw_set_dest(p, insn, __offset(dest,1));
1347	brw_set_src0(p, insn, src);
1348	brw_set_math_message(p, insn, function,
1349			     BRW_MATH_INTEGER_UNSIGNED,
1350			     precision,
1351			     saturate,
1352			     BRW_MATH_DATA_VECTOR);
1353
1354	brw_pop_insn_state(p);
1355}
1356
1357/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1359 * using a constant offset per channel.
1360 *
1361 * The offset must be aligned to oword size (16 bytes).  Used for
1362 * register spilling.
1363 */
1364void brw_oword_block_write_scratch(struct brw_compile *p,
1365				   struct brw_reg mrf,
1366				   int num_regs,
1367				   unsigned offset)
1368{
1369	uint32_t msg_control, msg_type;
1370	int mlen;
1371
1372	if (p->gen >= 060)
1373		offset /= 16;
1374
1375	mrf = __retype_ud(mrf);
1376
1377	if (num_regs == 1) {
1378		msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1379		mlen = 2;
1380	} else {
1381		msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1382		mlen = 3;
1383	}
1384
1385	/* Set up the message header.  This is g0, with g0.2 filled with
1386	 * the offset.  We don't want to leave our offset around in g0 or
1387	 * it'll screw up texture samples, so set it up inside the message
1388	 * reg.
1389	 */
1390	{
1391		brw_push_insn_state(p);
1392		brw_set_mask_control(p, BRW_MASK_DISABLE);
1393		brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1394
1395		brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));
1396
1397		/* set message header global offset field (reg 0, element 2) */
1398		brw_MOV(p,
1399			__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
1400			brw_imm_ud(offset));
1401
1402		brw_pop_insn_state(p);
1403	}
1404
1405	{
1406		struct brw_reg dest;
1407		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
1408		int send_commit_msg;
1409		struct brw_reg src_header = __retype_uw(brw_vec8_grf(0, 0));
1410
1411		if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1412			insn->header.compression_control = BRW_COMPRESSION_NONE;
1413			src_header = vec16(src_header);
1414		}
1415		assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1416		insn->header.destreg__conditionalmod = mrf.nr;
1417
1418		/* Until gen6, writes followed by reads from the same location
1419		 * are not guaranteed to be ordered unless write_commit is set.
1420		 * If set, then a no-op write is issued to the destination
1421		 * register to set a dependency, and a read from the destination
1422		 * can be used to ensure the ordering.
1423		 *
1424		 * For gen6, only writes between different threads need ordering
1425		 * protection.  Our use of DP writes is all about register
1426		 * spilling within a thread.
1427		 */
1428		if (p->gen >= 060) {
1429			dest = __retype_uw(vec16(brw_null_reg()));
1430			send_commit_msg = 0;
1431		} else {
1432			dest = src_header;
1433			send_commit_msg = 1;
1434		}
1435
1436		brw_set_dest(p, insn, dest);
1437		if (p->gen >= 060) {
1438			brw_set_src0(p, insn, mrf);
1439		} else {
1440			brw_set_src0(p, insn, brw_null_reg());
1441		}
1442
1443		if (p->gen >= 060)
1444			msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1445		else
1446			msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1447
1448		brw_set_dp_write_message(p,
1449					 insn,
1450					 255, /* binding table index (255=stateless) */
1451					 msg_control,
1452					 msg_type,
1453					 mlen,
1454					 true, /* header_present */
1455					 0, /* pixel scoreboard */
1456					 send_commit_msg, /* response_length */
1457					 0, /* eot */
1458					 send_commit_msg);
1459	}
1460}
1461
1462
1463/**
1464 * Read a block of owords (half a GRF each) from the scratch buffer
1465 * using a constant index per channel.
1466 *
1467 * Offset must be aligned to oword size (16 bytes).  Used for register
1468 * spilling.
1469 */
1470void
1471brw_oword_block_read_scratch(struct brw_compile *p,
1472			     struct brw_reg dest,
1473			     struct brw_reg mrf,
1474			     int num_regs,
1475			     unsigned offset)
1476{
1477	uint32_t msg_control;
1478	int rlen;
1479
1480	if (p->gen >= 060)
1481		offset /= 16;
1482
1483	mrf = __retype_ud(mrf);
1484	dest = __retype_uw(dest);
1485
1486	if (num_regs == 1) {
1487		msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1488		rlen = 1;
1489	} else {
1490		msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1491		rlen = 2;
1492	}
1493
1494	{
1495		brw_push_insn_state(p);
1496		brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1497		brw_set_mask_control(p, BRW_MASK_DISABLE);
1498
1499		brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));
1500
1501		/* set message header global offset field (reg 0, element 2) */
1502		brw_MOV(p,
1503			__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
1504			brw_imm_ud(offset));
1505
1506		brw_pop_insn_state(p);
1507	}
1508
1509	{
1510		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
1511
1512		assert(insn->header.predicate_control == 0);
1513		insn->header.compression_control = BRW_COMPRESSION_NONE;
1514		insn->header.destreg__conditionalmod = mrf.nr;
1515
1516		brw_set_dest(p, insn, dest); /* UW? */
1517		if (p->gen >= 060) {
1518			brw_set_src0(p, insn, mrf);
1519		} else {
1520			brw_set_src0(p, insn, brw_null_reg());
1521		}
1522
1523		brw_set_dp_read_message(p,
1524					insn,
1525					255, /* binding table index (255=stateless) */
1526					msg_control,
1527					BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1528					BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1529					1, /* msg_length */
1530					rlen);
1531	}
1532}
1533
1534/**
1535 * Read a float[4] vector from the data port Data Cache (const buffer).
1536 * Location (in buffer) should be a multiple of 16.
1537 * Used for fetching shader constants.
1538 */
1539void brw_oword_block_read(struct brw_compile *p,
1540			  struct brw_reg dest,
1541			  struct brw_reg mrf,
1542			  uint32_t offset,
1543			  uint32_t bind_table_index)
1544{
1545	struct brw_instruction *insn;
1546
1547	/* On newer hardware, offset is in units of owords. */
1548	if (p->gen >= 060)
1549		offset /= 16;
1550
1551	mrf = __retype_ud(mrf);
1552
1553	brw_push_insn_state(p);
1554	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1555	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1556	brw_set_mask_control(p, BRW_MASK_DISABLE);
1557
1558	brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));
1559
1560	/* set message header global offset field (reg 0, element 2) */
1561	brw_MOV(p,
1562		__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
1563		brw_imm_ud(offset));
1564
1565	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1566	insn->header.destreg__conditionalmod = mrf.nr;
1567
1568	/* cast dest to a uword[8] vector */
1569	dest = __retype_uw(vec8(dest));
1570
1571	brw_set_dest(p, insn, dest);
1572	if (p->gen >= 060) {
1573		brw_set_src0(p, insn, mrf);
1574	} else {
1575		brw_set_src0(p, insn, brw_null_reg());
1576	}
1577
1578	brw_set_dp_read_message(p,
1579				insn,
1580				bind_table_index,
1581				BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1582				BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1583				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1584				1, /* msg_length */
1585				1); /* response_length (1 reg, 2 owords!) */
1586
1587	brw_pop_insn_state(p);
1588}
1589
1590/**
1591 * Read a set of dwords from the data port Data Cache (const buffer).
1592 *
1593 * Location (in buffer) appears as UD offsets in the register after
1594 * the provided mrf header reg.
1595 */
1596void brw_dword_scattered_read(struct brw_compile *p,
1597			      struct brw_reg dest,
1598			      struct brw_reg mrf,
1599			      uint32_t bind_table_index)
1600{
1601	struct brw_instruction *insn;
1602
1603	mrf = __retype_ud(mrf);
1604
1605	brw_push_insn_state(p);
1606	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1607	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1608	brw_set_mask_control(p, BRW_MASK_DISABLE);
1609	brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));
1610	brw_pop_insn_state(p);
1611
1612	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1613	insn->header.destreg__conditionalmod = mrf.nr;
1614
1615	/* cast dest to a uword[8] vector */
1616	dest = __retype_uw(vec8(dest));
1617
1618	brw_set_dest(p, insn, dest);
1619	brw_set_src0(p, insn, brw_null_reg());
1620
1621	brw_set_dp_read_message(p,
1622				insn,
1623				bind_table_index,
1624				BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1625				BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1626				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1627				2, /* msg_length */
1628				1); /* response_length */
1629}
1630
1631/**
1632 * Read float[4] constant(s) from VS constant buffer.
1633 * For relative addressing, two float[4] constants will be read into 'dest'.
1634 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1635 */
1636void brw_dp_READ_4_vs(struct brw_compile *p,
1637                      struct brw_reg dest,
1638                      unsigned location,
1639                      unsigned bind_table_index)
1640{
1641	struct brw_instruction *insn;
1642	unsigned msg_reg_nr = 1;
1643
1644	if (p->gen >= 060)
1645		location /= 16;
1646
1647	/* Setup MRF[1] with location/offset into const buffer */
1648	brw_push_insn_state(p);
1649	brw_set_access_mode(p, BRW_ALIGN_1);
1650	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1651	brw_set_mask_control(p, BRW_MASK_DISABLE);
1652	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1653	brw_MOV(p, __retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2)),
1654		brw_imm_ud(location));
1655	brw_pop_insn_state(p);
1656
1657	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1658
1659	insn->header.predicate_control = BRW_PREDICATE_NONE;
1660	insn->header.compression_control = BRW_COMPRESSION_NONE;
1661	insn->header.destreg__conditionalmod = msg_reg_nr;
1662	insn->header.mask_control = BRW_MASK_DISABLE;
1663
1664	brw_set_dest(p, insn, dest);
1665	if (p->gen >= 060) {
1666		brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1667	} else {
1668		brw_set_src0(p, insn, brw_null_reg());
1669	}
1670
1671	brw_set_dp_read_message(p,
1672				insn,
1673				bind_table_index,
1674				0,
1675				BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1676				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1677				1, /* msg_length */
1678				1); /* response_length (1 Oword) */
1679}
1680
1681/**
1682 * Read a float[4] constant per vertex from VS constant buffer, with
1683 * relative addressing.
1684 */
1685void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1686			       struct brw_reg dest,
1687			       struct brw_reg addr_reg,
1688			       unsigned offset,
1689			       unsigned bind_table_index)
1690{
1691	struct brw_reg src = brw_vec8_grf(0, 0);
1692	struct brw_instruction *insn;
1693	int msg_type;
1694
1695	/* Setup MRF[1] with offset into const buffer */
1696	brw_push_insn_state(p);
1697	brw_set_access_mode(p, BRW_ALIGN_1);
1698	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1699	brw_set_mask_control(p, BRW_MASK_DISABLE);
1700	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1701
1702	/* M1.0 is block offset 0, M1.4 is block offset 1, all other
1703	 * fields ignored.
1704	 */
1705	brw_ADD(p, __retype_d(brw_message_reg(1)),
1706		addr_reg, brw_imm_d(offset));
1707	brw_pop_insn_state(p);
1708
1709	gen6_resolve_implied_move(p, &src, 0);
1710
1711	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1712	insn->header.predicate_control = BRW_PREDICATE_NONE;
1713	insn->header.compression_control = BRW_COMPRESSION_NONE;
1714	insn->header.destreg__conditionalmod = 0;
1715	insn->header.mask_control = BRW_MASK_DISABLE;
1716
1717	brw_set_dest(p, insn, dest);
1718	brw_set_src0(p, insn, src);
1719
1720	if (p->gen >= 060)
1721		msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1722	else if (p->gen >= 045)
1723		msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1724	else
1725		msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1726
1727	brw_set_dp_read_message(p,
1728				insn,
1729				bind_table_index,
1730				BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1731				msg_type,
1732				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1733				2, /* msg_length */
1734				1); /* response_length */
1735}
1736
1737void brw_fb_WRITE(struct brw_compile *p,
1738		  int dispatch_width,
1739                  unsigned msg_reg_nr,
1740                  struct brw_reg src0,
1741                  unsigned msg_control,
1742                  unsigned binding_table_index,
1743                  unsigned msg_length,
1744                  unsigned response_length,
1745                  bool eot,
1746                  bool header_present)
1747{
1748	struct brw_instruction *insn;
1749	unsigned msg_type;
1750	struct brw_reg dest;
1751
1752	if (dispatch_width == 16)
1753		dest = __retype_uw(vec16(brw_null_reg()));
1754	else
1755		dest = __retype_uw(vec8(brw_null_reg()));
1756
1757	if (p->gen >= 060 && binding_table_index == 0) {
1758		insn = brw_next_insn(p, BRW_OPCODE_SENDC);
1759	} else {
1760		insn = brw_next_insn(p, BRW_OPCODE_SEND);
1761	}
1762	/* The execution mask is ignored for render target writes. */
1763	insn->header.predicate_control = 0;
1764	insn->header.compression_control = BRW_COMPRESSION_NONE;
1765
1766	if (p->gen >= 060) {
1767		/* headerless version, just submit color payload */
1768		src0 = brw_message_reg(msg_reg_nr);
1769
1770		msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1771	} else {
1772		insn->header.destreg__conditionalmod = msg_reg_nr;
1773
1774		msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1775	}
1776
1777	brw_set_dest(p, insn, dest);
1778	brw_set_src0(p, insn, src0);
1779	brw_set_dp_write_message(p,
1780				 insn,
1781				 binding_table_index,
1782				 msg_control,
1783				 msg_type,
1784				 msg_length,
1785				 header_present,
1786				 eot,
1787				 response_length,
1788				 eot,
1789				 0 /* send_commit_msg */);
1790}
1791
1792/**
1793 * Texture sample instruction.
1794 * Note: the msg_type plus msg_length values determine exactly what kind
1795 * of sampling operation is performed.  See volume 4, page 161 of docs.
1796 */
1797void brw_SAMPLE(struct brw_compile *p,
1798		struct brw_reg dest,
1799		unsigned msg_reg_nr,
1800		struct brw_reg src0,
1801		unsigned binding_table_index,
1802		unsigned sampler,
1803		unsigned writemask,
1804		unsigned msg_type,
1805		unsigned response_length,
1806		unsigned msg_length,
1807		bool header_present,
1808		unsigned simd_mode)
1809{
1810	assert(writemask);
1811
1812	if (p->gen < 050 || writemask != WRITEMASK_XYZW) {
1813		struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1814
1815		writemask = ~writemask & WRITEMASK_XYZW;
1816
1817		brw_push_insn_state(p);
1818
1819		brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1820		brw_set_mask_control(p, BRW_MASK_DISABLE);
1821
1822		brw_MOV(p, __retype_ud(m1), __retype_ud(brw_vec8_grf(0,0)));
1823		brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(writemask << 12));
1824
1825		brw_pop_insn_state(p);
1826
1827		src0 = __retype_uw(brw_null_reg());
1828	}
1829
1830	{
1831		struct brw_instruction *insn;
1832
1833		gen6_resolve_implied_move(p, &src0, msg_reg_nr);
1834
1835		insn = brw_next_insn(p, BRW_OPCODE_SEND);
1836		insn->header.predicate_control = 0; /* XXX */
1837		insn->header.compression_control = BRW_COMPRESSION_NONE;
1838		if (p->gen < 060)
1839			insn->header.destreg__conditionalmod = msg_reg_nr;
1840
1841		brw_set_dest(p, insn, dest);
1842		brw_set_src0(p, insn, src0);
1843		brw_set_sampler_message(p, insn,
1844					binding_table_index,
1845					sampler,
1846					msg_type,
1847					response_length,
1848					msg_length,
1849					header_present,
1850					simd_mode);
1851	}
1852}
1853
1854/* All these variables are pretty confusing - we might be better off
1855 * using bitmasks and macros for this, in the old style.  Or perhaps
1856 * just having the caller instantiate the fields in dword3 itself.
1857 */
1858void brw_urb_WRITE(struct brw_compile *p,
1859		   struct brw_reg dest,
1860		   unsigned msg_reg_nr,
1861		   struct brw_reg src0,
1862		   bool allocate,
1863		   bool used,
1864		   unsigned msg_length,
1865		   unsigned response_length,
1866		   bool eot,
1867		   bool writes_complete,
1868		   unsigned offset,
1869		   unsigned swizzle)
1870{
1871	struct brw_instruction *insn;
1872
1873	gen6_resolve_implied_move(p, &src0, msg_reg_nr);
1874
1875	if (p->gen >= 070) {
1876		/* Enable Channel Masks in the URB_WRITE_HWORD message header */
1877		brw_push_insn_state(p);
1878		brw_set_access_mode(p, BRW_ALIGN_1);
1879		brw_OR(p, __retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5)),
1880		       __retype_ud(brw_vec1_grf(0, 5)),
1881		       brw_imm_ud(0xff00));
1882		brw_pop_insn_state(p);
1883	}
1884
1885	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1886
1887	assert(msg_length < BRW_MAX_MRF);
1888
1889	brw_set_dest(p, insn, dest);
1890	brw_set_src0(p, insn, src0);
1891	brw_set_src1(p, insn, brw_imm_d(0));
1892
1893	if (p->gen <= 060)
1894		insn->header.destreg__conditionalmod = msg_reg_nr;
1895
1896	brw_set_urb_message(p,
1897			    insn,
1898			    allocate,
1899			    used,
1900			    msg_length,
1901			    response_length,
1902			    eot,
1903			    writes_complete,
1904			    offset,
1905			    swizzle);
1906}
1907
1908static int
1909brw_find_next_block_end(struct brw_compile *p, int start)
1910{
1911	int ip;
1912
1913	for (ip = start + 1; ip < p->nr_insn; ip++) {
1914		struct brw_instruction *insn = &p->store[ip];
1915
1916		switch (insn->header.opcode) {
1917		case BRW_OPCODE_ENDIF:
1918		case BRW_OPCODE_ELSE:
1919		case BRW_OPCODE_WHILE:
1920			return ip;
1921		}
1922	}
1923	assert(!"not reached");
1924	return start + 1;
1925}
1926
1927/* There is no DO instruction on gen6, so to find the end of the loop
1928 * we have to see if the loop is jumping back before our start
1929 * instruction.
1930 */
1931static int
1932brw_find_loop_end(struct brw_compile *p, int start)
1933{
1934	int ip;
1935	int br = 2;
1936
1937	for (ip = start + 1; ip < p->nr_insn; ip++) {
1938		struct brw_instruction *insn = &p->store[ip];
1939
1940		if (insn->header.opcode == BRW_OPCODE_WHILE) {
1941			int jip = p->gen <= 070 ? insn->bits1.branch_gen6.jump_count
1942				: insn->bits3.break_cont.jip;
1943			if (ip + jip / br <= start)
1944				return ip;
1945		}
1946	}
1947	assert(!"not reached");
1948	return start + 1;
1949}
1950
1951/* After program generation, go back and update the UIP and JIP of
1952 * BREAK and CONT instructions to their correct locations.
1953 */
1954void
1955brw_set_uip_jip(struct brw_compile *p)
1956{
1957	int ip;
1958	int br = 2;
1959
1960	if (p->gen <= 060)
1961		return;
1962
1963	for (ip = 0; ip < p->nr_insn; ip++) {
1964		struct brw_instruction *insn = &p->store[ip];
1965
1966		switch (insn->header.opcode) {
1967		case BRW_OPCODE_BREAK:
1968			insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
1969			/* Gen7 UIP points to WHILE; Gen6 points just after it */
1970			insn->bits3.break_cont.uip =
1971				br * (brw_find_loop_end(p, ip) - ip + (p->gen <= 070 ? 1 : 0));
1972			break;
1973		case BRW_OPCODE_CONTINUE:
1974			insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
1975			insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
1976
1977			assert(insn->bits3.break_cont.uip != 0);
1978			assert(insn->bits3.break_cont.jip != 0);
1979			break;
1980		}
1981	}
1982}
1983
1984void brw_ff_sync(struct brw_compile *p,
1985		   struct brw_reg dest,
1986		   unsigned msg_reg_nr,
1987		   struct brw_reg src0,
1988		   bool allocate,
1989		   unsigned response_length,
1990		   bool eot)
1991{
1992	struct brw_instruction *insn;
1993
1994	gen6_resolve_implied_move(p, &src0, msg_reg_nr);
1995
1996	insn = brw_next_insn(p, BRW_OPCODE_SEND);
1997	brw_set_dest(p, insn, dest);
1998	brw_set_src0(p, insn, src0);
1999	brw_set_src1(p, insn, brw_imm_d(0));
2000
2001	if (p->gen < 060)
2002		insn->header.destreg__conditionalmod = msg_reg_nr;
2003
2004	brw_set_ff_sync_message(p,
2005				insn,
2006				allocate,
2007				response_length,
2008				eot);
2009}
2010