1/*
2 * Copyright © 2006,2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Wang Zhenyu <zhenyu.z.wang@intel.com>
25 *    Eric Anholt <eric@anholt.net>
26 *    Chris Wilson <chris@chris-wilson.co.uk>
27 *
28 */
29
30#ifdef HAVE_CONFIG_H
31#include "config.h"
32#endif
33
34#include "sna.h"
35#include "sna_reg.h"
36#include "sna_render.h"
37#include "sna_render_inline.h"
38
39#include "gen2_render.h"
40
41#define NO_COMPOSITE 0
42#define NO_COMPOSITE_SPANS 0
43#define NO_COPY 0
44#define NO_COPY_BOXES 0
45#define NO_FILL 0
46#define NO_FILL_ONE 0
47#define NO_FILL_BOXES 0
48
49#define MAX_3D_SIZE 2048
50#define MAX_3D_PITCH 8192
51
52#define BATCH(v) batch_emit(sna, v)
53#define BATCH_F(v) batch_emit_float(sna, v)
54#define VERTEX(v) batch_emit_float(sna, v)
55
56static const struct blendinfo {
57	bool dst_alpha;
58	bool src_alpha;
59	uint32_t src_blend;
60	uint32_t dst_blend;
61} gen2_blend_op[] = {
62	/* Clear */
63	{0, 0, BLENDFACTOR_ZERO, BLENDFACTOR_ZERO},
64	/* Src */
65	{0, 0, BLENDFACTOR_ONE, BLENDFACTOR_ZERO},
66	/* Dst */
67	{0, 0, BLENDFACTOR_ZERO, BLENDFACTOR_ONE},
68	/* Over */
69	{0, 1, BLENDFACTOR_ONE, BLENDFACTOR_INV_SRC_ALPHA},
70	/* OverReverse */
71	{1, 0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ONE},
72	/* In */
73	{1, 0, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_ZERO},
74	/* InReverse */
75	{0, 1, BLENDFACTOR_ZERO, BLENDFACTOR_SRC_ALPHA},
76	/* Out */
77	{1, 0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ZERO},
78	/* OutReverse */
79	{0, 1, BLENDFACTOR_ZERO, BLENDFACTOR_INV_SRC_ALPHA},
80	/* Atop */
81	{1, 1, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
82	/* AtopReverse */
83	{1, 1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_SRC_ALPHA},
84	/* Xor */
85	{1, 1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
86	/* Add */
87	{0, 0, BLENDFACTOR_ONE, BLENDFACTOR_ONE},
88};
89
90static const struct formatinfo {
91	unsigned int fmt;
92	uint32_t card_fmt;
93} i8xx_tex_formats[] = {
94	{PICT_a8, MAPSURF_8BIT | MT_8BIT_A8},
95	{PICT_a8r8g8b8, MAPSURF_32BIT | MT_32BIT_ARGB8888},
96	{PICT_a8b8g8r8, MAPSURF_32BIT | MT_32BIT_ABGR8888},
97	{PICT_r5g6b5, MAPSURF_16BIT | MT_16BIT_RGB565},
98	{PICT_a1r5g5b5, MAPSURF_16BIT | MT_16BIT_ARGB1555},
99	{PICT_a4r4g4b4, MAPSURF_16BIT | MT_16BIT_ARGB4444},
100}, i85x_tex_formats[] = {
101	{PICT_x8r8g8b8, MAPSURF_32BIT | MT_32BIT_XRGB8888},
102	{PICT_x8b8g8r8, MAPSURF_32BIT | MT_32BIT_XBGR8888},
103};
104
105static inline bool
106too_large(int width, int height)
107{
108	return width > MAX_3D_SIZE || height > MAX_3D_SIZE;
109}
110
111static inline uint32_t
112gen2_buf_tiling(uint32_t tiling)
113{
114	uint32_t v = 0;
115	switch (tiling) {
116	default: assert(0);
117	case I915_TILING_Y: v |= BUF_3D_TILE_WALK_Y;
118	case I915_TILING_X: v |= BUF_3D_TILED_SURFACE;
119	case I915_TILING_NONE: break;
120	}
121	return v;
122}
123
124static uint32_t
125gen2_get_dst_format(uint32_t format)
126{
127#define BIAS DSTORG_HORT_BIAS(0x8) | DSTORG_VERT_BIAS(0x8)
128	switch (format) {
129	default:
130		assert(0);
131	case PICT_a8r8g8b8:
132	case PICT_x8r8g8b8:
133		return COLR_BUF_ARGB8888 | BIAS;
134	case PICT_r5g6b5:
135		return COLR_BUF_RGB565 | BIAS;
136	case PICT_a1r5g5b5:
137	case PICT_x1r5g5b5:
138		return COLR_BUF_ARGB1555 | BIAS;
139	case PICT_a8:
140		return COLR_BUF_8BIT | BIAS;
141	case PICT_a4r4g4b4:
142	case PICT_x4r4g4b4:
143		return COLR_BUF_ARGB4444 | BIAS;
144	}
145#undef BIAS
146}
147
148static bool
149gen2_check_dst_format(uint32_t format)
150{
151	switch (format) {
152	case PICT_a8r8g8b8:
153	case PICT_x8r8g8b8:
154	case PICT_r5g6b5:
155	case PICT_a1r5g5b5:
156	case PICT_x1r5g5b5:
157	case PICT_a8:
158	case PICT_a4r4g4b4:
159	case PICT_x4r4g4b4:
160		return true;
161	default:
162		return false;
163	}
164}
165
166static uint32_t
167gen2_get_card_format(struct sna *sna, uint32_t format)
168{
169	unsigned int i;
170
171	for (i = 0; i < ARRAY_SIZE(i8xx_tex_formats); i++)
172		if (i8xx_tex_formats[i].fmt == format)
173			return i8xx_tex_formats[i].card_fmt;
174
175	if (sna->kgem.gen < 021) {
176		/* Whilst these are not directly supported on 830/845,
177		 * we only enable them when we can implicitly convert
178		 * them to a supported variant through the texture
179		 * combiners.
180		 */
181		for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++)
182			if (i85x_tex_formats[i].fmt == format)
183				return i8xx_tex_formats[1+i].card_fmt;
184	} else {
185		for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++)
186			if (i85x_tex_formats[i].fmt == format)
187				return i85x_tex_formats[i].card_fmt;
188	}
189
190	assert(0);
191	return 0;
192}
193
194static uint32_t
195gen2_check_format(struct sna *sna, PicturePtr p)
196{
197	unsigned int i;
198
199	for (i = 0; i < ARRAY_SIZE(i8xx_tex_formats); i++)
200		if (i8xx_tex_formats[i].fmt == p->format)
201			return true;
202
203	if (sna->kgem.gen > 021) {
204		for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++)
205			if (i85x_tex_formats[i].fmt == p->format)
206				return true;
207	}
208
209	return false;
210}
211
212static uint32_t
213gen2_sampler_tiling_bits(uint32_t tiling)
214{
215	uint32_t bits = 0;
216	switch (tiling) {
217	default:
218		assert(0);
219	case I915_TILING_Y:
220		bits |= TM0S1_TILE_WALK;
221	case I915_TILING_X:
222		bits |= TM0S1_TILED_SURFACE;
223	case I915_TILING_NONE:
224		break;
225	}
226	return bits;
227}
228
229static bool
230gen2_check_filter(PicturePtr picture)
231{
232	switch (picture->filter) {
233	case PictFilterNearest:
234	case PictFilterBilinear:
235		return true;
236	default:
237		return false;
238	}
239}
240
241static bool
242gen2_check_repeat(PicturePtr picture)
243{
244	if (!picture->repeat)
245		return true;
246
247	switch (picture->repeatType) {
248	case RepeatNone:
249	case RepeatNormal:
250	case RepeatPad:
251	case RepeatReflect:
252		return true;
253	default:
254		return false;
255	}
256}
257
258static void
259gen2_emit_texture(struct sna *sna,
260		  const struct sna_composite_channel *channel,
261		  int unit)
262{
263	uint32_t wrap_mode_u, wrap_mode_v;
264	uint32_t texcoordtype;
265	uint32_t filter;
266
267	assert(channel->bo);
268
269	if (channel->is_affine)
270		texcoordtype = TEXCOORDTYPE_CARTESIAN;
271	else
272		texcoordtype = TEXCOORDTYPE_HOMOGENEOUS;
273
274	switch (channel->repeat) {
275	default:
276		assert(0);
277	case RepeatNone:
278		wrap_mode_u = TEXCOORDMODE_CLAMP_BORDER;
279		break;
280	case RepeatNormal:
281		wrap_mode_u = TEXCOORDMODE_WRAP;
282		break;
283	case RepeatPad:
284		wrap_mode_u = TEXCOORDMODE_CLAMP;
285		break;
286	case RepeatReflect:
287		wrap_mode_u = TEXCOORDMODE_MIRROR;
288		break;
289	}
290	if (channel->is_linear)
291		wrap_mode_v = TEXCOORDMODE_WRAP;
292	else
293		wrap_mode_v = wrap_mode_u;
294
295	switch (channel->filter) {
296	default:
297		assert(0);
298	case PictFilterNearest:
299		filter = (FILTER_NEAREST << TM0S3_MAG_FILTER_SHIFT |
300			  FILTER_NEAREST << TM0S3_MIN_FILTER_SHIFT |
301			  MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
302		break;
303	case PictFilterBilinear:
304		filter = (FILTER_LINEAR << TM0S3_MAG_FILTER_SHIFT |
305			  FILTER_LINEAR << TM0S3_MIN_FILTER_SHIFT |
306			  MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
307		break;
308	}
309
310	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 | LOAD_TEXTURE_MAP(unit) | 4);
311	BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
312			     channel->bo,
313			     I915_GEM_DOMAIN_SAMPLER << 16,
314			     0));
315	BATCH(((channel->height - 1) << TM0S1_HEIGHT_SHIFT) |
316	      ((channel->width - 1)  << TM0S1_WIDTH_SHIFT) |
317	      gen2_get_card_format(sna, channel->pict_format) |
318	      gen2_sampler_tiling_bits(channel->bo->tiling));
319	BATCH((channel->bo->pitch / 4 - 1) << TM0S2_PITCH_SHIFT | TM0S2_MAP_2D);
320	BATCH(filter);
321	BATCH(0);	/* default color */
322
323	BATCH(_3DSTATE_MAP_COORD_SET_CMD | TEXCOORD_SET(unit) |
324	      ENABLE_TEXCOORD_PARAMS | TEXCOORDS_ARE_NORMAL | texcoordtype |
325	      ENABLE_ADDR_V_CNTL | TEXCOORD_ADDR_V_MODE(wrap_mode_v) |
326	      ENABLE_ADDR_U_CNTL | TEXCOORD_ADDR_U_MODE(wrap_mode_u));
327}
328
329static void
330gen2_get_blend_factors(const struct sna_composite_op *op,
331		       int blend,
332		       uint32_t *c_out,
333		       uint32_t *a_out)
334{
335	uint32_t cblend, ablend;
336
337	/* If component alpha is active in the mask and the blend operation
338	 * uses the source alpha, then we know we don't need the source
339	 * value (otherwise we would have hit a fallback earlier), so we
340	 * provide the source alpha (src.A * mask.X) as output color.
341	 * Conversely, if CA is set and we don't need the source alpha, then
342	 * we produce the source value (src.X * mask.X) and the source alpha
343	 * is unused..  Otherwise, we provide the non-CA source value
344	 * (src.X * mask.A).
345	 *
346	 * The PICT_FORMAT_RGB(pict) == 0 fixups are not needed on 855+'s a8
347	 * pictures, but we need to implement it for 830/845 and there's no
348	 * harm done in leaving it in.
349	 */
350	cblend = TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X | TB0C_OUTPUT_WRITE_CURRENT;
351	ablend = TB0A_RESULT_SCALE_1X | TB0A_OUTPUT_WRITE_CURRENT;
352
353	/* Get the source picture's channels into TBx_ARG1 */
354	if ((op->has_component_alpha && gen2_blend_op[blend].src_alpha) ||
355	    op->dst.format == PICT_a8) {
356		/* Producing source alpha value, so the first set of channels
357		 * is src.A instead of src.X.  We also do this if the destination
358		 * is a8, in which case src.G is what's written, and the other
359		 * channels are ignored.
360		 */
361		if (op->src.is_opaque) {
362			ablend |= TB0C_ARG1_SEL_ONE;
363			cblend |= TB0C_ARG1_SEL_ONE;
364		} else if (op->src.is_solid) {
365			ablend |= TB0C_ARG1_SEL_DIFFUSE;
366			cblend |= TB0C_ARG1_SEL_DIFFUSE | TB0C_ARG1_REPLICATE_ALPHA;
367		} else {
368			ablend |= TB0C_ARG1_SEL_TEXEL0;
369			cblend |= TB0C_ARG1_SEL_TEXEL0 | TB0C_ARG1_REPLICATE_ALPHA;
370		}
371	} else {
372		if (op->src.is_solid)
373			cblend |= TB0C_ARG1_SEL_DIFFUSE;
374		else if (PICT_FORMAT_RGB(op->src.pict_format) != 0)
375			cblend |= TB0C_ARG1_SEL_TEXEL0;
376		else
377			cblend |= TB0C_ARG1_SEL_ONE | TB0C_ARG1_INVERT;	/* 0.0 */
378
379		if (op->src.is_opaque)
380			ablend |= TB0A_ARG1_SEL_ONE;
381		else if (op->src.is_solid)
382			ablend |= TB0A_ARG1_SEL_DIFFUSE;
383		else
384			ablend |= TB0A_ARG1_SEL_TEXEL0;
385	}
386
387	if (op->mask.bo) {
388		if (op->src.is_solid) {
389			cblend |= TB0C_ARG2_SEL_TEXEL0;
390			ablend |= TB0A_ARG2_SEL_TEXEL0;
391		} else {
392			cblend |= TB0C_ARG2_SEL_TEXEL1;
393			ablend |= TB0A_ARG2_SEL_TEXEL1;
394		}
395
396		if (op->dst.format == PICT_a8 || !op->has_component_alpha)
397			cblend |= TB0C_ARG2_REPLICATE_ALPHA;
398
399		cblend |= TB0C_OP_MODULATE;
400		ablend |= TB0A_OP_MODULATE;
401	} else if (op->mask.is_solid) {
402		cblend |= TB0C_ARG2_SEL_DIFFUSE;
403		ablend |= TB0A_ARG2_SEL_DIFFUSE;
404
405		if (op->dst.format == PICT_a8 || !op->has_component_alpha)
406			cblend |= TB0C_ARG2_REPLICATE_ALPHA;
407
408		cblend |= TB0C_OP_MODULATE;
409		ablend |= TB0A_OP_MODULATE;
410	} else {
411		cblend |= TB0C_OP_ARG1;
412		ablend |= TB0A_OP_ARG1;
413	}
414
415	*c_out = cblend;
416	*a_out = ablend;
417}
418
419static uint32_t gen2_get_blend_cntl(int op,
420				    bool has_component_alpha,
421				    uint32_t dst_format)
422{
423	uint32_t sblend, dblend;
424
425	if (op <= PictOpSrc)
426		return S8_ENABLE_COLOR_BUFFER_WRITE;
427
428	sblend = gen2_blend_op[op].src_blend;
429	dblend = gen2_blend_op[op].dst_blend;
430
431	if (gen2_blend_op[op].dst_alpha) {
432		/* If there's no dst alpha channel, adjust the blend op so that
433		 * we'll treat it as always 1.
434		 */
435		if (PICT_FORMAT_A(dst_format) == 0) {
436			if (sblend == BLENDFACTOR_DST_ALPHA)
437				sblend = BLENDFACTOR_ONE;
438			else if (sblend == BLENDFACTOR_INV_DST_ALPHA)
439				sblend = BLENDFACTOR_ZERO;
440		}
441
442		/* gen2 engine reads 8bit color buffer into green channel
443		 * in cases like color buffer blending etc., and also writes
444		 * back green channel.  So with dst_alpha blend we should use
445		 * color factor.
446		 */
447		if (dst_format == PICT_a8) {
448			if (sblend == BLENDFACTOR_DST_ALPHA)
449				sblend = BLENDFACTOR_DST_COLR;
450			else if (sblend == BLENDFACTOR_INV_DST_ALPHA)
451				sblend = BLENDFACTOR_INV_DST_COLR;
452		}
453	}
454
455	/* If the source alpha is being used, then we should only be in a case
456	 * where the source blend factor is 0, and the source blend value is
457	 * the mask channels multiplied by the source picture's alpha.
458	 */
459	if (has_component_alpha && gen2_blend_op[op].src_alpha) {
460		if (dblend == BLENDFACTOR_SRC_ALPHA)
461			dblend = BLENDFACTOR_SRC_COLR;
462		else if (dblend == BLENDFACTOR_INV_SRC_ALPHA)
463			dblend = BLENDFACTOR_INV_SRC_COLR;
464	}
465
466	return (sblend << S8_SRC_BLEND_FACTOR_SHIFT |
467		dblend << S8_DST_BLEND_FACTOR_SHIFT |
468		S8_ENABLE_COLOR_BLEND | S8_BLENDFUNC_ADD |
469		S8_ENABLE_COLOR_BUFFER_WRITE);
470}
471
472static void gen2_emit_invariant(struct sna *sna)
473{
474	int i;
475
476	for (i = 0; i < 4; i++) {
477		BATCH(_3DSTATE_MAP_CUBE | MAP_UNIT(i));
478		BATCH(_3DSTATE_MAP_TEX_STREAM_CMD | MAP_UNIT(i) |
479		      DISABLE_TEX_STREAM_BUMP |
480		      ENABLE_TEX_STREAM_COORD_SET | TEX_STREAM_COORD_SET(i) |
481		      ENABLE_TEX_STREAM_MAP_IDX | TEX_STREAM_MAP_IDX(i));
482		BATCH(_3DSTATE_MAP_COORD_TRANSFORM);
483		BATCH(DISABLE_TEX_TRANSFORM | TEXTURE_SET(i));
484	}
485
486	BATCH(_3DSTATE_MAP_COORD_SETBIND_CMD);
487	BATCH(TEXBIND_SET3(TEXCOORDSRC_VTXSET_3) |
488	      TEXBIND_SET2(TEXCOORDSRC_VTXSET_2) |
489	      TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
490	      TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
491
492	BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
493
494	BATCH(_3DSTATE_VERTEX_TRANSFORM);
495	BATCH(DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
496
497	BATCH(_3DSTATE_W_STATE_CMD);
498	BATCH(MAGIC_W_STATE_DWORD1);
499	BATCH_F(1.0);
500
501	BATCH(_3DSTATE_INDPT_ALPHA_BLEND_CMD |
502	      DISABLE_INDPT_ALPHA_BLEND |
503	      ENABLE_ALPHA_BLENDFUNC | ABLENDFUNC_ADD);
504
505	BATCH(_3DSTATE_CONST_BLEND_COLOR_CMD);
506	BATCH(0);
507
508	BATCH(_3DSTATE_MODES_1_CMD |
509	      ENABLE_COLR_BLND_FUNC | BLENDFUNC_ADD |
510	      ENABLE_SRC_BLND_FACTOR | SRC_BLND_FACT(BLENDFACTOR_ONE) |
511	      ENABLE_DST_BLND_FACTOR | DST_BLND_FACT(BLENDFACTOR_ZERO));
512
513	BATCH(_3DSTATE_ENABLES_1_CMD |
514	      DISABLE_LOGIC_OP |
515	      DISABLE_STENCIL_TEST |
516	      DISABLE_DEPTH_BIAS |
517	      DISABLE_SPEC_ADD |
518	      DISABLE_FOG |
519	      DISABLE_ALPHA_TEST |
520	      DISABLE_DEPTH_TEST |
521	      ENABLE_COLOR_BLEND);
522
523	BATCH(_3DSTATE_ENABLES_2_CMD |
524	      DISABLE_STENCIL_WRITE |
525	      DISABLE_DITHER |
526	      DISABLE_DEPTH_WRITE |
527	      ENABLE_COLOR_MASK |
528	      ENABLE_COLOR_WRITE |
529	      ENABLE_TEX_CACHE);
530
531	BATCH(_3DSTATE_STIPPLE);
532	BATCH(0);
533
534	BATCH(_3DSTATE_MAP_BLEND_OP_CMD(0) |
535	      TEXPIPE_COLOR |
536	      ENABLE_TEXOUTPUT_WRT_SEL |
537	      TEXOP_OUTPUT_CURRENT |
538	      DISABLE_TEX_CNTRL_STAGE |
539	      TEXOP_SCALE_1X |
540	      TEXOP_MODIFY_PARMS | TEXOP_LAST_STAGE |
541	      TEXBLENDOP_ARG1);
542	BATCH(_3DSTATE_MAP_BLEND_OP_CMD(0) |
543	      TEXPIPE_ALPHA |
544	      ENABLE_TEXOUTPUT_WRT_SEL |
545	      TEXOP_OUTPUT_CURRENT |
546	      TEXOP_SCALE_1X | TEXOP_MODIFY_PARMS |
547	      TEXBLENDOP_ARG1);
548	BATCH(_3DSTATE_MAP_BLEND_ARG_CMD(0) |
549	      TEXPIPE_COLOR |
550	      TEXBLEND_ARG1 |
551	      TEXBLENDARG_MODIFY_PARMS |
552	      TEXBLENDARG_DIFFUSE);
553	BATCH(_3DSTATE_MAP_BLEND_ARG_CMD(0) |
554	      TEXPIPE_ALPHA |
555	      TEXBLEND_ARG1 |
556	      TEXBLENDARG_MODIFY_PARMS |
557	      TEXBLENDARG_DIFFUSE);
558
559#define INVARIANT_SIZE 35
560
561	sna->render_state.gen2.need_invariant = false;
562}
563
564static void
565gen2_get_batch(struct sna *sna, const struct sna_composite_op *op)
566{
567	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
568
569	if (!kgem_check_batch(&sna->kgem, INVARIANT_SIZE+40)) {
570		DBG(("%s: flushing batch: size %d > %d\n",
571		     __FUNCTION__, INVARIANT_SIZE+40,
572		     sna->kgem.surface-sna->kgem.nbatch));
573		kgem_submit(&sna->kgem);
574		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
575	}
576
577	if (!kgem_check_reloc(&sna->kgem, 3)) {
578		DBG(("%s: flushing batch: reloc %d >= %d\n",
579		     __FUNCTION__,
580		     sna->kgem.nreloc + 3,
581		     (int)KGEM_RELOC_SIZE(&sna->kgem)));
582		kgem_submit(&sna->kgem);
583		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
584	}
585
586	if (!kgem_check_exec(&sna->kgem, 3)) {
587		DBG(("%s: flushing batch: exec %d >= %d\n",
588		     __FUNCTION__,
589		     sna->kgem.nexec + 1,
590		     (int)KGEM_EXEC_SIZE(&sna->kgem)));
591		kgem_submit(&sna->kgem);
592		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
593	}
594
595	if (sna->render_state.gen2.need_invariant)
596		gen2_emit_invariant(sna);
597}
598
599static void gen2_emit_target(struct sna *sna, const struct sna_composite_op *op)
600{
601	assert(!too_large(op->dst.width, op->dst.height));
602	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= MAX_3D_PITCH);
603	assert(sna->render.vertex_offset == 0);
604
605	assert(op->dst.bo->unique_id);
606	if (sna->render_state.gen2.target == op->dst.bo->unique_id) {
607		kgem_bo_mark_dirty(op->dst.bo);
608		return;
609	}
610
611	BATCH(_3DSTATE_BUF_INFO_CMD);
612	BATCH(BUF_3D_ID_COLOR_BACK |
613	      gen2_buf_tiling(op->dst.bo->tiling) |
614	      BUF_3D_PITCH(op->dst.bo->pitch));
615	BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
616			     op->dst.bo,
617			     I915_GEM_DOMAIN_RENDER << 16 |
618			     I915_GEM_DOMAIN_RENDER,
619			     0));
620
621	BATCH(_3DSTATE_DST_BUF_VARS_CMD);
622	BATCH(gen2_get_dst_format(op->dst.format));
623
624	BATCH(_3DSTATE_DRAW_RECT_CMD);
625	BATCH(0);
626	BATCH(0);	/* ymin, xmin */
627	BATCH(DRAW_YMAX(op->dst.height - 1) |
628	      DRAW_XMAX(op->dst.width - 1));
629	BATCH(0);	/* yorig, xorig */
630
631	sna->render_state.gen2.target = op->dst.bo->unique_id;
632}
633
634static void gen2_disable_logic_op(struct sna *sna)
635{
636	if (!sna->render_state.gen2.logic_op_enabled)
637		return;
638
639	DBG(("%s\n", __FUNCTION__));
640
641	BATCH(_3DSTATE_ENABLES_1_CMD |
642	      DISABLE_LOGIC_OP | ENABLE_COLOR_BLEND);
643
644	sna->render_state.gen2.logic_op_enabled = 0;
645}
646
647static void gen2_enable_logic_op(struct sna *sna, int op)
648{
649	static const uint8_t logic_op[] = {
650		LOGICOP_CLEAR,		/* GXclear */
651		LOGICOP_AND,		/* GXand */
652		LOGICOP_AND_RVRSE, 	/* GXandReverse */
653		LOGICOP_COPY,		/* GXcopy */
654		LOGICOP_AND_INV,	/* GXandInverted */
655		LOGICOP_NOOP,		/* GXnoop */
656		LOGICOP_XOR,		/* GXxor */
657		LOGICOP_OR,		/* GXor */
658		LOGICOP_NOR,		/* GXnor */
659		LOGICOP_EQUIV,		/* GXequiv */
660		LOGICOP_INV,		/* GXinvert */
661		LOGICOP_OR_RVRSE,	/* GXorReverse */
662		LOGICOP_COPY_INV,	/* GXcopyInverted */
663		LOGICOP_OR_INV,		/* GXorInverted */
664		LOGICOP_NAND,		/* GXnand */
665		LOGICOP_SET		/* GXset */
666	};
667
668	if (sna->render_state.gen2.logic_op_enabled != op+1) {
669		if (!sna->render_state.gen2.logic_op_enabled) {
670			if (op == GXclear || op == GXcopy)
671				return;
672
673			DBG(("%s\n", __FUNCTION__));
674
675			BATCH(_3DSTATE_ENABLES_1_CMD |
676			      ENABLE_LOGIC_OP | DISABLE_COLOR_BLEND);
677		}
678
679		BATCH(_3DSTATE_MODES_4_CMD |
680		      ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(logic_op[op]));
681		sna->render_state.gen2.logic_op_enabled = op+1;
682	}
683}
684
685static void gen2_emit_composite_state(struct sna *sna,
686				      const struct sna_composite_op *op)
687{
688	uint32_t texcoordfmt, v, unwind;
689	uint32_t cblend, ablend;
690	int tex;
691
692	gen2_get_batch(sna, op);
693
694	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
695		if (op->src.bo == op->dst.bo || op->mask.bo == op->dst.bo)
696			BATCH(MI_FLUSH | MI_INVALIDATE_MAP_CACHE);
697		else
698			BATCH(_3DSTATE_MODES_5_CMD |
699			      PIPELINE_FLUSH_RENDER_CACHE |
700			      PIPELINE_FLUSH_TEXTURE_CACHE);
701		kgem_clear_dirty(&sna->kgem);
702	}
703
704	gen2_emit_target(sna, op);
705
706	unwind = sna->kgem.nbatch;
707	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
708	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
709	BATCH((!op->src.is_solid + (op->mask.bo != NULL)) << 12);
710	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
711	BATCH(gen2_get_blend_cntl(op->op,
712				  op->has_component_alpha,
713				  op->dst.format));
714	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
715		   sna->kgem.batch + unwind + 1,
716		   3 * sizeof(uint32_t)) == 0)
717		sna->kgem.nbatch = unwind;
718	else
719		sna->render_state.gen2.ls1 = unwind;
720
721	gen2_disable_logic_op(sna);
722
723	gen2_get_blend_factors(op, op->op, &cblend, &ablend);
724	unwind = sna->kgem.nbatch;
725	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
726	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
727	BATCH(cblend);
728	BATCH(ablend);
729	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls2 + 1,
730		   sna->kgem.batch + unwind + 1,
731		   2 * sizeof(uint32_t)) == 0)
732		sna->kgem.nbatch = unwind;
733	else
734		sna->render_state.gen2.ls2 = unwind;
735
736	tex = texcoordfmt = 0;
737	if (!op->src.is_solid) {
738		if (op->src.is_affine)
739			texcoordfmt |= TEXCOORDFMT_2D << (2*tex);
740		else
741			texcoordfmt |= TEXCOORDFMT_3D << (2*tex);
742		gen2_emit_texture(sna, &op->src, tex++);
743	} else {
744		if (op->src.u.gen2.pixel != sna->render_state.gen2.diffuse) {
745			BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
746			BATCH(op->src.u.gen2.pixel);
747			sna->render_state.gen2.diffuse = op->src.u.gen2.pixel;
748		}
749	}
750	if (op->mask.bo) {
751		if (op->mask.is_affine)
752			texcoordfmt |= TEXCOORDFMT_2D << (2*tex);
753		else
754			texcoordfmt |= TEXCOORDFMT_3D << (2*tex);
755		gen2_emit_texture(sna, &op->mask, tex++);
756	} else if (op->mask.is_solid) {
757		if (op->mask.u.gen2.pixel != sna->render_state.gen2.diffuse) {
758			BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
759			BATCH(op->mask.u.gen2.pixel);
760			sna->render_state.gen2.diffuse = op->mask.u.gen2.pixel;
761		}
762	}
763
764	v = _3DSTATE_VERTEX_FORMAT_2_CMD | texcoordfmt;
765	if (sna->render_state.gen2.vft != v) {
766		BATCH(v);
767		sna->render_state.gen2.vft = v;
768	}
769}
770
771static inline void
772gen2_emit_composite_dstcoord(struct sna *sna, int dstX, int dstY)
773{
774	VERTEX(dstX);
775	VERTEX(dstY);
776}
777
778inline static void
779gen2_emit_composite_linear(struct sna *sna,
780			   const struct sna_composite_channel *channel,
781			   int16_t x, int16_t y)
782{
783	float v;
784
785	v = (x * channel->u.linear.dx +
786	     y * channel->u.linear.dy +
787	     channel->u.linear.offset);
788	DBG(("%s: (%d, %d) -> %f\n", __FUNCTION__, x, y, v));
789	VERTEX(v);
790	VERTEX(v);
791}
792
793static void
794gen2_emit_composite_texcoord(struct sna *sna,
795			     const struct sna_composite_channel *channel,
796			     int16_t x, int16_t y)
797{
798	float s = 0, t = 0, w = 1;
799
800	x += channel->offset[0];
801	y += channel->offset[1];
802
803	if (channel->is_affine) {
804		sna_get_transformed_coordinates(x, y,
805						channel->transform,
806						&s, &t);
807		VERTEX(s * channel->scale[0]);
808		VERTEX(t * channel->scale[1]);
809	} else {
810		sna_get_transformed_coordinates_3d(x, y,
811						   channel->transform,
812						   &s, &t, &w);
813		VERTEX(s * channel->scale[0]);
814		VERTEX(t * channel->scale[1]);
815		VERTEX(w);
816	}
817}
818
819static void
820gen2_emit_composite_vertex(struct sna *sna,
821			   const struct sna_composite_op *op,
822			   int16_t srcX, int16_t srcY,
823			   int16_t mskX, int16_t mskY,
824			   int16_t dstX, int16_t dstY)
825{
826	gen2_emit_composite_dstcoord(sna, dstX, dstY);
827	if (op->src.is_linear)
828		gen2_emit_composite_linear(sna, &op->src, srcX, srcY);
829	else if (!op->src.is_solid)
830		gen2_emit_composite_texcoord(sna, &op->src, srcX, srcY);
831
832	if (op->mask.is_linear)
833		gen2_emit_composite_linear(sna, &op->mask, mskX, mskY);
834	else if (op->mask.bo)
835		gen2_emit_composite_texcoord(sna, &op->mask, mskX, mskY);
836}
837
838fastcall static void
839gen2_emit_composite_primitive(struct sna *sna,
840			      const struct sna_composite_op *op,
841			      const struct sna_composite_rectangles *r)
842{
843	gen2_emit_composite_vertex(sna, op,
844				   r->src.x + r->width,
845				   r->src.y + r->height,
846				   r->mask.x + r->width,
847				   r->mask.y + r->height,
848				   op->dst.x + r->dst.x + r->width,
849				   op->dst.y + r->dst.y + r->height);
850	gen2_emit_composite_vertex(sna, op,
851				   r->src.x,
852				   r->src.y + r->height,
853				   r->mask.x,
854				   r->mask.y + r->height,
855				   op->dst.x + r->dst.x,
856				   op->dst.y + r->dst.y + r->height);
857	gen2_emit_composite_vertex(sna, op,
858				   r->src.x,
859				   r->src.y,
860				   r->mask.x,
861				   r->mask.y,
862				   op->dst.x + r->dst.x,
863				   op->dst.y + r->dst.y);
864}
865
866fastcall static void
867gen2_emit_composite_primitive_constant(struct sna *sna,
868				       const struct sna_composite_op *op,
869				       const struct sna_composite_rectangles *r)
870{
871	int16_t dst_x = r->dst.x + op->dst.x;
872	int16_t dst_y = r->dst.y + op->dst.y;
873
874	gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
875	gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
876	gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
877}
878
879fastcall static void
880gen2_emit_composite_primitive_linear(struct sna *sna,
881				       const struct sna_composite_op *op,
882				       const struct sna_composite_rectangles *r)
883{
884	int16_t dst_x = r->dst.x + op->dst.x;
885	int16_t dst_y = r->dst.y + op->dst.y;
886
887	gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
888	gen2_emit_composite_linear(sna, &op->src,
889				   r->src.x + r->width, r->src.y + r->height);
890
891	gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
892	gen2_emit_composite_linear(sna, &op->src,
893				   r->src.x, r->src.y + r->height);
894
895	gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
896	gen2_emit_composite_linear(sna, &op->src,
897				   r->src.x, r->src.y);
898}
899
900fastcall static void
901gen2_emit_composite_primitive_identity(struct sna *sna,
902				       const struct sna_composite_op *op,
903				       const struct sna_composite_rectangles *r)
904{
905	float w = r->width;
906	float h = r->height;
907	float *v;
908
909	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
910	sna->kgem.nbatch += 12;
911
912	v[8] = v[4] = r->dst.x + op->dst.x;
913	v[0] = v[4] + w;
914
915	v[9] = r->dst.y + op->dst.y;
916	v[5] = v[1] = v[9] + h;
917
918	v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
919	v[2] = v[6] + w * op->src.scale[0];
920
921	v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
922	v[7] = v[3] = v[11] + h * op->src.scale[1];
923}
924
925fastcall static void
926gen2_emit_composite_primitive_affine(struct sna *sna,
927				     const struct sna_composite_op *op,
928				     const struct sna_composite_rectangles *r)
929{
930	PictTransform *transform = op->src.transform;
931	int src_x = r->src.x + (int)op->src.offset[0];
932	int src_y = r->src.y + (int)op->src.offset[1];
933	float *v;
934
935	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
936	sna->kgem.nbatch += 12;
937
938	v[8] = v[4] = r->dst.x + op->dst.x;
939	v[0] = v[4] + r->width;
940
941	v[9] = r->dst.y + op->dst.y;
942	v[5] = v[1] = v[9] + r->height;
943
944	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
945				    transform, op->src.scale,
946				    &v[2], &v[3]);
947
948	_sna_get_transformed_scaled(src_x, src_y + r->height,
949				    transform, op->src.scale,
950				    &v[6], &v[7]);
951
952	_sna_get_transformed_scaled(src_x, src_y,
953				    transform, op->src.scale,
954				    &v[10], &v[11]);
955}
956
957fastcall static void
958gen2_emit_composite_primitive_constant_identity_mask(struct sna *sna,
959						     const struct sna_composite_op *op,
960						     const struct sna_composite_rectangles *r)
961{
962	float w = r->width;
963	float h = r->height;
964	float *v;
965
966	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
967	sna->kgem.nbatch += 12;
968
969	v[8] = v[4] = r->dst.x + op->dst.x;
970	v[0] = v[4] + w;
971
972	v[9] = r->dst.y + op->dst.y;
973	v[5] = v[1] = v[9] + h;
974
975	v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
976	v[2] = v[6] + w * op->mask.scale[0];
977
978	v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
979	v[7] = v[3] = v[11] + h * op->mask.scale[1];
980}
981
982#if defined(sse2) && !defined(__x86_64__)
983sse2 fastcall static void
984gen2_emit_composite_primitive_constant__sse2(struct sna *sna,
985					     const struct sna_composite_op *op,
986					     const struct sna_composite_rectangles *r)
987{
988	int16_t dst_x = r->dst.x + op->dst.x;
989	int16_t dst_y = r->dst.y + op->dst.y;
990
991	gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
992	gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
993	gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
994}
995
996sse2 fastcall static void
997gen2_emit_composite_primitive_linear__sse2(struct sna *sna,
998					   const struct sna_composite_op *op,
999					   const struct sna_composite_rectangles *r)
1000{
1001	int16_t dst_x = r->dst.x + op->dst.x;
1002	int16_t dst_y = r->dst.y + op->dst.y;
1003
1004	gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
1005	gen2_emit_composite_linear(sna, &op->src,
1006				   r->src.x + r->width, r->src.y + r->height);
1007
1008	gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
1009	gen2_emit_composite_linear(sna, &op->src,
1010				   r->src.x, r->src.y + r->height);
1011
1012	gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
1013	gen2_emit_composite_linear(sna, &op->src,
1014				   r->src.x, r->src.y);
1015}
1016
1017sse2 fastcall static void
1018gen2_emit_composite_primitive_identity__sse2(struct sna *sna,
1019					     const struct sna_composite_op *op,
1020					     const struct sna_composite_rectangles *r)
1021{
1022	float w = r->width;
1023	float h = r->height;
1024	float *v;
1025
1026	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
1027	sna->kgem.nbatch += 12;
1028
1029	v[8] = v[4] = r->dst.x + op->dst.x;
1030	v[0] = v[4] + w;
1031
1032	v[9] = r->dst.y + op->dst.y;
1033	v[5] = v[1] = v[9] + h;
1034
1035	v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
1036	v[2] = v[6] + w * op->src.scale[0];
1037
1038	v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
1039	v[7] = v[3] = v[11] + h * op->src.scale[1];
1040}
1041
1042sse2 fastcall static void
1043gen2_emit_composite_primitive_affine__sse2(struct sna *sna,
1044					   const struct sna_composite_op *op,
1045					   const struct sna_composite_rectangles *r)
1046{
1047	PictTransform *transform = op->src.transform;
1048	int src_x = r->src.x + (int)op->src.offset[0];
1049	int src_y = r->src.y + (int)op->src.offset[1];
1050	float *v;
1051
1052	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
1053	sna->kgem.nbatch += 12;
1054
1055	v[8] = v[4] = r->dst.x + op->dst.x;
1056	v[0] = v[4] + r->width;
1057
1058	v[9] = r->dst.y + op->dst.y;
1059	v[5] = v[1] = v[9] + r->height;
1060
1061	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
1062				    transform, op->src.scale,
1063				    &v[2], &v[3]);
1064
1065	_sna_get_transformed_scaled(src_x, src_y + r->height,
1066				    transform, op->src.scale,
1067				    &v[6], &v[7]);
1068
1069	_sna_get_transformed_scaled(src_x, src_y,
1070				    transform, op->src.scale,
1071				    &v[10], &v[11]);
1072}
1073
1074sse2 fastcall static void
1075gen2_emit_composite_primitive_constant_identity_mask__sse2(struct sna *sna,
1076							   const struct sna_composite_op *op,
1077							   const struct sna_composite_rectangles *r)
1078{
1079	float w = r->width;
1080	float h = r->height;
1081	float *v;
1082
1083	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
1084	sna->kgem.nbatch += 12;
1085
1086	v[8] = v[4] = r->dst.x + op->dst.x;
1087	v[0] = v[4] + w;
1088
1089	v[9] = r->dst.y + op->dst.y;
1090	v[5] = v[1] = v[9] + h;
1091
1092	v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
1093	v[2] = v[6] + w * op->mask.scale[0];
1094
1095	v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
1096	v[7] = v[3] = v[11] + h * op->mask.scale[1];
1097}
1098#endif
1099
1100static void gen2_magic_ca_pass(struct sna *sna,
1101			       const struct sna_composite_op *op)
1102{
1103	uint32_t ablend, cblend, *src, *dst;
1104	int n;
1105
1106	if (!op->need_magic_ca_pass)
1107		return;
1108
1109	DBG(("%s: batch=%x, vertex=%x\n", __FUNCTION__,
1110	     sna->kgem.nbatch, sna->render.vertex_offset));
1111
1112	assert(op->mask.bo);
1113	assert(op->has_component_alpha);
1114
1115	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(8) | 0);
1116	BATCH(BLENDFACTOR_ONE << S8_SRC_BLEND_FACTOR_SHIFT |
1117	      BLENDFACTOR_ONE << S8_DST_BLEND_FACTOR_SHIFT |
1118	      S8_ENABLE_COLOR_BLEND | S8_BLENDFUNC_ADD |
1119	      S8_ENABLE_COLOR_BUFFER_WRITE);
1120	sna->render_state.gen2.ls1 = 0;
1121
1122	gen2_get_blend_factors(op, PictOpAdd, &cblend, &ablend);
1123	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
1124	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
1125	BATCH(cblend);
1126	BATCH(ablend);
1127	sna->render_state.gen2.ls2 = 0;
1128
1129	src = sna->kgem.batch + sna->render.vertex_offset;
1130	dst = sna->kgem.batch + sna->kgem.nbatch;
1131	n = 1 + sna->render.vertex_index;
1132	sna->kgem.nbatch += n;
1133	assert(sna->kgem.nbatch <= KGEM_BATCH_SIZE(&sna->kgem));
1134	while (n--)
1135		*dst++ = *src++;
1136}
1137
1138static void gen2_vertex_flush(struct sna *sna,
1139			      const struct sna_composite_op *op)
1140{
1141	if (sna->render.vertex_index == 0)
1142		return;
1143
1144	sna->kgem.batch[sna->render.vertex_offset] |=
1145		sna->render.vertex_index - 1;
1146
1147	gen2_magic_ca_pass(sna, op);
1148
1149	sna->render.vertex_offset = 0;
1150	sna->render.vertex_index = 0;
1151}
1152
1153inline static int gen2_get_rectangles(struct sna *sna,
1154				      const struct sna_composite_op *op,
1155				      int want)
1156{
1157	int rem = batch_space(sna), size, need;
1158
1159	DBG(("%s: want=%d, floats_per_vertex=%d, rem=%d\n",
1160	     __FUNCTION__, want, op->floats_per_vertex, rem));
1161
1162	assert(op->floats_per_vertex);
1163	assert(op->floats_per_rect == 3 * op->floats_per_vertex);
1164
1165	need = 1;
1166	size = op->floats_per_rect;
1167	if (op->need_magic_ca_pass)
1168		need += 6 + size*sna->render.vertex_index, size *= 2;
1169
1170	DBG(("%s: want=%d, need=%d,size=%d, rem=%d\n",
1171	     __FUNCTION__, want, need, size, rem));
1172	if (rem < need + size) {
1173		gen2_vertex_flush(sna, op);
1174		kgem_submit(&sna->kgem);
1175		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
1176		return 0;
1177	}
1178
1179	rem -= need;
1180	if (sna->render.vertex_offset == 0) {
1181		if ((sna->kgem.batch[sna->kgem.nbatch-1] & ~0xffff) ==
1182		    (PRIM3D_INLINE | PRIM3D_RECTLIST)) {
1183			uint32_t *b = &sna->kgem.batch[sna->kgem.nbatch-1];
1184			assert(*b & 0xffff);
1185			sna->render.vertex_index = 1 + (*b & 0xffff);
1186			*b = PRIM3D_INLINE | PRIM3D_RECTLIST;
1187			sna->render.vertex_offset = sna->kgem.nbatch - 1;
1188			assert(!op->need_magic_ca_pass);
1189		} else {
1190			sna->render.vertex_offset = sna->kgem.nbatch;
1191			BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
1192		}
1193	}
1194
1195	if (want > 1 && want * size > rem)
1196		want = rem / size;
1197
1198	assert(want);
1199	sna->render.vertex_index += want*op->floats_per_rect;
1200	return want;
1201}
1202
1203fastcall static void
1204gen2_render_composite_blt(struct sna *sna,
1205			  const struct sna_composite_op *op,
1206			  const struct sna_composite_rectangles *r)
1207{
1208	if (!gen2_get_rectangles(sna, op, 1)) {
1209		gen2_emit_composite_state(sna, op);
1210		gen2_get_rectangles(sna, op, 1);
1211	}
1212
1213	op->prim_emit(sna, op, r);
1214}
1215
1216fastcall static void
1217gen2_render_composite_box(struct sna *sna,
1218			  const struct sna_composite_op *op,
1219			  const BoxRec *box)
1220{
1221	struct sna_composite_rectangles r;
1222
1223	if (!gen2_get_rectangles(sna, op, 1)) {
1224		gen2_emit_composite_state(sna, op);
1225		gen2_get_rectangles(sna, op, 1);
1226	}
1227
1228	DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
1229	     box->x1, box->y1,
1230	     box->x2 - box->x1,
1231	     box->y2 - box->y1));
1232
1233	r.dst.x  = box->x1; r.dst.y  = box->y1;
1234	r.width = box->x2 - box->x1;
1235	r.height = box->y2 - box->y1;
1236	r.src = r.mask = r.dst;
1237
1238	op->prim_emit(sna, op, &r);
1239}
1240
1241static void
1242gen2_render_composite_boxes(struct sna *sna,
1243			    const struct sna_composite_op *op,
1244			    const BoxRec *box, int nbox)
1245{
1246	do {
1247		int nbox_this_time;
1248
1249		nbox_this_time = gen2_get_rectangles(sna, op, nbox);
1250		if (nbox_this_time == 0) {
1251			gen2_emit_composite_state(sna, op);
1252			nbox_this_time = gen2_get_rectangles(sna, op, nbox);
1253		}
1254		nbox -= nbox_this_time;
1255
1256		do {
1257			struct sna_composite_rectangles r;
1258
1259			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
1260			     box->x1, box->y1,
1261			     box->x2 - box->x1,
1262			     box->y2 - box->y1));
1263
1264			r.dst.x  = box->x1; r.dst.y  = box->y1;
1265			r.width = box->x2 - box->x1;
1266			r.height = box->y2 - box->y1;
1267			r.src = r.mask = r.dst;
1268
1269			op->prim_emit(sna, op, &r);
1270			box++;
1271		} while (--nbox_this_time);
1272	} while (nbox);
1273}
1274
1275static void gen2_render_composite_done(struct sna *sna,
1276				       const struct sna_composite_op *op)
1277{
1278	gen2_vertex_flush(sna, op);
1279
1280	if (op->mask.bo)
1281		kgem_bo_destroy(&sna->kgem, op->mask.bo);
1282	if (op->src.bo)
1283		kgem_bo_destroy(&sna->kgem, op->src.bo);
1284	sna_render_composite_redirect_done(sna, op);
1285}
1286
1287static bool
1288gen2_composite_solid_init(struct sna *sna,
1289			  struct sna_composite_channel *channel,
1290			  uint32_t color)
1291{
1292	channel->filter = PictFilterNearest;
1293	channel->repeat = RepeatNormal;
1294	channel->is_solid  = true;
1295	channel->is_affine = true;
1296	channel->width  = 1;
1297	channel->height = 1;
1298	channel->pict_format = PICT_a8r8g8b8;
1299
1300	channel->bo = NULL;
1301	channel->u.gen2.pixel = color;
1302
1303	channel->scale[0]  = channel->scale[1]  = 1;
1304	channel->offset[0] = channel->offset[1] = 0;
1305	return true;
1306}
1307
1308#define xFixedToDouble(f) pixman_fixed_to_double(f)
1309
1310static bool
1311gen2_composite_linear_init(struct sna *sna,
1312			   PicturePtr picture,
1313			   struct sna_composite_channel *channel,
1314			   int x, int y,
1315			   int w, int h,
1316			   int dst_x, int dst_y)
1317{
1318	PictLinearGradient *linear =
1319		(PictLinearGradient *)picture->pSourcePict;
1320	pixman_fixed_t tx, ty;
1321	float x0, y0, sf;
1322	float dx, dy;
1323
1324	DBG(("%s: p1=(%f, %f), p2=(%f, %f)\n",
1325	     __FUNCTION__,
1326	     xFixedToDouble(linear->p1.x), xFixedToDouble(linear->p1.y),
1327	     xFixedToDouble(linear->p2.x), xFixedToDouble(linear->p2.y)));
1328
1329	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
1330		return 0;
1331
1332	if (!sna_transform_is_affine(picture->transform)) {
1333		DBG(("%s: fallback due to projective transform\n",
1334		     __FUNCTION__));
1335		return sna_render_picture_fixup(sna, picture, channel,
1336						x, y, w, h, dst_x, dst_y);
1337	}
1338
1339	channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
1340	if (!channel->bo)
1341		return 0;
1342
1343	channel->filter = PictFilterNearest;
1344	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1345	channel->is_linear = true;
1346	channel->width  = channel->bo->pitch / 4;
1347	channel->height = 1;
1348	channel->pict_format = PICT_a8r8g8b8;
1349
1350	channel->scale[0]  = channel->scale[1]  = 1;
1351	channel->offset[0] = channel->offset[1] = 0;
1352
1353	if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
1354		dx = xFixedToDouble(linear->p2.x - linear->p1.x);
1355		dy = xFixedToDouble(linear->p2.y - linear->p1.y);
1356
1357		x0 = xFixedToDouble(linear->p1.x);
1358		y0 = xFixedToDouble(linear->p1.y);
1359
1360		if (tx | ty) {
1361			x0 -= pixman_fixed_to_double(tx);
1362			y0 -= pixman_fixed_to_double(ty);
1363		}
1364	} else {
1365		struct pixman_f_vector p1, p2;
1366		struct pixman_f_transform m, inv;
1367
1368		pixman_f_transform_from_pixman_transform(&m, picture->transform);
1369		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
1370		     __FUNCTION__,
1371		     m.m[0][0], m.m[0][1], m.m[0][2],
1372		     m.m[1][0], m.m[1][1], m.m[1][2],
1373		     m.m[2][0], m.m[2][1], m.m[2][2]));
1374		if (!pixman_f_transform_invert(&inv, &m))
1375			return 0;
1376
1377		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
1378		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
1379		p1.v[2] = 1.;
1380		pixman_f_transform_point(&inv, &p1);
1381
1382		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
1383		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
1384		p2.v[2] = 1.;
1385		pixman_f_transform_point(&inv, &p2);
1386
1387		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
1388		     __FUNCTION__,
1389		     p1.v[0], p1.v[1], p1.v[2],
1390		     p2.v[0], p2.v[1], p2.v[2]));
1391
1392		dx = p2.v[0] - p1.v[0];
1393		dy = p2.v[1] - p1.v[1];
1394
1395		x0 = p1.v[0];
1396		y0 = p1.v[1];
1397	}
1398
1399	sf = dx*dx + dy*dy;
1400	dx /= sf;
1401	dy /= sf;
1402
1403	channel->u.linear.dx = dx;
1404	channel->u.linear.dy = dy;
1405	channel->u.linear.offset = -dx*(x0+dst_x-x) + -dy*(y0+dst_y-y);
1406
1407	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
1408	     __FUNCTION__, dx, dy, channel->u.linear.offset));
1409
1410	return channel->bo != NULL;
1411}
1412
1413static bool source_is_covered(PicturePtr picture,
1414			      int x, int y,
1415			      int width, int height)
1416{
1417	int x1, y1, x2, y2;
1418
1419	if (picture->repeat && picture->repeatType != RepeatNone)
1420		return true;
1421
1422	if (picture->pDrawable == NULL)
1423		return false;
1424
1425	if (picture->transform) {
1426		pixman_box16_t sample;
1427
1428		sample.x1 = x;
1429		sample.y1 = y;
1430		sample.x2 = x + width;
1431		sample.y2 = y + height;
1432
1433		pixman_transform_bounds(picture->transform, &sample);
1434
1435		x1 = sample.x1;
1436		x2 = sample.x2;
1437		y1 = sample.y1;
1438		y2 = sample.y2;
1439	} else {
1440		x1 = x;
1441		y1 = y;
1442		x2 = x + width;
1443		y2 = y + height;
1444	}
1445
1446	return
1447		x1 >= 0 && y1 >= 0 &&
1448		x2 <= picture->pDrawable->width &&
1449		y2 <= picture->pDrawable->height;
1450}
1451
1452static bool
1453gen2_check_card_format(struct sna *sna,
1454		       PicturePtr picture,
1455		       struct sna_composite_channel *channel,
1456		       int x, int y, int w, int h,
1457		       bool *fixup_alpha)
1458{
1459	uint32_t format = picture->format;
1460	unsigned int i;
1461
1462	for (i = 0; i < ARRAY_SIZE(i8xx_tex_formats); i++) {
1463		if (i8xx_tex_formats[i].fmt == format)
1464			return true;
1465	}
1466
1467	for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++) {
1468		if (i85x_tex_formats[i].fmt == format) {
1469			if (sna->kgem.gen >= 021)
1470				return true;
1471
1472			if (source_is_covered(picture, x, y, w,h)) {
1473				channel->is_opaque = true;
1474				return true;
1475			}
1476
1477			*fixup_alpha = true;
1478			return false;
1479		}
1480	}
1481
1482	*fixup_alpha = false;
1483	return false;
1484}
1485
1486static int
1487gen2_composite_picture(struct sna *sna,
1488		       PicturePtr picture,
1489		       struct sna_composite_channel *channel,
1490		       int x, int y,
1491		       int w, int h,
1492		       int dst_x, int dst_y,
1493		       bool precise)
1494{
1495	PixmapPtr pixmap;
1496	uint32_t color;
1497	int16_t dx, dy;
1498	bool fixup_alpha;
1499
1500	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
1501	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
1502
1503	channel->is_solid = false;
1504	channel->is_linear = false;
1505	channel->is_opaque = false;
1506	channel->is_affine = true;
1507	channel->transform = NULL;
1508	channel->card_format = -1;
1509
1510	if (sna_picture_is_solid(picture, &color))
1511		return gen2_composite_solid_init(sna, channel, color);
1512
1513	if (!gen2_check_repeat(picture)) {
1514		DBG(("%s -- fallback, unhandled repeat %d\n",
1515		     __FUNCTION__, picture->repeat));
1516		return sna_render_picture_fixup(sna, picture, channel,
1517						x, y, w, h, dst_x, dst_y);
1518	}
1519
1520	if (!gen2_check_filter(picture)) {
1521		DBG(("%s -- fallback, unhandled filter %d\n",
1522		     __FUNCTION__, picture->filter));
1523		return sna_render_picture_fixup(sna, picture, channel,
1524						x, y, w, h, dst_x, dst_y);
1525	}
1526
1527	if (picture->pDrawable == NULL) {
1528		int ret;
1529
1530		if (picture->pSourcePict->type == SourcePictTypeLinear)
1531			return gen2_composite_linear_init(sna, picture, channel,
1532							  x, y,
1533							  w, h,
1534							  dst_x, dst_y);
1535
1536		DBG(("%s -- fallback, unhandled source %d\n",
1537		     __FUNCTION__, picture->pSourcePict->type));
1538		ret = -1;
1539		if (!precise)
1540			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
1541								      x, y, w, h, dst_x, dst_y);
1542		if (ret == -1)
1543			ret = sna_render_picture_fixup(sna, picture, channel,
1544						       x, y, w, h, dst_x, dst_y);
1545		return ret;
1546	}
1547
1548	if (picture->alphaMap) {
1549		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
1550		return sna_render_picture_fixup(sna, picture, channel,
1551						x, y, w, h, dst_x, dst_y);
1552	}
1553
1554	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1555	channel->filter = picture->filter;
1556
1557	pixmap = get_drawable_pixmap(picture->pDrawable);
1558	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
1559
1560	x += dx + picture->pDrawable->x;
1561	y += dy + picture->pDrawable->y;
1562
1563	channel->is_affine = sna_transform_is_affine(picture->transform);
1564	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
1565		DBG(("%s: integer translation (%d, %d), removing\n",
1566		     __FUNCTION__, dx, dy));
1567		x += dx;
1568		y += dy;
1569		channel->transform = NULL;
1570		channel->filter = PictFilterNearest;
1571
1572		if (channel->repeat &&
1573		    (x >= 0 &&
1574		     y >= 0 &&
1575		     x + w < pixmap->drawable.width &&
1576		     y + h < pixmap->drawable.height)) {
1577			struct sna_pixmap *priv = sna_pixmap(pixmap);
1578			if (priv && priv->clear) {
1579				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
1580				return gen2_composite_solid_init(sna, channel, priv->clear_color);
1581			}
1582		}
1583	} else
1584		channel->transform = picture->transform;
1585
1586	if (!gen2_check_card_format(sna, picture, channel, x,  y, w ,h, &fixup_alpha))
1587		return sna_render_picture_convert(sna, picture, channel, pixmap,
1588						  x, y, w, h, dst_x, dst_y, fixup_alpha);
1589
1590	channel->pict_format = picture->format;
1591	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
1592		return sna_render_picture_extract(sna, picture, channel,
1593						  x, y, w, h, dst_x, dst_y);
1594
1595	return sna_render_pixmap_bo(sna, channel, pixmap,
1596				    x, y, w, h, dst_x, dst_y);
1597}
1598
1599static bool
1600gen2_composite_set_target(struct sna *sna,
1601			  struct sna_composite_op *op,
1602			  PicturePtr dst,
1603			  int x, int y, int w, int h,
1604			  bool partial)
1605{
1606	BoxRec box;
1607	unsigned hint;
1608
1609	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
1610	op->dst.format = dst->format;
1611	op->dst.width = op->dst.pixmap->drawable.width;
1612	op->dst.height = op->dst.pixmap->drawable.height;
1613
1614	if (w && h) {
1615		box.x1 = x;
1616		box.y1 = y;
1617		box.x2 = x + w;
1618		box.y2 = y + h;
1619	} else
1620		sna_render_picture_extents(dst, &box);
1621
1622	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
1623	if (!partial) {
1624		hint |= IGNORE_DAMAGE;
1625		if (w == op->dst.width && h == op->dst.height)
1626			hint |= REPLACES;
1627	}
1628
1629	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
1630	if (op->dst.bo == NULL)
1631		return false;
1632
1633	if (hint & REPLACES) {
1634		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
1635		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
1636	}
1637
1638	assert((op->dst.bo->pitch & 7) == 0);
1639
1640	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
1641			    &op->dst.x, &op->dst.y);
1642
1643	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
1644	     __FUNCTION__,
1645	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
1646	     op->dst.width, op->dst.height,
1647	     op->dst.bo->pitch,
1648	     op->dst.x, op->dst.y,
1649	     op->damage ? *op->damage : (void *)-1));
1650
1651	assert(op->dst.bo->proxy == NULL);
1652
1653	if (((too_large(op->dst.width, op->dst.height) ||
1654	      op->dst.bo->pitch > MAX_3D_PITCH)) &&
1655	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
1656		return false;
1657
1658	return true;
1659}
1660
1661static bool
1662is_unhandled_gradient(PicturePtr picture, bool precise)
1663{
1664	if (picture->pDrawable)
1665		return false;
1666
1667	switch (picture->pSourcePict->type) {
1668	case SourcePictTypeSolidFill:
1669	case SourcePictTypeLinear:
1670		return false;
1671	default:
1672		return precise;
1673	}
1674}
1675
1676static bool
1677has_alphamap(PicturePtr p)
1678{
1679	return p->alphaMap != NULL;
1680}
1681
1682static bool
1683need_upload(PicturePtr p)
1684{
1685	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
1686}
1687
1688static bool
1689source_is_busy(PixmapPtr pixmap)
1690{
1691	struct sna_pixmap *priv = sna_pixmap(pixmap);
1692	if (priv == NULL)
1693		return false;
1694
1695	if (priv->clear)
1696		return false;
1697
1698	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
1699		return true;
1700
1701	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
1702		return true;
1703
1704	return priv->gpu_damage && !priv->cpu_damage;
1705}
1706
1707static bool
1708source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
1709{
1710	if (sna_picture_is_solid(p, NULL))
1711		return false;
1712
1713	if (is_unhandled_gradient(p, precise) || !gen2_check_repeat(p))
1714		return true;
1715
1716	if (pixmap && source_is_busy(pixmap))
1717		return false;
1718
1719	return has_alphamap(p) || !gen2_check_filter(p) || need_upload(p);
1720}
1721
1722static bool
1723gen2_composite_fallback(struct sna *sna,
1724			PicturePtr src,
1725			PicturePtr mask,
1726			PicturePtr dst)
1727{
1728	PixmapPtr src_pixmap;
1729	PixmapPtr mask_pixmap;
1730	PixmapPtr dst_pixmap;
1731	bool src_fallback, mask_fallback;
1732
1733	if (!gen2_check_dst_format(dst->format)) {
1734		DBG(("%s: unknown destination format: %d\n",
1735		     __FUNCTION__, dst->format));
1736		return true;
1737	}
1738
1739	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
1740
1741	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
1742	src_fallback = source_fallback(src, src_pixmap,
1743				       dst->polyMode == PolyModePrecise);
1744
1745	if (mask) {
1746		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
1747		mask_fallback = source_fallback(mask, mask_pixmap,
1748						dst->polyMode == PolyModePrecise);
1749	} else {
1750		mask_pixmap = NULL;
1751		mask_fallback = NULL;
1752	}
1753
1754	/* If we are using the destination as a source and need to
1755	 * readback in order to upload the source, do it all
1756	 * on the cpu.
1757	 */
1758	if (src_pixmap == dst_pixmap && src_fallback) {
1759		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
1760		return true;
1761	}
1762	if (mask_pixmap == dst_pixmap && mask_fallback) {
1763		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
1764		return true;
1765	}
1766
1767	/* If anything is on the GPU, push everything out to the GPU */
1768	if (dst_use_gpu(dst_pixmap)) {
1769		DBG(("%s: dst is already on the GPU, try to use GPU\n",
1770		     __FUNCTION__));
1771		return false;
1772	}
1773
1774	if (src_pixmap && !src_fallback) {
1775		DBG(("%s: src is already on the GPU, try to use GPU\n",
1776		     __FUNCTION__));
1777		return false;
1778	}
1779	if (mask_pixmap && !mask_fallback) {
1780		DBG(("%s: mask is already on the GPU, try to use GPU\n",
1781		     __FUNCTION__));
1782		return false;
1783	}
1784
1785	/* However if the dst is not on the GPU and we need to
1786	 * render one of the sources using the CPU, we may
1787	 * as well do the entire operation in place onthe CPU.
1788	 */
1789	if (src_fallback) {
1790		DBG(("%s: dst is on the CPU and src will fallback\n",
1791		     __FUNCTION__));
1792		return true;
1793	}
1794
1795	if (mask && mask_fallback) {
1796		DBG(("%s: dst is on the CPU and mask will fallback\n",
1797		     __FUNCTION__));
1798		return true;
1799	}
1800
1801	if (too_large(dst_pixmap->drawable.width,
1802		      dst_pixmap->drawable.height) &&
1803	    dst_is_cpu(dst_pixmap)) {
1804		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
1805		return true;
1806	}
1807
1808	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
1809	     __FUNCTION__));
1810	return dst_use_cpu(dst_pixmap);
1811}
1812
1813static int
1814reuse_source(struct sna *sna,
1815	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
1816	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
1817{
1818	uint32_t color;
1819
1820	if (src_x != msk_x || src_y != msk_y)
1821		return false;
1822
1823	if (sna_picture_is_solid(mask, &color))
1824		return gen2_composite_solid_init(sna, mc, color);
1825
1826	if (sc->is_solid)
1827		return false;
1828
1829	if (src == mask) {
1830		DBG(("%s: mask is source\n", __FUNCTION__));
1831		*mc = *sc;
1832		mc->bo = kgem_bo_reference(mc->bo);
1833		return true;
1834	}
1835
1836	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
1837		return false;
1838
1839	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
1840
1841	if (!sna_transform_equal(src->transform, mask->transform))
1842		return false;
1843
1844	if (!sna_picture_alphamap_equal(src, mask))
1845		return false;
1846
1847	if (!gen2_check_repeat(mask))
1848		return false;
1849
1850	if (!gen2_check_filter(mask))
1851		return false;
1852
1853	if (!gen2_check_format(sna, mask))
1854		return false;
1855
1856	DBG(("%s: reusing source channel for mask with a twist\n",
1857	     __FUNCTION__));
1858
1859	*mc = *sc;
1860	mc->repeat = mask->repeat ? mask->repeatType : RepeatNone;
1861	mc->filter = mask->filter;
1862	mc->pict_format = mask->format;
1863	mc->bo = kgem_bo_reference(mc->bo);
1864	return true;
1865}
1866
1867static bool
1868gen2_render_composite(struct sna *sna,
1869		      uint8_t op,
1870		      PicturePtr src,
1871		      PicturePtr mask,
1872		      PicturePtr dst,
1873		      int16_t src_x,  int16_t src_y,
1874		      int16_t mask_x, int16_t mask_y,
1875		      int16_t dst_x,  int16_t dst_y,
1876		      int16_t width,  int16_t height,
1877		      unsigned flags,
1878		      struct sna_composite_op *tmp)
1879{
1880	DBG(("%s()\n", __FUNCTION__));
1881
1882	if (op >= ARRAY_SIZE(gen2_blend_op)) {
1883		DBG(("%s: fallback due to unhandled blend op: %d\n",
1884		     __FUNCTION__, op));
1885		return false;
1886	}
1887
1888	if (mask == NULL &&
1889	    sna_blt_composite(sna, op, src, dst,
1890			      src_x, src_y,
1891			      dst_x, dst_y,
1892			      width, height,
1893			      flags, tmp))
1894		return true;
1895
1896	if (gen2_composite_fallback(sna, src, mask, dst))
1897		goto fallback;
1898
1899	if (need_tiling(sna, width, height))
1900		return sna_tiling_composite(op, src, mask, dst,
1901					    src_x,  src_y,
1902					    mask_x, mask_y,
1903					    dst_x,  dst_y,
1904					    width,  height,
1905					    tmp);
1906
1907	tmp->op = op;
1908	sna_render_composite_redirect_init(tmp);
1909
1910	if (!gen2_composite_set_target(sna, tmp, dst,
1911				       dst_x, dst_y, width, height,
1912				       flags & COMPOSITE_PARTIAL || op > PictOpSrc)) {
1913		DBG(("%s: unable to set render target\n",
1914		     __FUNCTION__));
1915		goto fallback;
1916	}
1917
1918	switch (gen2_composite_picture(sna, src, &tmp->src,
1919				       src_x, src_y,
1920				       width, height,
1921				       dst_x, dst_y,
1922				       dst->polyMode == PolyModePrecise)) {
1923	case -1:
1924		DBG(("%s: fallback -- unable to prepare source\n",
1925		     __FUNCTION__));
1926		goto cleanup_dst;
1927	case 0:
1928		gen2_composite_solid_init(sna, &tmp->src, 0);
1929		break;
1930	case 1:
1931		if (mask == NULL && tmp->src.bo &&
1932		    sna_blt_composite__convert(sna,
1933					       dst_x, dst_y, width, height,
1934					       tmp))
1935			return true;
1936		break;
1937	}
1938
1939	if (mask) {
1940		if (!reuse_source(sna,
1941				  src, &tmp->src, src_x, src_y,
1942				  mask, &tmp->mask, mask_x, mask_y)) {
1943			switch (gen2_composite_picture(sna, mask, &tmp->mask,
1944						       mask_x, mask_y,
1945						       width,  height,
1946						       dst_x,  dst_y,
1947						       dst->polyMode == PolyModePrecise)) {
1948			case -1:
1949				DBG(("%s: fallback -- unable to prepare mask\n",
1950				     __FUNCTION__));
1951				goto cleanup_src;
1952			case 0:
1953				gen2_composite_solid_init(sna, &tmp->mask, 0);
1954			case 1:
1955				break;
1956			}
1957		}
1958
1959		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
1960			/* Check if it's component alpha that relies on a source alpha
1961			 * and on the source value.  We can only get one of those
1962			 * into the single source value that we get to blend with.
1963			 */
1964			tmp->has_component_alpha = true;
1965			if (gen2_blend_op[op].src_alpha &&
1966			    (gen2_blend_op[op].src_blend != BLENDFACTOR_ZERO)) {
1967				if (op != PictOpOver) {
1968					DBG(("%s: fallback -- unsupported CA blend (src_blend=%d)\n",
1969					     __FUNCTION__,
1970					     gen2_blend_op[op].src_blend));
1971					goto cleanup_src;
1972				}
1973
1974				tmp->need_magic_ca_pass = true;
1975				tmp->op = PictOpOutReverse;
1976			}
1977		}
1978
1979		/* convert solid to a texture (pure convenience) */
1980		if (tmp->mask.is_solid && tmp->src.is_solid) {
1981			assert(tmp->mask.is_affine);
1982			tmp->mask.bo = sna_render_get_solid(sna, tmp->mask.u.gen2.pixel);
1983			if (!tmp->mask.bo)
1984				goto cleanup_src;
1985		}
1986	}
1987
1988	tmp->floats_per_vertex = 2;
1989	if (!tmp->src.is_solid)
1990		tmp->floats_per_vertex += tmp->src.is_affine ? 2 : 3;
1991	if (tmp->mask.bo)
1992		tmp->floats_per_vertex += tmp->mask.is_affine ? 2 : 3;
1993	tmp->floats_per_rect = 3*tmp->floats_per_vertex;
1994
1995	tmp->prim_emit = gen2_emit_composite_primitive;
1996	if (tmp->mask.bo) {
1997		if (tmp->mask.transform == NULL) {
1998			if (tmp->src.is_solid) {
1999				assert(tmp->floats_per_rect == 12);
2000#if defined(sse2) && !defined(__x86_64__)
2001				if (sna->cpu_features & SSE2) {
2002					tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask__sse2;
2003				} else
2004#endif
2005				{
2006					tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask;
2007				}
2008			}
2009		}
2010	} else {
2011		if (tmp->src.is_solid) {
2012			assert(tmp->floats_per_rect == 6);
2013#if defined(sse2) && !defined(__x86_64__)
2014			if (sna->cpu_features & SSE2) {
2015				tmp->prim_emit = gen2_emit_composite_primitive_constant__sse2;
2016			} else
2017#endif
2018			{
2019				tmp->prim_emit = gen2_emit_composite_primitive_constant;
2020			}
2021		} else if (tmp->src.is_linear) {
2022			assert(tmp->floats_per_rect == 12);
2023#if defined(sse2) && !defined(__x86_64__)
2024			if (sna->cpu_features & SSE2) {
2025				tmp->prim_emit = gen2_emit_composite_primitive_linear__sse2;
2026			} else
2027#endif
2028			{
2029				tmp->prim_emit = gen2_emit_composite_primitive_linear;
2030			}
2031		} else if (tmp->src.transform == NULL) {
2032			assert(tmp->floats_per_rect == 12);
2033#if defined(sse2) && !defined(__x86_64__)
2034			if (sna->cpu_features & SSE2) {
2035				tmp->prim_emit = gen2_emit_composite_primitive_identity__sse2;
2036			} else
2037#endif
2038			{
2039				tmp->prim_emit = gen2_emit_composite_primitive_identity;
2040			}
2041		} else if (tmp->src.is_affine) {
2042			assert(tmp->floats_per_rect == 12);
2043			tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
2044			tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
2045#if defined(sse2) && !defined(__x86_64__)
2046			if (sna->cpu_features & SSE2) {
2047				tmp->prim_emit = gen2_emit_composite_primitive_affine__sse2;
2048			} else
2049#endif
2050			{
2051				tmp->prim_emit = gen2_emit_composite_primitive_affine;
2052			}
2053		}
2054	}
2055
2056	tmp->blt   = gen2_render_composite_blt;
2057	tmp->box   = gen2_render_composite_box;
2058	tmp->boxes = gen2_render_composite_boxes;
2059	tmp->done  = gen2_render_composite_done;
2060
2061	if (!kgem_check_bo(&sna->kgem,
2062			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2063			   NULL)) {
2064		kgem_submit(&sna->kgem);
2065		if (!kgem_check_bo(&sna->kgem,
2066				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2067				   NULL)) {
2068			DBG(("%s: fallback, operation does not fit into GTT\n",
2069			     __FUNCTION__));
2070			goto cleanup_mask;
2071		}
2072	}
2073
2074	gen2_emit_composite_state(sna, tmp);
2075	return true;
2076
2077cleanup_mask:
2078	if (tmp->mask.bo) {
2079		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
2080		tmp->mask.bo = NULL;
2081	}
2082cleanup_src:
2083	if (tmp->src.bo) {
2084		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2085		tmp->src.bo = NULL;
2086	}
2087cleanup_dst:
2088	if (tmp->redirect.real_bo) {
2089		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2090		tmp->redirect.real_bo = NULL;
2091	}
2092fallback:
2093	return (mask == NULL &&
2094		sna_blt_composite(sna, op, src, dst,
2095				  src_x, src_y,
2096				  dst_x, dst_y,
2097				  width, height,
2098				  flags | COMPOSITE_FALLBACK, tmp));
2099}
2100
2101fastcall static void
2102gen2_emit_composite_spans_primitive_constant(struct sna *sna,
2103					     const struct sna_composite_spans_op *op,
2104					     const BoxRec *box,
2105					     float opacity)
2106{
2107	float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2108	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2109	sna->kgem.nbatch += 9;
2110
2111	v[0] = op->base.dst.x + box->x2;
2112	v[1] = op->base.dst.y + box->y2;
2113	*((uint32_t *)v + 2) = alpha;
2114
2115	v[3] = op->base.dst.x + box->x1;
2116	v[4] = v[1];
2117	*((uint32_t *)v + 5) = alpha;
2118
2119	v[6] = v[3];
2120	v[7] = op->base.dst.y + box->y1;
2121	*((uint32_t *)v + 8) = alpha;
2122}
2123
2124fastcall static void
2125gen2_emit_composite_spans_primitive_linear(struct sna *sna,
2126					     const struct sna_composite_spans_op *op,
2127					     const BoxRec *box,
2128					     float opacity)
2129{
2130	union {
2131		float f;
2132		uint32_t u;
2133	} alpha;
2134
2135	alpha.u = (uint8_t)(255 * opacity) << 24;
2136
2137	gen2_emit_composite_dstcoord(sna,
2138				     op->base.dst.x + box->x2,
2139				     op->base.dst.y + box->y2);
2140	VERTEX(alpha.f);
2141	gen2_emit_composite_linear(sna, &op->base.src, box->x2, box->y2);
2142
2143	gen2_emit_composite_dstcoord(sna,
2144				     op->base.dst.x + box->x1,
2145				     op->base.dst.y + box->y2);
2146	VERTEX(alpha.f);
2147	gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y2);
2148
2149	gen2_emit_composite_dstcoord(sna,
2150				     op->base.dst.x + box->x1,
2151				     op->base.dst.y + box->y1);
2152	VERTEX(alpha.f);
2153	gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y1);
2154}
2155
2156fastcall static void
2157gen2_emit_composite_spans_primitive_identity_source(struct sna *sna,
2158						    const struct sna_composite_spans_op *op,
2159						    const BoxRec *box,
2160						    float opacity)
2161{
2162	float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2163	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2164	sna->kgem.nbatch += 15;
2165
2166	v[0] = op->base.dst.x + box->x2;
2167	v[1] = op->base.dst.y + box->y2;
2168	*((uint32_t *)v + 2) = alpha;
2169	v[3] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0];
2170	v[4] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1];
2171
2172	v[5] = op->base.dst.x + box->x1;
2173	v[6] = v[1];
2174	*((uint32_t *)v + 7) = alpha;
2175	v[8] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0];
2176	v[9] = v[4];
2177
2178	v[10] = v[5];
2179	v[11] = op->base.dst.y + box->y1;
2180	*((uint32_t *)v + 12) = alpha;
2181	v[13] = v[8];
2182	v[14] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1];
2183}
2184
2185fastcall static void
2186gen2_emit_composite_spans_primitive_affine_source(struct sna *sna,
2187						  const struct sna_composite_spans_op *op,
2188						  const BoxRec *box,
2189						  float opacity)
2190{
2191	PictTransform *transform = op->base.src.transform;
2192	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2193	float *v;
2194
2195	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2196	sna->kgem.nbatch += 15;
2197
2198	v[0]  = op->base.dst.x + box->x2;
2199	v[6]  = v[1] = op->base.dst.y + box->y2;
2200	v[10] = v[5] = op->base.dst.x + box->x1;
2201	v[11] = op->base.dst.y + box->y1;
2202	*((uint32_t *)v + 2) = alpha;
2203	*((uint32_t *)v + 7) = alpha;
2204	*((uint32_t *)v + 12) = alpha;
2205
2206	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
2207				    (int)op->base.src.offset[1] + box->y2,
2208				    transform, op->base.src.scale,
2209				    &v[3], &v[4]);
2210
2211	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
2212				    (int)op->base.src.offset[1] + box->y2,
2213				    transform, op->base.src.scale,
2214				    &v[8], &v[9]);
2215
2216	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
2217				    (int)op->base.src.offset[1] + box->y1,
2218				    transform, op->base.src.scale,
2219				    &v[13], &v[14]);
2220}
2221
2222#if defined(sse2) && !defined(__x86_64__)
2223sse2 fastcall static void
2224gen2_emit_composite_spans_primitive_constant__sse2(struct sna *sna,
2225						   const struct sna_composite_spans_op *op,
2226						   const BoxRec *box,
2227						   float opacity)
2228{
2229	float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2230	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2231	sna->kgem.nbatch += 9;
2232
2233	v[0] = op->base.dst.x + box->x2;
2234	v[1] = op->base.dst.y + box->y2;
2235	*((uint32_t *)v + 2) = alpha;
2236
2237	v[3] = op->base.dst.x + box->x1;
2238	v[4] = v[1];
2239	*((uint32_t *)v + 5) = alpha;
2240
2241	v[6] = v[3];
2242	v[7] = op->base.dst.y + box->y1;
2243	*((uint32_t *)v + 8) = alpha;
2244}
2245
2246sse2 fastcall static void
2247gen2_emit_composite_spans_primitive_linear__sse2(struct sna *sna,
2248						 const struct sna_composite_spans_op *op,
2249						 const BoxRec *box,
2250						 float opacity)
2251{
2252	union {
2253		float f;
2254		uint32_t u;
2255	} alpha;
2256
2257	alpha.u = (uint8_t)(255 * opacity) << 24;
2258
2259	gen2_emit_composite_dstcoord(sna,
2260				     op->base.dst.x + box->x2,
2261				     op->base.dst.y + box->y2);
2262	VERTEX(alpha.f);
2263	gen2_emit_composite_linear(sna, &op->base.src, box->x2, box->y2);
2264
2265	gen2_emit_composite_dstcoord(sna,
2266				     op->base.dst.x + box->x1,
2267				     op->base.dst.y + box->y2);
2268	VERTEX(alpha.f);
2269	gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y2);
2270
2271	gen2_emit_composite_dstcoord(sna,
2272				     op->base.dst.x + box->x1,
2273				     op->base.dst.y + box->y1);
2274	VERTEX(alpha.f);
2275	gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y1);
2276}
2277
2278sse2 fastcall static void
2279gen2_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna,
2280							  const struct sna_composite_spans_op *op,
2281							  const BoxRec *box,
2282							  float opacity)
2283{
2284	float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2285	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2286	sna->kgem.nbatch += 15;
2287
2288	v[0] = op->base.dst.x + box->x2;
2289	v[1] = op->base.dst.y + box->y2;
2290	*((uint32_t *)v + 2) = alpha;
2291	v[3] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0];
2292	v[4] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1];
2293
2294	v[5] = op->base.dst.x + box->x1;
2295	v[6] = v[1];
2296	*((uint32_t *)v + 7) = alpha;
2297	v[8] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0];
2298	v[9] = v[4];
2299
2300	v[10] = v[5];
2301	v[11] = op->base.dst.y + box->y1;
2302	*((uint32_t *)v + 12) = alpha;
2303	v[13] = v[8];
2304	v[14] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1];
2305}
2306
2307sse2 fastcall static void
2308gen2_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna,
2309							const struct sna_composite_spans_op *op,
2310							const BoxRec *box,
2311							float opacity)
2312{
2313	PictTransform *transform = op->base.src.transform;
2314	uint32_t alpha = (uint8_t)(255 * opacity) << 24;
2315	float *v;
2316
2317	v = (float *)sna->kgem.batch + sna->kgem.nbatch;
2318	sna->kgem.nbatch += 15;
2319
2320	v[0]  = op->base.dst.x + box->x2;
2321	v[6]  = v[1] = op->base.dst.y + box->y2;
2322	v[10] = v[5] = op->base.dst.x + box->x1;
2323	v[11] = op->base.dst.y + box->y1;
2324	*((uint32_t *)v + 2) = alpha;
2325	*((uint32_t *)v + 7) = alpha;
2326	*((uint32_t *)v + 12) = alpha;
2327
2328	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
2329				    (int)op->base.src.offset[1] + box->y2,
2330				    transform, op->base.src.scale,
2331				    &v[3], &v[4]);
2332
2333	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
2334				    (int)op->base.src.offset[1] + box->y2,
2335				    transform, op->base.src.scale,
2336				    &v[8], &v[9]);
2337
2338	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
2339				    (int)op->base.src.offset[1] + box->y1,
2340				    transform, op->base.src.scale,
2341				    &v[13], &v[14]);
2342}
2343#endif
2344
2345static void
2346gen2_emit_composite_spans_vertex(struct sna *sna,
2347				 const struct sna_composite_spans_op *op,
2348				 int16_t x, int16_t y,
2349				 float opacity)
2350{
2351	gen2_emit_composite_dstcoord(sna, x + op->base.dst.x, y + op->base.dst.y);
2352	BATCH((uint8_t)(opacity * 255) << 24);
2353	assert(!op->base.src.is_solid);
2354	if (op->base.src.is_linear)
2355		gen2_emit_composite_linear(sna, &op->base.src, x, y);
2356	else
2357		gen2_emit_composite_texcoord(sna, &op->base.src, x, y);
2358}
2359
2360fastcall static void
2361gen2_emit_composite_spans_primitive(struct sna *sna,
2362				    const struct sna_composite_spans_op *op,
2363				    const BoxRec *box,
2364				    float opacity)
2365{
2366	gen2_emit_composite_spans_vertex(sna, op, box->x2, box->y2, opacity);
2367	gen2_emit_composite_spans_vertex(sna, op, box->x1, box->y2, opacity);
2368	gen2_emit_composite_spans_vertex(sna, op, box->x1, box->y1, opacity);
2369}
2370
2371static void
2372gen2_emit_spans_pipeline(struct sna *sna,
2373			 const struct sna_composite_spans_op *op)
2374{
2375	uint32_t cblend, ablend;
2376	uint32_t unwind;
2377
2378	cblend =
2379		TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X | TB0C_OP_MODULATE |
2380		TB0C_ARG1_SEL_DIFFUSE | TB0C_ARG1_REPLICATE_ALPHA |
2381		TB0C_OUTPUT_WRITE_CURRENT;
2382	ablend =
2383		TB0A_RESULT_SCALE_1X | TB0A_OP_MODULATE |
2384		TB0A_ARG1_SEL_DIFFUSE |
2385		TB0A_OUTPUT_WRITE_CURRENT;
2386
2387	if (op->base.src.is_solid) {
2388		ablend |= TB0A_ARG2_SEL_SPECULAR;
2389		cblend |= TB0C_ARG2_SEL_SPECULAR;
2390		if (op->base.dst.format == PICT_a8)
2391			cblend |= TB0C_ARG2_REPLICATE_ALPHA;
2392	} else if (op->base.dst.format == PICT_a8) {
2393		ablend |= TB0A_ARG2_SEL_TEXEL0;
2394		cblend |= TB0C_ARG2_SEL_TEXEL0 | TB0C_ARG2_REPLICATE_ALPHA;
2395	} else {
2396		if (PICT_FORMAT_RGB(op->base.src.pict_format) != 0)
2397			cblend |= TB0C_ARG2_SEL_TEXEL0;
2398		else
2399			cblend |= TB0C_ARG2_SEL_ONE | TB0C_ARG2_INVERT;
2400
2401		if (op->base.src.is_opaque)
2402			ablend |= TB0A_ARG2_SEL_ONE;
2403		else
2404			ablend |= TB0A_ARG2_SEL_TEXEL0;
2405	}
2406
2407	unwind = sna->kgem.nbatch;
2408	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
2409	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
2410	BATCH(cblend);
2411	BATCH(ablend);
2412	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls2 + 1,
2413		   sna->kgem.batch + unwind + 1,
2414		   2 * sizeof(uint32_t)) == 0)
2415		sna->kgem.nbatch = unwind;
2416	else
2417		sna->render_state.gen2.ls2 = unwind;
2418}
2419
2420static void gen2_emit_composite_spans_state(struct sna *sna,
2421					    const struct sna_composite_spans_op *op)
2422{
2423	uint32_t unwind;
2424
2425	gen2_get_batch(sna, &op->base);
2426	gen2_emit_target(sna, &op->base);
2427
2428	unwind = sna->kgem.nbatch;
2429	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
2430	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
2431	BATCH(!op->base.src.is_solid << 12);
2432	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY | S3_DIFFUSE_PRESENT);
2433	BATCH(gen2_get_blend_cntl(op->base.op, false, op->base.dst.format));
2434	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
2435		   sna->kgem.batch + unwind + 1,
2436		   3 * sizeof(uint32_t)) == 0)
2437		sna->kgem.nbatch = unwind;
2438	else
2439		sna->render_state.gen2.ls1 = unwind;
2440
2441	gen2_disable_logic_op(sna);
2442	gen2_emit_spans_pipeline(sna, op);
2443
2444	if (op->base.src.is_solid) {
2445		if (op->base.src.u.gen2.pixel != sna->render_state.gen2.specular) {
2446			BATCH(_3DSTATE_DFLT_SPECULAR_CMD);
2447			BATCH(op->base.src.u.gen2.pixel);
2448			sna->render_state.gen2.specular = op->base.src.u.gen2.pixel;
2449		}
2450	} else {
2451		uint32_t v =_3DSTATE_VERTEX_FORMAT_2_CMD |
2452			(op->base.src.is_affine ? TEXCOORDFMT_2D : TEXCOORDFMT_3D);
2453		if (sna->render_state.gen2.vft != v) {
2454			BATCH(v);
2455			sna->render_state.gen2.vft = v;
2456		}
2457		gen2_emit_texture(sna, &op->base.src, 0);
2458	}
2459}
2460
2461fastcall static void
2462gen2_render_composite_spans_box(struct sna *sna,
2463				const struct sna_composite_spans_op *op,
2464				const BoxRec *box, float opacity)
2465{
2466	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2467	     __FUNCTION__,
2468	     op->base.src.offset[0], op->base.src.offset[1],
2469	     opacity,
2470	     op->base.dst.x, op->base.dst.y,
2471	     box->x1, box->y1,
2472	     box->x2 - box->x1,
2473	     box->y2 - box->y1));
2474
2475	if (gen2_get_rectangles(sna, &op->base, 1) == 0) {
2476		gen2_emit_composite_spans_state(sna, op);
2477		gen2_get_rectangles(sna, &op->base, 1);
2478	}
2479
2480	op->prim_emit(sna, op, box, opacity);
2481}
2482
2483static void
2484gen2_render_composite_spans_boxes(struct sna *sna,
2485				  const struct sna_composite_spans_op *op,
2486				  const BoxRec *box, int nbox,
2487				  float opacity)
2488{
2489	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2490	     __FUNCTION__, nbox,
2491	     op->base.src.offset[0], op->base.src.offset[1],
2492	     opacity,
2493	     op->base.dst.x, op->base.dst.y));
2494
2495	do {
2496		int nbox_this_time;
2497
2498		nbox_this_time = gen2_get_rectangles(sna, &op->base, nbox);
2499		if (nbox_this_time == 0) {
2500			gen2_emit_composite_spans_state(sna, op);
2501			nbox_this_time = gen2_get_rectangles(sna, &op->base, nbox);
2502		}
2503		nbox -= nbox_this_time;
2504
2505		do {
2506			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2507			     box->x1, box->y1,
2508			     box->x2 - box->x1,
2509			     box->y2 - box->y1));
2510
2511			op->prim_emit(sna, op, box++, opacity);
2512		} while (--nbox_this_time);
2513	} while (nbox);
2514}
2515
2516fastcall static void
2517gen2_render_composite_spans_done(struct sna *sna,
2518				 const struct sna_composite_spans_op *op)
2519{
2520	DBG(("%s()\n", __FUNCTION__));
2521
2522	gen2_vertex_flush(sna, &op->base);
2523
2524	if (op->base.src.bo)
2525		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2526
2527	sna_render_composite_redirect_done(sna, &op->base);
2528}
2529
2530static bool
2531gen2_check_composite_spans(struct sna *sna,
2532			   uint8_t op, PicturePtr src, PicturePtr dst,
2533			   int16_t width, int16_t height, unsigned flags)
2534{
2535	if (op >= ARRAY_SIZE(gen2_blend_op))
2536		return false;
2537
2538	if (gen2_composite_fallback(sna, src, NULL, dst))
2539		return false;
2540
2541	if (need_tiling(sna, width, height)) {
2542		if (!is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2543			DBG(("%s: fallback, tiled operation not on GPU\n",
2544			     __FUNCTION__));
2545			return false;
2546		}
2547	}
2548
2549	return true;
2550}
2551
2552static bool
2553gen2_render_composite_spans(struct sna *sna,
2554			    uint8_t op,
2555			    PicturePtr src,
2556			    PicturePtr dst,
2557			    int16_t src_x,  int16_t src_y,
2558			    int16_t dst_x,  int16_t dst_y,
2559			    int16_t width,  int16_t height,
2560			    unsigned flags,
2561			    struct sna_composite_spans_op *tmp)
2562{
2563	DBG(("%s(src=(%d, %d), dst=(%d, %d), size=(%d, %d))\n", __FUNCTION__,
2564	     src_x, src_y, dst_x, dst_y, width, height));
2565
2566	assert(gen2_check_composite_spans(sna, op, src, dst, width, height, flags));
2567	if (need_tiling(sna, width, height)) {
2568		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2569		     __FUNCTION__, width, height));
2570		return sna_tiling_composite_spans(op, src, dst,
2571						  src_x, src_y, dst_x, dst_y,
2572						  width, height, flags, tmp);
2573	}
2574
2575	tmp->base.op = op;
2576	sna_render_composite_redirect_init(&tmp->base);
2577	if (!gen2_composite_set_target(sna, &tmp->base, dst,
2578				       dst_x, dst_y, width, height,
2579				       true)) {
2580		DBG(("%s: unable to set render target\n",
2581		     __FUNCTION__));
2582		return false;
2583	}
2584
2585	switch (gen2_composite_picture(sna, src, &tmp->base.src,
2586				       src_x, src_y,
2587				       width, height,
2588				       dst_x, dst_y,
2589				       dst->polyMode == PolyModePrecise)) {
2590	case -1:
2591		goto cleanup_dst;
2592	case 0:
2593		gen2_composite_solid_init(sna, &tmp->base.src, 0);
2594	case 1:
2595		break;
2596	}
2597	assert(tmp->base.src.bo || tmp->base.src.is_solid);
2598
2599	tmp->prim_emit = gen2_emit_composite_spans_primitive;
2600	tmp->base.floats_per_vertex = 3;
2601	if (tmp->base.src.is_solid) {
2602#if defined(sse2) && !defined(__x86_64__)
2603		if (sna->cpu_features & SSE2) {
2604			tmp->prim_emit = gen2_emit_composite_spans_primitive_constant__sse2;
2605		} else
2606#endif
2607		{
2608			tmp->prim_emit = gen2_emit_composite_spans_primitive_constant;
2609		}
2610	} else if (tmp->base.src.is_linear) {
2611		tmp->base.floats_per_vertex += 2;
2612#if defined(sse2) && !defined(__x86_64__)
2613		if (sna->cpu_features & SSE2) {
2614			tmp->prim_emit = gen2_emit_composite_spans_primitive_linear__sse2;
2615		} else
2616#endif
2617		{
2618			tmp->prim_emit = gen2_emit_composite_spans_primitive_linear;
2619		}
2620	} else {
2621		assert(tmp->base.src.bo);
2622		tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3;
2623		if (tmp->base.src.transform == NULL) {
2624#if defined(sse2) && !defined(__x86_64__)
2625			if (sna->cpu_features & SSE2) {
2626				tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source__sse2;
2627			} else
2628#endif
2629			{
2630				tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source;
2631			}
2632		} else if (tmp->base.src.is_affine) {
2633			tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
2634			tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
2635#if defined(sse2) && !defined(__x86_64__)
2636			if (sna->cpu_features & SSE2) {
2637				tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source__sse2;
2638			} else
2639#endif
2640			{
2641				tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source;
2642			}
2643		}
2644	}
2645	tmp->base.mask.bo = NULL;
2646	tmp->base.floats_per_rect = 3*tmp->base.floats_per_vertex;
2647
2648	tmp->box   = gen2_render_composite_spans_box;
2649	tmp->boxes = gen2_render_composite_spans_boxes;
2650	tmp->done  = gen2_render_composite_spans_done;
2651
2652	if (!kgem_check_bo(&sna->kgem,
2653			   tmp->base.dst.bo, tmp->base.src.bo,
2654			   NULL)) {
2655		kgem_submit(&sna->kgem);
2656		if (!kgem_check_bo(&sna->kgem,
2657				   tmp->base.dst.bo, tmp->base.src.bo,
2658				   NULL))
2659			goto cleanup_src;
2660	}
2661
2662	gen2_emit_composite_spans_state(sna, tmp);
2663	return true;
2664
2665cleanup_src:
2666	if (tmp->base.src.bo)
2667		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2668cleanup_dst:
2669	if (tmp->base.redirect.real_bo)
2670		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2671	return false;
2672}
2673
2674static void
2675gen2_emit_fill_pipeline(struct sna *sna, const struct sna_composite_op *op)
2676{
2677	uint32_t blend, unwind;
2678
2679	unwind = sna->kgem.nbatch;
2680	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
2681	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
2682
2683	blend = TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X | TB0C_OP_ARG1 |
2684		TB0C_ARG1_SEL_DIFFUSE |
2685		TB0C_OUTPUT_WRITE_CURRENT;
2686	if (op->dst.format == PICT_a8)
2687		blend |= TB0C_ARG1_REPLICATE_ALPHA;
2688	BATCH(blend);
2689
2690	BATCH(TB0A_RESULT_SCALE_1X | TB0A_OP_ARG1 |
2691	      TB0A_ARG1_SEL_DIFFUSE |
2692	      TB0A_OUTPUT_WRITE_CURRENT);
2693
2694	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls2 + 1,
2695		   sna->kgem.batch + unwind + 1,
2696		   2 * sizeof(uint32_t)) == 0)
2697		sna->kgem.nbatch = unwind;
2698	else
2699		sna->render_state.gen2.ls2 = unwind;
2700}
2701
2702static void gen2_emit_fill_composite_state(struct sna *sna,
2703					   const struct sna_composite_op *op,
2704					   uint32_t pixel)
2705{
2706	uint32_t ls1;
2707
2708	gen2_get_batch(sna, op);
2709	gen2_emit_target(sna, op);
2710
2711	ls1 = sna->kgem.nbatch;
2712	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
2713	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
2714	BATCH(0);
2715	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
2716	BATCH(gen2_get_blend_cntl(op->op, false, op->dst.format));
2717	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
2718		   sna->kgem.batch + ls1 + 1,
2719		   3 * sizeof(uint32_t)) == 0)
2720		sna->kgem.nbatch = ls1;
2721	else
2722		sna->render_state.gen2.ls1 = ls1;
2723
2724	gen2_emit_fill_pipeline(sna, op);
2725
2726	if (pixel != sna->render_state.gen2.diffuse) {
2727		BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
2728		BATCH(pixel);
2729		sna->render_state.gen2.diffuse = pixel;
2730	}
2731}
2732
2733static bool
2734gen2_render_fill_boxes_try_blt(struct sna *sna,
2735			       CARD8 op, PictFormat format,
2736			       const xRenderColor *color,
2737			       const DrawableRec *dst, struct kgem_bo *dst_bo,
2738			       const BoxRec *box, int n)
2739{
2740	uint8_t alu;
2741	uint32_t pixel;
2742
2743	if (op > PictOpSrc)
2744		return false;
2745
2746	if (op == PictOpClear) {
2747		alu = GXclear;
2748		pixel = 0;
2749	} else if (!sna_get_pixel_from_rgba(&pixel,
2750					    color->red,
2751					    color->green,
2752					    color->blue,
2753					    color->alpha,
2754					    format))
2755		return false;
2756	else
2757		alu = GXcopy;
2758
2759	return sna_blt_fill_boxes(sna, alu,
2760				  dst_bo, dst->bitsPerPixel,
2761				  pixel, box, n);
2762}
2763
2764static bool
2765gen2_render_fill_boxes(struct sna *sna,
2766		       CARD8 op,
2767		       PictFormat format,
2768		       const xRenderColor *color,
2769		       const DrawableRec *dst, struct kgem_bo *dst_bo,
2770		       const BoxRec *box, int n)
2771{
2772	struct sna_composite_op tmp;
2773	uint32_t pixel;
2774
2775	if (op >= ARRAY_SIZE(gen2_blend_op)) {
2776		DBG(("%s: fallback due to unhandled blend op: %d\n",
2777		     __FUNCTION__, op));
2778		return false;
2779	}
2780
2781#if NO_FILL_BOXES
2782	return gen2_render_fill_boxes_try_blt(sna, op, format, color,
2783					      dst, dst_bo,
2784					      box, n);
2785#endif
2786	if (gen2_render_fill_boxes_try_blt(sna, op, format, color,
2787					   dst, dst_bo,
2788					   box, n))
2789		return true;
2790
2791
2792	DBG(("%s (op=%d, format=%x, color=(%04x,%04x,%04x, %04x))\n",
2793	     __FUNCTION__, op, (int)format,
2794	     color->red, color->green, color->blue, color->alpha));
2795
2796	if (too_large(dst->width, dst->height) ||
2797	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH ||
2798	    !gen2_check_dst_format(format)) {
2799		DBG(("%s: try blt, too large or incompatible destination\n",
2800		     __FUNCTION__));
2801		if (!gen2_check_dst_format(format))
2802			return false;
2803
2804		assert(dst_bo->pitch >= 8);
2805		return sna_tiling_fill_boxes(sna, op, format, color,
2806					     dst, dst_bo, box, n);
2807	}
2808
2809	if (op == PictOpClear)
2810		pixel = 0;
2811	else if (!sna_get_pixel_from_rgba(&pixel,
2812					  color->red,
2813					  color->green,
2814					  color->blue,
2815					  color->alpha,
2816					  PICT_a8r8g8b8))
2817		return false;
2818
2819	DBG(("%s: using shader for op=%d, format=%x, pixel=%x\n",
2820	     __FUNCTION__, op, (int)format, pixel));
2821
2822	memset(&tmp, 0, sizeof(tmp));
2823	tmp.op = op;
2824	tmp.dst.pixmap = (PixmapPtr)dst;
2825	tmp.dst.width = dst->width;
2826	tmp.dst.height = dst->height;
2827	tmp.dst.format = format;
2828	tmp.dst.bo = dst_bo;
2829	tmp.floats_per_vertex = 2;
2830	tmp.floats_per_rect = 6;
2831
2832	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2833		kgem_submit(&sna->kgem);
2834		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
2835			return false;
2836	}
2837
2838	gen2_emit_fill_composite_state(sna, &tmp, pixel);
2839
2840	do {
2841		int n_this_time = gen2_get_rectangles(sna, &tmp, n);
2842		if (n_this_time == 0) {
2843			gen2_emit_fill_composite_state(sna, &tmp, pixel);
2844			n_this_time = gen2_get_rectangles(sna, &tmp, n);
2845		}
2846		n -= n_this_time;
2847
2848		do {
2849			DBG(("	(%d, %d), (%d, %d): %x\n",
2850			     box->x1, box->y1, box->x2, box->y2, pixel));
2851			VERTEX(box->x2);
2852			VERTEX(box->y2);
2853			VERTEX(box->x1);
2854			VERTEX(box->y2);
2855			VERTEX(box->x1);
2856			VERTEX(box->y1);
2857			box++;
2858		} while (--n_this_time);
2859	} while (n);
2860
2861	gen2_vertex_flush(sna, &tmp);
2862	return true;
2863}
2864
2865static void gen2_emit_fill_state(struct sna *sna,
2866				 const struct sna_composite_op *op)
2867{
2868	uint32_t ls1;
2869
2870	gen2_get_batch(sna, op);
2871	gen2_emit_target(sna, op);
2872
2873	ls1 = sna->kgem.nbatch;
2874	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
2875	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
2876	BATCH(0);
2877	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
2878	BATCH(S8_ENABLE_COLOR_BUFFER_WRITE);
2879	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
2880		   sna->kgem.batch + ls1 + 1,
2881		   3 * sizeof(uint32_t)) == 0)
2882		sna->kgem.nbatch = ls1;
2883	else
2884		sna->render_state.gen2.ls1 = ls1;
2885
2886	gen2_enable_logic_op(sna, op->op);
2887	gen2_emit_fill_pipeline(sna, op);
2888
2889	if (op->src.u.gen2.pixel != sna->render_state.gen2.diffuse) {
2890		BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
2891		BATCH(op->src.u.gen2.pixel);
2892		sna->render_state.gen2.diffuse = op->src.u.gen2.pixel;
2893	}
2894}
2895
2896static void
2897gen2_render_fill_op_blt(struct sna *sna,
2898			const struct sna_fill_op *op,
2899			int16_t x, int16_t y, int16_t w, int16_t h)
2900{
2901	if (!gen2_get_rectangles(sna, &op->base, 1)) {
2902		gen2_emit_fill_state(sna, &op->base);
2903		gen2_get_rectangles(sna, &op->base, 1);
2904	}
2905
2906	VERTEX(x+w);
2907	VERTEX(y+h);
2908	VERTEX(x);
2909	VERTEX(y+h);
2910	VERTEX(x);
2911	VERTEX(y);
2912}
2913
2914fastcall static void
2915gen2_render_fill_op_box(struct sna *sna,
2916			const struct sna_fill_op *op,
2917			const BoxRec *box)
2918{
2919	if (!gen2_get_rectangles(sna, &op->base, 1)) {
2920		gen2_emit_fill_state(sna, &op->base);
2921		gen2_get_rectangles(sna, &op->base, 1);
2922	}
2923
2924	VERTEX(box->x2);
2925	VERTEX(box->y2);
2926	VERTEX(box->x1);
2927	VERTEX(box->y2);
2928	VERTEX(box->x1);
2929	VERTEX(box->y1);
2930}
2931
2932fastcall static void
2933gen2_render_fill_op_boxes(struct sna *sna,
2934			  const struct sna_fill_op *op,
2935			  const BoxRec *box,
2936			  int nbox)
2937{
2938	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
2939	     box->x1, box->y1, box->x2, box->y2, nbox));
2940
2941	do {
2942		int nbox_this_time = gen2_get_rectangles(sna, &op->base, nbox);
2943		if (nbox_this_time == 0) {
2944			gen2_emit_fill_state(sna, &op->base);
2945			nbox_this_time = gen2_get_rectangles(sna, &op->base, nbox);
2946		}
2947		nbox -= nbox_this_time;
2948
2949		do {
2950			VERTEX(box->x2);
2951			VERTEX(box->y2);
2952			VERTEX(box->x1);
2953			VERTEX(box->y2);
2954			VERTEX(box->x1);
2955			VERTEX(box->y1);
2956			box++;
2957		} while (--nbox_this_time);
2958	} while (nbox);
2959}
2960
2961static void
2962gen2_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
2963{
2964	gen2_vertex_flush(sna, &op->base);
2965}
2966
2967static bool
2968gen2_render_fill(struct sna *sna, uint8_t alu,
2969		 PixmapPtr dst, struct kgem_bo *dst_bo,
2970		 uint32_t color, unsigned flags,
2971		 struct sna_fill_op *tmp)
2972{
2973#if NO_FILL
2974	return sna_blt_fill(sna, alu,
2975			    dst_bo, dst->drawable.bitsPerPixel,
2976			    color,
2977			    tmp);
2978#endif
2979
2980	/* Prefer to use the BLT if already engaged */
2981	if (sna_blt_fill(sna, alu,
2982			 dst_bo, dst->drawable.bitsPerPixel,
2983			 color,
2984			 tmp))
2985		return true;
2986
2987	/* Must use the BLT if we can't RENDER... */
2988	if (too_large(dst->drawable.width, dst->drawable.height) ||
2989	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
2990		return false;
2991
2992	tmp->base.op = alu;
2993	tmp->base.dst.pixmap = dst;
2994	tmp->base.dst.width = dst->drawable.width;
2995	tmp->base.dst.height = dst->drawable.height;
2996	tmp->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2997	tmp->base.dst.bo = dst_bo;
2998	tmp->base.dst.x = tmp->base.dst.y = 0;
2999	tmp->base.floats_per_vertex = 2;
3000	tmp->base.floats_per_rect = 6;
3001
3002	tmp->base.src.u.gen2.pixel =
3003		sna_rgba_for_color(color, dst->drawable.depth);
3004
3005	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3006		kgem_submit(&sna->kgem);
3007		return sna_blt_fill(sna, alu,
3008				    dst_bo, dst->drawable.bitsPerPixel,
3009				    color,
3010				    tmp);
3011	}
3012
3013	tmp->blt   = gen2_render_fill_op_blt;
3014	tmp->box   = gen2_render_fill_op_box;
3015	tmp->boxes = gen2_render_fill_op_boxes;
3016	tmp->points = NULL;
3017	tmp->done  = gen2_render_fill_op_done;
3018
3019	gen2_emit_fill_state(sna, &tmp->base);
3020	return true;
3021}
3022
3023static bool
3024gen2_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3025			     uint32_t color,
3026			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
3027			     uint8_t alu)
3028{
3029	BoxRec box;
3030
3031	box.x1 = x1;
3032	box.y1 = y1;
3033	box.x2 = x2;
3034	box.y2 = y2;
3035
3036	return sna_blt_fill_boxes(sna, alu,
3037				  bo, dst->drawable.bitsPerPixel,
3038				  color, &box, 1);
3039}
3040
3041static bool
3042gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3043		     uint32_t color,
3044		     int16_t x1, int16_t y1,
3045		     int16_t x2, int16_t y2,
3046		     uint8_t alu)
3047{
3048	struct sna_composite_op tmp;
3049
3050#if NO_FILL_ONE
3051	return gen2_render_fill_one_try_blt(sna, dst, bo, color,
3052					    x1, y1, x2, y2, alu);
3053#endif
3054
3055	/* Prefer to use the BLT if already engaged */
3056	if (gen2_render_fill_one_try_blt(sna, dst, bo, color,
3057					 x1, y1, x2, y2, alu))
3058		return true;
3059
3060	/* Must use the BLT if we can't RENDER... */
3061	if (too_large(dst->drawable.width, dst->drawable.height) ||
3062	    bo->pitch < 8 || bo->pitch > MAX_3D_PITCH)
3063		return false;
3064
3065	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3066		kgem_submit(&sna->kgem);
3067
3068		if (gen2_render_fill_one_try_blt(sna, dst, bo, color,
3069						 x1, y1, x2, y2, alu))
3070			return true;
3071
3072		if (!kgem_check_bo(&sna->kgem, bo, NULL))
3073			return false;
3074	}
3075
3076	tmp.op = alu;
3077	tmp.dst.pixmap = dst;
3078	tmp.dst.width = dst->drawable.width;
3079	tmp.dst.height = dst->drawable.height;
3080	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3081	tmp.dst.bo = bo;
3082	tmp.floats_per_vertex = 2;
3083	tmp.floats_per_rect = 6;
3084	tmp.need_magic_ca_pass = false;
3085
3086	tmp.src.u.gen2.pixel =
3087		sna_rgba_for_color(color, dst->drawable.depth);
3088
3089	gen2_emit_fill_state(sna, &tmp);
3090	gen2_get_rectangles(sna, &tmp, 1);
3091	DBG(("%s: (%d, %d), (%d, %d): %x\n", __FUNCTION__,
3092	     x1, y1, x2, y2, tmp.src.u.gen2.pixel));
3093	VERTEX(x2);
3094	VERTEX(y2);
3095	VERTEX(x1);
3096	VERTEX(y2);
3097	VERTEX(x1);
3098	VERTEX(y1);
3099	gen2_vertex_flush(sna, &tmp);
3100
3101	return true;
3102}
3103
3104static void
3105gen2_render_copy_setup_source(struct sna_composite_channel *channel,
3106			      const DrawableRec *draw,
3107			      struct kgem_bo *bo)
3108{
3109	assert(draw->width && draw->height);
3110
3111	channel->filter = PictFilterNearest;
3112	channel->repeat = RepeatNone;
3113	channel->width  = draw->width;
3114	channel->height = draw->height;
3115	channel->scale[0] = 1.f/draw->width;
3116	channel->scale[1] = 1.f/draw->height;
3117	channel->offset[0] = 0;
3118	channel->offset[1] = 0;
3119	channel->pict_format = sna_format_for_depth(draw->depth);
3120	channel->bo = bo;
3121	channel->is_affine = 1;
3122
3123	DBG(("%s: source=%d, (%dx%d), format=%08x\n",
3124	     __FUNCTION__, bo->handle,
3125	     channel->width, channel->height,
3126	     channel->pict_format));
3127}
3128
3129static void
3130gen2_emit_copy_pipeline(struct sna *sna, const struct sna_composite_op *op)
3131{
3132	uint32_t blend, unwind;
3133
3134	unwind = sna->kgem.nbatch;
3135	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
3136	      LOAD_TEXTURE_BLEND_STAGE(0) | 1);
3137
3138	blend = TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X | TB0C_OP_ARG1 |
3139		TB0C_OUTPUT_WRITE_CURRENT;
3140	if (op->dst.format == PICT_a8)
3141		blend |= TB0C_ARG1_REPLICATE_ALPHA | TB0C_ARG1_SEL_TEXEL0;
3142	else if (PICT_FORMAT_RGB(op->src.pict_format) != 0)
3143		blend |= TB0C_ARG1_SEL_TEXEL0;
3144	else
3145		blend |= TB0C_ARG1_SEL_ONE | TB0C_ARG1_INVERT;	/* 0.0 */
3146	BATCH(blend);
3147
3148	blend = TB0A_RESULT_SCALE_1X | TB0A_OP_ARG1 |
3149		TB0A_OUTPUT_WRITE_CURRENT;
3150	if (PICT_FORMAT_A(op->src.pict_format) == 0)
3151		blend |= TB0A_ARG1_SEL_ONE;
3152	else
3153		blend |= TB0A_ARG1_SEL_TEXEL0;
3154	BATCH(blend);
3155
3156	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls2 + 1,
3157		   sna->kgem.batch + unwind + 1,
3158		   2 * sizeof(uint32_t)) == 0)
3159		sna->kgem.nbatch = unwind;
3160	else
3161		sna->render_state.gen2.ls2 = unwind;
3162}
3163
3164static void gen2_emit_copy_state(struct sna *sna, const struct sna_composite_op *op)
3165{
3166	uint32_t ls1, v;
3167
3168	gen2_get_batch(sna, op);
3169
3170	if (kgem_bo_is_dirty(op->src.bo)) {
3171		if (op->src.bo == op->dst.bo)
3172			BATCH(MI_FLUSH | MI_INVALIDATE_MAP_CACHE);
3173		else
3174			BATCH(_3DSTATE_MODES_5_CMD |
3175			      PIPELINE_FLUSH_RENDER_CACHE |
3176			      PIPELINE_FLUSH_TEXTURE_CACHE);
3177		kgem_clear_dirty(&sna->kgem);
3178	}
3179	gen2_emit_target(sna, op);
3180
3181	ls1 = sna->kgem.nbatch;
3182	BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
3183	      I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
3184	BATCH(1<<12);
3185	BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
3186	BATCH(S8_ENABLE_COLOR_BUFFER_WRITE);
3187	if (memcmp(sna->kgem.batch + sna->render_state.gen2.ls1 + 1,
3188		   sna->kgem.batch + ls1 + 1,
3189		   3 * sizeof(uint32_t)) == 0)
3190		sna->kgem.nbatch = ls1;
3191	else
3192		sna->render_state.gen2.ls1 = ls1;
3193
3194	gen2_enable_logic_op(sna, op->op);
3195	gen2_emit_copy_pipeline(sna, op);
3196
3197	v = _3DSTATE_VERTEX_FORMAT_2_CMD | TEXCOORDFMT_2D;
3198	if (sna->render_state.gen2.vft != v) {
3199		BATCH(v);
3200		sna->render_state.gen2.vft = v;
3201	}
3202
3203	gen2_emit_texture(sna, &op->src, 0);
3204}
3205
3206static bool
3207gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
3208		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
3209		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
3210		       const BoxRec *box, int n, unsigned flags)
3211{
3212	struct sna_composite_op tmp;
3213
3214#if NO_COPY_BOXES
3215	if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
3216		return false;
3217
3218	return sna_blt_copy_boxes(sna, alu,
3219				  src_bo, src_dx, src_dy,
3220				  dst_bo, dst_dx, dst_dy,
3221				  dst->drawable.bitsPerPixel,
3222				  box, n);
3223#endif
3224
3225	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
3226	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));
3227
3228	if (sna_blt_compare_depth(src, dst) &&
3229	    sna_blt_copy_boxes(sna, alu,
3230			       src_bo, src_dx, src_dy,
3231			       dst_bo, dst_dx, dst_dy,
3232			       dst->bitsPerPixel,
3233			       box, n))
3234		return true;
3235
3236	if (src_bo == dst_bo || /* XXX handle overlap using 3D ? */
3237	    too_large(src->width, src->height) ||
3238	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch < 8) {
3239fallback:
3240		return sna_blt_copy_boxes_fallback(sna, alu,
3241						   src, src_bo, src_dx, src_dy,
3242						   dst, dst_bo, dst_dx, dst_dy,
3243						   box, n);
3244	}
3245
3246	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
3247		kgem_submit(&sna->kgem);
3248		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
3249			goto fallback;
3250	}
3251
3252	assert(dst_bo->pitch >= 8);
3253
3254	memset(&tmp, 0, sizeof(tmp));
3255	tmp.op = alu;
3256
3257	tmp.dst.pixmap = (PixmapPtr)dst;
3258	tmp.dst.width = dst->width;
3259	tmp.dst.height = dst->height;
3260	tmp.dst.format = sna_format_for_depth(dst->depth);
3261	tmp.dst.bo = dst_bo;
3262	tmp.dst.x = tmp.dst.y = 0;
3263	tmp.damage = NULL;
3264
3265	DBG(("%s: target=%d, format=%08x, size=%dx%d\n",
3266	     __FUNCTION__, dst_bo->handle,
3267	     (unsigned)tmp.dst.format,
3268	     tmp.dst.width,
3269	     tmp.dst.height));
3270
3271	sna_render_composite_redirect_init(&tmp);
3272	if (too_large(tmp.dst.width, tmp.dst.height) ||
3273	    dst_bo->pitch > MAX_3D_PITCH) {
3274		BoxRec extents = box[0];
3275		int i;
3276
3277		for (i = 1; i < n; i++) {
3278			if (box[i].x1 < extents.x1)
3279				extents.x1 = box[i].x1;
3280			if (box[i].y1 < extents.y1)
3281				extents.y1 = box[i].y1;
3282
3283			if (box[i].x2 > extents.x2)
3284				extents.x2 = box[i].x2;
3285			if (box[i].y2 > extents.y2)
3286				extents.y2 = box[i].y2;
3287		}
3288		if (!sna_render_composite_redirect(sna, &tmp,
3289						   extents.x1 + dst_dx,
3290						   extents.y1 + dst_dy,
3291						   extents.x2 - extents.x1,
3292						   extents.y2 - extents.y1,
3293						   alu != GXcopy || n > 1))
3294			goto fallback_tiled;
3295	}
3296
3297	tmp.floats_per_vertex = 4;
3298	tmp.floats_per_rect = 12;
3299
3300	dst_dx += tmp.dst.x;
3301	dst_dy += tmp.dst.y;
3302	tmp.dst.x = tmp.dst.y = 0;
3303
3304	gen2_render_copy_setup_source(&tmp.src, src, src_bo);
3305	gen2_emit_copy_state(sna, &tmp);
3306	do {
3307		int n_this_time;
3308
3309		n_this_time = gen2_get_rectangles(sna, &tmp, n);
3310		if (n_this_time == 0) {
3311			gen2_emit_copy_state(sna, &tmp);
3312			n_this_time = gen2_get_rectangles(sna, &tmp, n);
3313		}
3314		n -= n_this_time;
3315
3316		do {
3317			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
3318			     box->x1 + src_dx, box->y1 + src_dy,
3319			     box->x1 + dst_dx, box->y1 + dst_dy,
3320			     box->x2 - box->x1, box->y2 - box->y1));
3321			VERTEX(box->x2 + dst_dx);
3322			VERTEX(box->y2 + dst_dy);
3323			VERTEX((box->x2 + src_dx) * tmp.src.scale[0]);
3324			VERTEX((box->y2 + src_dy) * tmp.src.scale[1]);
3325
3326			VERTEX(box->x1 + dst_dx);
3327			VERTEX(box->y2 + dst_dy);
3328			VERTEX((box->x1 + src_dx) * tmp.src.scale[0]);
3329			VERTEX((box->y2 + src_dy) * tmp.src.scale[1]);
3330
3331			VERTEX(box->x1 + dst_dx);
3332			VERTEX(box->y1 + dst_dy);
3333			VERTEX((box->x1 + src_dx) * tmp.src.scale[0]);
3334			VERTEX((box->y1 + src_dy) * tmp.src.scale[1]);
3335
3336			box++;
3337		} while (--n_this_time);
3338	} while (n);
3339
3340	gen2_vertex_flush(sna, &tmp);
3341	sna_render_composite_redirect_done(sna, &tmp);
3342	return true;
3343
3344fallback_tiled:
3345	return sna_tiling_copy_boxes(sna, alu,
3346				     src, src_bo, src_dx, src_dy,
3347				     dst, dst_bo, dst_dx, dst_dy,
3348				     box, n);
3349}
3350
3351static void
3352gen2_render_copy_blt(struct sna *sna,
3353		     const struct sna_copy_op *op,
3354		     int16_t sx, int16_t sy,
3355		     int16_t w, int16_t h,
3356		     int16_t dx, int16_t dy)
3357{
3358	if (!gen2_get_rectangles(sna, &op->base, 1)) {
3359		gen2_emit_copy_state(sna, &op->base);
3360		gen2_get_rectangles(sna, &op->base, 1);
3361	}
3362
3363	VERTEX(dx+w);
3364	VERTEX(dy+h);
3365	VERTEX((sx+w)*op->base.src.scale[0]);
3366	VERTEX((sy+h)*op->base.src.scale[1]);
3367
3368	VERTEX(dx);
3369	VERTEX(dy+h);
3370	VERTEX(sx*op->base.src.scale[0]);
3371	VERTEX((sy+h)*op->base.src.scale[1]);
3372
3373	VERTEX(dx);
3374	VERTEX(dy);
3375	VERTEX(sx*op->base.src.scale[0]);
3376	VERTEX(sy*op->base.src.scale[1]);
3377}
3378
3379static void
3380gen2_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
3381{
3382	gen2_vertex_flush(sna, &op->base);
3383}
3384
3385static bool
3386gen2_render_copy(struct sna *sna, uint8_t alu,
3387		 PixmapPtr src, struct kgem_bo *src_bo,
3388		 PixmapPtr dst, struct kgem_bo *dst_bo,
3389		 struct sna_copy_op *tmp)
3390{
3391#if NO_COPY
3392	if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
3393		return false;
3394
3395	return sna_blt_copy(sna, alu,
3396			    src_bo, dst_bo,
3397			    dst->drawable.bitsPerPixel,
3398			    tmp);
3399#endif
3400
3401	/* Prefer to use the BLT */
3402	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
3403	    sna_blt_copy(sna, alu,
3404			 src_bo, dst_bo,
3405			 dst->drawable.bitsPerPixel,
3406			 tmp))
3407		return true;
3408
3409	/* Must use the BLT if we can't RENDER... */
3410	if (too_large(src->drawable.width, src->drawable.height) ||
3411	    too_large(dst->drawable.width, dst->drawable.height) ||
3412	    src_bo->pitch > MAX_3D_PITCH ||
3413	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
3414fallback:
3415		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
3416			return false;
3417
3418		return sna_blt_copy(sna, alu, src_bo, dst_bo,
3419				    dst->drawable.bitsPerPixel,
3420				    tmp);
3421	}
3422
3423	tmp->base.op = alu;
3424
3425	tmp->base.dst.pixmap = dst;
3426	tmp->base.dst.width = dst->drawable.width;
3427	tmp->base.dst.height = dst->drawable.height;
3428	tmp->base.dst.format = sna_format_for_depth(dst->drawable.depth);
3429	tmp->base.dst.bo = dst_bo;
3430
3431	gen2_render_copy_setup_source(&tmp->base.src, &src->drawable, src_bo);
3432	tmp->base.mask.bo = NULL;
3433
3434	tmp->base.floats_per_vertex = 4;
3435	tmp->base.floats_per_rect = 12;
3436
3437	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
3438		kgem_submit(&sna->kgem);
3439		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
3440			goto fallback;
3441	}
3442
3443	tmp->blt  = gen2_render_copy_blt;
3444	tmp->done = gen2_render_copy_done;
3445
3446	gen2_emit_composite_state(sna, &tmp->base);
3447	return true;
3448}
3449
3450static void
3451gen2_render_reset(struct sna *sna)
3452{
3453	sna->render_state.gen2.need_invariant = true;
3454	sna->render_state.gen2.logic_op_enabled = 0;
3455	sna->render_state.gen2.target = 0;
3456
3457	sna->render_state.gen2.ls1 = 0;
3458	sna->render_state.gen2.ls2 = 0;
3459	sna->render_state.gen2.vft = 0;
3460
3461	sna->render_state.gen2.diffuse = 0x0c0ffee0;
3462	sna->render_state.gen2.specular = 0x0c0ffee0;
3463}
3464
3465static void
3466gen2_render_flush(struct sna *sna)
3467{
3468	assert(sna->render.vertex_index == 0);
3469	assert(sna->render.vertex_offset == 0);
3470}
3471
3472static void
3473gen2_render_context_switch(struct kgem *kgem,
3474			   int new_mode)
3475{
3476	struct sna *sna = container_of(kgem, struct sna, kgem);
3477
3478	if (!kgem->nbatch)
3479		return;
3480
3481	/* Reload BLT registers following a lost context */
3482	sna->blt_state.fill_bo = 0;
3483
3484	if (kgem_ring_is_idle(kgem, kgem->ring)) {
3485		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
3486		_kgem_submit(kgem);
3487	}
3488}
3489
3490const char *gen2_render_init(struct sna *sna, const char *backend)
3491{
3492	struct sna_render *render = &sna->render;
3493
3494	sna->kgem.context_switch = gen2_render_context_switch;
3495
3496	/* Use the BLT (and overlay) for everything except when forced to
3497	 * use the texture combiners.
3498	 */
3499#if !NO_COMPOSITE
3500	render->composite = gen2_render_composite;
3501	render->prefer_gpu |= PREFER_GPU_RENDER;
3502#endif
3503#if !NO_COMPOSITE_SPANS
3504	render->check_composite_spans = gen2_check_composite_spans;
3505	render->composite_spans = gen2_render_composite_spans;
3506	render->prefer_gpu |= PREFER_GPU_SPANS;
3507#endif
3508	render->fill_boxes = gen2_render_fill_boxes;
3509	render->fill = gen2_render_fill;
3510	render->fill_one = gen2_render_fill_one;
3511	render->copy = gen2_render_copy;
3512	render->copy_boxes = gen2_render_copy_boxes;
3513
3514	/* XXX YUV color space conversion for video? */
3515
3516	render->reset = gen2_render_reset;
3517	render->flush = gen2_render_flush;
3518
3519	render->max_3d_size = MAX_3D_SIZE;
3520	render->max_3d_pitch = MAX_3D_PITCH;
3521	return "Almador (gen2)";
3522}
3523